From 33d00e9c1d586f100ebcddfbba8b2664876d0c59 Mon Sep 17 00:00:00 2001
From: AlongWY
Date: Sat, 16 Sep 2023 05:20:10 +0000
Subject: [PATCH] deploy: 72066be21ad467c8ffc76b74c152b38decf3f0ac

---
 .nojekyll   | 0
 cache.json  | 1 +
 favicon.ico | Bin 0 -> 15086 bytes
 index.css   | 355 +
 index.html  | 65030 ++++++++++++++++++++++++++++++++++++++++++++++++++
 index.js    | 39 +
 6 files changed, 65425 insertions(+)
 create mode 100644 .nojekyll
 create mode 100644 cache.json
 create mode 100644 favicon.ico
 create mode 100644 index.css
 create mode 100644 index.html
 create mode 100644 index.js

diff --git a/.nojekyll b/.nojekyll
new file mode 100644
index 00000000..e69de29b
diff --git a/cache.json b/cache.json
new file mode 100644
index 00000000..176cf34e
--- /dev/null
+++ b/cache.json
@@ -0,0 +1 @@
+{"2023-09-08T00:00:00Z":{"Computation and Language":[{"id":"http://arxiv.org/abs/2305.02363v2","updated":"2023-09-08T17:51:51Z","published":"2023-05-03T18:01:13Z","title":"Entity Tracking in Language Models","summary":" Keeping track of how states of entities change as a text or dialog unfolds is\na key prerequisite to discourse understanding. Yet, there have been few\nsystematic investigations into the ability of large language models (LLMs) to\ntrack discourse entities. In this work, we present a task probing to what\nextent a language model can infer the final state of an entity given an English\ndescription of the initial state and a series of state-changing operations. We\nuse this task to first investigate whether Flan-T5, GPT-3 and GPT-3.5 can track\nthe state of entities, and find that only GPT-3.5 models, which have been\npretrained on large amounts of code, exhibit this ability. We then investigate\nwhether smaller models pretrained primarily on text can learn to track\nentities, through finetuning T5 on several training/evaluation splits. While\nperformance degrades for more complex splits, we find that even when evaluated\non a different set of entities from training or longer operation sequences, a\nfinetuned model can perform non-trivial entity tracking. Taken together, these\nresults suggest that language models can learn to track entities but\npretraining on text corpora alone does not make this capacity surface.\n","authors":["Najoung Kim","Sebastian Schuster"],"pdf_url":"https://arxiv.org/pdf/2305.02363v2.pdf","comment":"ACL 2023 Camera-ready"},{"id":"http://arxiv.org/abs/2309.04461v1","updated":"2023-09-08T17:49:44Z","published":"2023-09-08T17:49:44Z","title":"Measuring and Improving Chain-of-Thought Reasoning in Vision-Language\n Models","summary":" Vision-language models (VLMs) have recently demonstrated strong efficacy as\nvisual assistants that can parse natural queries about the visual content and\ngenerate human-like outputs. In this work, we explore the ability of these\nmodels to demonstrate human-like reasoning based on the perceived information.\nTo address a crucial concern regarding the extent to which their reasoning\ncapabilities are fully consistent and grounded, we also measure the reasoning\nconsistency of these models. We achieve this by proposing a chain-of-thought\n(CoT) based consistency measure. However, such an evaluation requires a\nbenchmark that encompasses both high-level inference and detailed reasoning\nchains, which is costly. We tackle this challenge by proposing a\nLLM-Human-in-the-Loop pipeline, which notably reduces cost while simultaneously\nensuring the generation of a high-quality dataset.
Based on this pipeline and\nthe existing coarse-grained annotated dataset, we build the CURE benchmark to\nmeasure both the zero-shot reasoning performance and consistency of VLMs. We\nevaluate existing state-of-the-art VLMs, and find that even the best-performing\nmodel is unable to demonstrate strong visual reasoning capabilities and\nconsistency, indicating that substantial efforts are required to enable VLMs to\nperform visual reasoning as systematically and consistently as humans. As an\nearly step, we propose a two-stage training framework aimed at improving both\nthe reasoning performance and consistency of VLMs. The first stage involves\nemploying supervised fine-tuning of VLMs using step-by-step reasoning samples\nautomatically generated by LLMs. In the second stage, we further augment the\ntraining process by incorporating feedback provided by LLMs to produce\nreasoning chains that are highly consistent and grounded. We empirically\nhighlight the effectiveness of our framework in both reasoning performance and\nconsistency.\n","authors":["Yangyi Chen","Karan Sikka","Michael Cogswell","Heng Ji","Ajay Divakaran"],"pdf_url":"https://arxiv.org/pdf/2309.04461v1.pdf","comment":"The data is released at\n \\url{https://github.com/Yangyi-Chen/CoTConsistency}"},{"id":"http://arxiv.org/abs/2306.02074v2","updated":"2023-09-08T17:15:37Z","published":"2023-06-03T10:35:04Z","title":"A Conditional Generative Chatbot using Transformer Model","summary":" A Chatbot serves as a communication tool between a human user and a machine\nto achieve an appropriate answer based on the human input. In more recent\napproaches, a combination of Natural Language Processing and sequential models\nare used to build a generative Chatbot. The main challenge of these models is\ntheir sequential nature, which leads to less accurate results. To tackle this\nchallenge, in this paper, a novel architecture is proposed using conditional\nWasserstein Generative Adversarial Networks and a transformer model for answer\ngeneration in Chatbots. While the generator of the proposed model consists of a\nfull transformer model to generate an answer, the discriminator includes only\nthe encoder part of a transformer model followed by a classifier. To the best\nof our knowledge, this is the first time that a generative Chatbot is proposed\nusing the embedded transformer in both generator and discriminator models.\nRelying on the parallel computing of the transformer model, the results of the\nproposed model on the Cornell Movie-Dialog corpus and the Chit-Chat datasets\nconfirm the superiority of the proposed model compared to state-of-the-art\nalternatives using different evaluation metrics.\n","authors":["Nura Esfandiari","Kourosh Kiani","Razieh Rastgoo"],"pdf_url":"https://arxiv.org/pdf/2306.02074v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.00359v2","updated":"2023-09-08T16:18:53Z","published":"2023-09-01T09:34:49Z","title":"Large Content And Behavior Models To Understand, Simulate, And Optimize\n Content And Behavior","summary":" Shannon, in his seminal paper introducing information theory, divided the\ncommunication into three levels: technical, semantic, and effectivenss. While\nthe technical level is concerned with accurate reconstruction of transmitted\nsymbols, the semantic and effectiveness levels deal with the inferred meaning\nand its effect on the receiver. Thanks to telecommunications, the first level\nproblem has produced great advances like the internet. 
Large Language Models\n(LLMs) make some progress towards the second goal, but the third level still\nremains largely untouched. The third problem deals with predicting and\noptimizing communication for desired receiver behavior. LLMs, while showing\nwide generalization capabilities across a wide range of tasks, are unable to\nsolve for this. One reason for the underperformance could be a lack of\n\"behavior tokens\" in LLMs' training corpora. Behavior tokens define receiver\nbehavior over a communication, such as shares, likes, clicks, purchases,\nretweets, etc. While preprocessing data for LLM training, behavior tokens are\noften removed from the corpora as noise. Therefore, in this paper, we make some\ninitial progress towards reintroducing behavior tokens in LLM training. The\ntrained models, other than showing similar performance to LLMs on content\nunderstanding tasks, show generalization capabilities on behavior simulation,\ncontent simulation, behavior understanding, and behavior domain adaptation.\nUsing a wide range of tasks on two corpora, we show results on all these\ncapabilities. We call these models Large Content and Behavior Models (LCBMs).\nFurther, to spur more research on LCBMs, we release our new Content Behavior\nCorpus (CBC), a repository containing communicator, message, and corresponding\nreceiver behavior.\n","authors":["Ashmit Khandelwal","Aditya Agrawal","Aanisha Bhattacharyya","Yaman K Singla","Somesh Singh","Uttaran Bhattacharya","Ishita Dasgupta","Stefano Petrangeli","Rajiv Ratn Shah","Changyou Chen","Balaji Krishnamurthy"],"pdf_url":"https://arxiv.org/pdf/2309.00359v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.03882v2","updated":"2023-09-08T15:54:56Z","published":"2023-09-07T17:44:56Z","title":"On Large Language Models' Selection Bias in Multi-Choice Questions","summary":" Multi-choice questions (MCQs) serve as a common yet important task format in\nthe research of large language models (LLMs). Our work shows that LLMs exhibit\nan inherent \"selection bias\" in MCQs, which refers to LLMs' preferences to\nselect options located at specific positions (like \"Option C\"). This bias is\nprevalent across various LLMs, making their performance vulnerable to option\nposition changes in MCQs. We identify that one primary cause resulting in\nselection bias is option numbering, i.e., the ID symbols A/B/C/D associated\nwith the options. To mitigate selection bias, we propose a new method called\nPriDe. PriDe first decomposes the observed model prediction distribution into\nan intrinsic prediction over option contents and a prior distribution over\noption IDs. It then estimates the prior by permutating option contents on a\nsmall number of test samples, which is used to debias the subsequent test\nsamples. We demonstrate that, as a label-free, inference-time method, PriDe\nachieves a more effective and computation-efficient debiasing than strong\nbaselines. We further show that the priors estimated by PriDe generalize well\nacross different domains, highlighting its practical potential in broader\nscenarios.\n","authors":["Chujie Zheng","Hao Zhou","Fandong Meng","Jie Zhou","Minlie Huang"],"pdf_url":"https://arxiv.org/pdf/2309.03882v2.pdf","comment":"Work in progress. 
21 pages, 13 figures"},{"id":"http://arxiv.org/abs/2309.04389v1","updated":"2023-09-08T15:40:54Z","published":"2023-09-08T15:40:54Z","title":"CSPRD: A Financial Policy Retrieval Dataset for Chinese Stock Market","summary":" In recent years, great advances in pre-trained language models (PLMs) have\nsparked considerable research focus and achieved promising performance on the\napproach of dense passage retrieval, which aims at retrieving relative passages\nfrom massive corpus with given questions. However, most of existing datasets\nmainly benchmark the models with factoid queries of general commonsense, while\nspecialised fields such as finance and economics remain unexplored due to the\ndeficiency of large-scale and high-quality datasets with expert annotations. In\nthis work, we propose a new task, policy retrieval, by introducing the Chinese\nStock Policy Retrieval Dataset (CSPRD), which provides 700+ prospectus passages\nlabeled by experienced experts with relevant articles from 10k+ entries in our\ncollected Chinese policy corpus. Experiments on lexical, embedding and\nfine-tuned bi-encoder models show the effectiveness of our proposed CSPRD yet\nalso suggests ample potential for improvement. Our best performing baseline\nachieves 56.1% MRR@10, 28.5% NDCG@10, 37.5% Recall@10 and 80.6% Precision@10 on\ndev set.\n","authors":["Jinyuan Wang","Hai Zhao","Zhong Wang","Zeyang Zhu","Jinhao Xie","Yong Yu","Yongjian Fei","Yue Huang","Dawei Cheng"],"pdf_url":"https://arxiv.org/pdf/2309.04389v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.04372v1","updated":"2023-09-08T15:06:05Z","published":"2023-09-08T15:06:05Z","title":"MoEController: Instruction-based Arbitrary Image Manipulation with\n Mixture-of-Expert Controllers","summary":" Diffusion-model-based text-guided image generation has recently made\nastounding progress, producing fascinating results in open-domain image\nmanipulation tasks. Few models, however, currently have complete zero-shot\ncapabilities for both global and local image editing due to the complexity and\ndiversity of image manipulation tasks. In this work, we propose a method with a\nmixture-of-expert (MOE) controllers to align the text-guided capacity of\ndiffusion models with different kinds of human instructions, enabling our model\nto handle various open-domain image manipulation tasks with natural language\ninstructions. First, we use large language models (ChatGPT) and conditional\nimage synthesis models (ControlNet) to generate a large number of global image\ntransfer dataset in addition to the instruction-based local image editing\ndataset. Then, using an MOE technique and task-specific adaptation training on\na large-scale dataset, our conditional diffusion model can edit images globally\nand locally. Extensive experiments demonstrate that our approach performs\nsurprisingly well on various image manipulation tasks when dealing with\nopen-domain images and arbitrary human instructions. Please refer to our\nproject page: [https://oppo-mente-lab.github.io/moe_controller/]\n","authors":["Sijia Li","Chen Chen","Haonan Lu"],"pdf_url":"https://arxiv.org/pdf/2309.04372v1.pdf","comment":"5 pages,6 figures"},{"id":"http://arxiv.org/abs/2309.04369v1","updated":"2023-09-08T15:00:41Z","published":"2023-09-08T15:00:41Z","title":"Beyond Static Datasets: A Deep Interaction Approach to LLM Evaluation","summary":" Large Language Models (LLMs) have made progress in various real-world tasks,\nwhich stimulates requirements for the evaluation of LLMs. 
Existing LLM\nevaluation methods are mainly supervised signal-based which depends on static\ndatasets and cannot evaluate the ability of LLMs in dynamic real-world\nscenarios where deep interaction widely exists. Other LLM evaluation methods\nare human-based which are costly and time-consuming and are incapable of\nlarge-scale evaluation of LLMs. To address the issues above, we propose a novel\nDeep Interaction-based LLM-evaluation framework. In our proposed framework,\nLLMs' performances in real-world domains can be evaluated from their deep\ninteraction with other LLMs in elaborately designed evaluation tasks.\nFurthermore, our proposed framework is a general evaluation method that can be\napplied to a host of real-world tasks such as machine translation and code\ngeneration. We demonstrate the effectiveness of our proposed method through\nextensive experiments on four elaborately designed evaluation tasks.\n","authors":["Jiatong Li","Rui Li","Qi Liu"],"pdf_url":"https://arxiv.org/pdf/2309.04369v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.04333v1","updated":"2023-09-08T14:00:29Z","published":"2023-09-08T14:00:29Z","title":"Encoding Multi-Domain Scientific Papers by Ensembling Multiple CLS\n Tokens","summary":" Many useful tasks on scientific documents, such as topic classification and\ncitation prediction, involve corpora that span multiple scientific domains.\nTypically, such tasks are accomplished by representing the text with a vector\nembedding obtained from a Transformer's single CLS token. In this paper, we\nargue that using multiple CLS tokens could make a Transformer better specialize\nto multiple scientific domains. We present Multi2SPE: it encourages each of\nmultiple CLS tokens to learn diverse ways of aggregating token embeddings, then\nsums them up together to create a single vector representation. We also propose\nour new multi-domain benchmark, Multi-SciDocs, to test scientific paper vector\nencoders under multi-domain settings. We show that Multi2SPE reduces error by\nup to 25 percent in multi-domain citation prediction, while requiring only a\nnegligible amount of computation in addition to one BERT forward pass.\n","authors":["Ronald Seoh","Haw-Shiuan Chang","Andrew McCallum"],"pdf_url":"https://arxiv.org/pdf/2309.04333v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.02863v3","updated":"2023-09-08T13:58:45Z","published":"2023-07-06T09:03:10Z","title":"ValiTex -- a unified validation framework for computational text-based\n measures of social science constructs","summary":" Guidance on how to validate computational text-based measures of social\nscience constructs is fragmented. Although scholars generally acknowledge the\nimportance of validating their text-based measures, they often lack common\nterminology and a unified framework to do so. This paper introduces ValiTex, a\nnew validation framework designed to assist scholars in validly measuring\nsocial science constructs based on textual data. The framework draws on a\nlong-established validity concept in psychometrics but extends these concepts\nto cover the specific needs of computational text analysis. ValiTex consists of\ntwo components, a conceptual framework and a dynamic checklist. 
Whereas the\nconceptual framework provides a general structure along distinct phases on how\nto approach validation, the dynamic checklist defines specific validation steps\nand provides guidance on which steps might be considered recommendable (i.e.,\nproviding relevant and necessary validation evidence) or optional (i.e., useful\nfor providing additional supporting validation evidence). We demonstrate the\nutility of the framework by applying it to a use case of detecting sexism from\nsocial media data\n","authors":["Lukas Birkenmaier","Claudia Wagner","Clemens Lechner"],"pdf_url":"https://arxiv.org/pdf/2307.02863v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.04292v1","updated":"2023-09-08T12:26:01Z","published":"2023-09-08T12:26:01Z","title":"Fuzzy Fingerprinting Transformer Language-Models for Emotion Recognition\n in Conversations","summary":" Fuzzy Fingerprints have been successfully used as an interpretable text\nclassification technique, but, like most other techniques, have been largely\nsurpassed in performance by Large Pre-trained Language Models, such as BERT or\nRoBERTa. These models deliver state-of-the-art results in several Natural\nLanguage Processing tasks, namely Emotion Recognition in Conversations (ERC),\nbut suffer from the lack of interpretability and explainability. In this paper,\nwe propose to combine the two approaches to perform ERC, as a means to obtain\nsimpler and more interpretable Large Language Models-based classifiers. We\npropose to feed the utterances and their previous conversational turns to a\npre-trained RoBERTa, obtaining contextual embedding utterance representations,\nthat are then supplied to an adapted Fuzzy Fingerprint classification module.\nWe validate our approach on the widely used DailyDialog ERC benchmark dataset,\nin which we obtain state-of-the-art level results using a much lighter model.\n","authors":["Patrícia Pereira","Rui Ribeiro","Helena Moniz","Luisa Coheur","Joao Paulo Carvalho"],"pdf_url":"https://arxiv.org/pdf/2309.04292v1.pdf","comment":"FUZZ-IEEE 2023"},{"id":"http://arxiv.org/abs/2309.04269v1","updated":"2023-09-08T11:31:08Z","published":"2023-09-08T11:31:08Z","title":"From Sparse to Dense: GPT-4 Summarization with Chain of Density\n Prompting","summary":" Selecting the ``right'' amount of information to include in a summary is a\ndifficult task. A good summary should be detailed and entity-centric without\nbeing overly dense and hard to follow. To better understand this tradeoff, we\nsolicit increasingly dense GPT-4 summaries with what we refer to as a ``Chain\nof Density'' (CoD) prompt. Specifically, GPT-4 generates an initial\nentity-sparse summary before iteratively incorporating missing salient entities\nwithout increasing the length. Summaries generated by CoD are more abstractive,\nexhibit more fusion, and have less of a lead bias than GPT-4 summaries\ngenerated by a vanilla prompt. We conduct a human preference study on 100 CNN\nDailyMail articles and find that that humans prefer GPT-4 summaries that are\nmore dense than those generated by a vanilla prompt and almost as dense as\nhuman written summaries. Qualitative analysis supports the notion that there\nexists a tradeoff between informativeness and readability. 
500 annotated CoD\nsummaries, as well as an extra 5,000 unannotated summaries, are freely\navailable on HuggingFace\n(https://huggingface.co/datasets/griffin/chain_of_density).\n","authors":["Griffin Adams","Alexander Fabbri","Faisal Ladhak","Eric Lehman","Noémie Elhadad"],"pdf_url":"https://arxiv.org/pdf/2309.04269v1.pdf","comment":"preprint"},{"id":"http://arxiv.org/abs/2308.16797v2","updated":"2023-09-08T11:24:06Z","published":"2023-08-31T15:19:28Z","title":"Simple LLM Prompting is State-of-the-Art for Robust and Multilingual\n Dialogue Evaluation","summary":" Despite significant research effort in the development of automatic dialogue\nevaluation metrics, little thought is given to evaluating dialogues other than\nin English. At the same time, ensuring metrics are invariant to semantically\nsimilar responses is also an overlooked topic. In order to achieve the desired\nproperties of robustness and multilinguality for dialogue evaluation metrics,\nwe propose a novel framework that takes advantage of the strengths of current\nevaluation models with the newly-established paradigm of prompting Large\nLanguage Models (LLMs). Empirical results show our framework achieves state of\nthe art results in terms of mean Spearman correlation scores across several\nbenchmarks and ranks first place on both the Robust and Multilingual tasks of\nthe DSTC11 Track 4 \"Automatic Evaluation Metrics for Open-Domain Dialogue\nSystems\", proving the evaluation capabilities of prompted LLMs.\n","authors":["John Mendonça","Patrícia Pereira","Helena Moniz","João Paulo Carvalho","Alon Lavie","Isabel Trancoso"],"pdf_url":"https://arxiv.org/pdf/2308.16797v2.pdf","comment":"DSTC11 best paper for Track 4"},{"id":"http://arxiv.org/abs/2308.15363v2","updated":"2023-09-08T10:13:16Z","published":"2023-08-29T14:59:54Z","title":"Text-to-SQL Empowered by Large Language Models: A Benchmark Evaluation","summary":" Large language models (LLMs) have emerged as a new paradigm for Text-to-SQL\ntask. However, the absence of a systematical benchmark inhibits the development\nof designing effective, efficient and economic LLM-based Text-to-SQL solutions.\nTo address this challenge, in this paper, we first conduct a systematical and\nextensive comparison over existing prompt engineering methods, including\nquestion representation, example selection and example organization, and with\nthese experimental results, we elaborate their pros and cons. Based on these\nfindings, we propose a new integrated solution, named DAIL-SQL, which refreshes\nthe Spider leaderboard with 86.6% execution accuracy and sets a new bar. To\nexplore the potential of open-source LLM, we investigate them in various\nscenarios, and further enhance their performance with supervised fine-tuning.\nOur explorations highlight open-source LLMs' potential in Text-to-SQL, as well\nas the advantages and disadvantages of the supervised fine-tuning.\nAdditionally, towards an efficient and economic LLM-based Text-to-SQL solution,\nwe emphasize the token efficiency in prompt engineering and compare the prior\nstudies under this metric. 
We hope that our work provides a deeper\nunderstanding of Text-to-SQL with LLMs, and inspires further investigations and\nbroad applications.\n","authors":["Dawei Gao","Haibin Wang","Yaliang Li","Xiuyu Sun","Yichen Qian","Bolin Ding","Jingren Zhou"],"pdf_url":"https://arxiv.org/pdf/2308.15363v2.pdf","comment":"We have released code on https://github.com/BeachWang/DAIL-SQL"},{"id":"http://arxiv.org/abs/2301.05880v3","updated":"2023-09-08T10:03:16Z","published":"2023-01-14T10:18:22Z","title":"TikTalk: A Video-Based Dialogue Dataset for Multi-Modal Chitchat in Real\n World","summary":" To facilitate the research on intelligent and human-like chatbots with\nmulti-modal context, we introduce a new video-based multi-modal dialogue\ndataset, called TikTalk. We collect 38K videos from a popular video-sharing\nplatform, along with 367K conversations posted by users beneath them. Users\nengage in spontaneous conversations based on their multi-modal experiences from\nwatching videos, which helps recreate real-world chitchat context. Compared to\nprevious multi-modal dialogue datasets, the richer context types in TikTalk\nlead to more diverse conversations, but also increase the difficulty in\ncapturing human interests from intricate multi-modal information to generate\npersonalized responses. Moreover, external knowledge is more frequently evoked\nin our dataset. These facts reveal new challenges for multi-modal dialogue\nmodels. We quantitatively demonstrate the characteristics of TikTalk, propose a\nvideo-based multi-modal chitchat task, and evaluate several dialogue baselines.\nExperimental results indicate that the models incorporating large language\nmodels (LLM) can generate more diverse responses, while the model utilizing\nknowledge graphs to introduce external knowledge performs the best overall.\nFurthermore, no existing model can solve all the above challenges well. There\nis still a large room for future improvements, even for LLM with visual\nextensions. Our dataset is available at\n\\url{https://ruc-aimind.github.io/projects/TikTalk/}.\n","authors":["Hongpeng Lin","Ludan Ruan","Wenke Xia","Peiyu Liu","Jingyuan Wen","Yixin Xu","Di Hu","Ruihua Song","Wayne Xin Zhao","Qin Jin","Zhiwu Lu"],"pdf_url":"https://arxiv.org/pdf/2301.05880v3.pdf","comment":"Accepted to ACM Multimedia 2023"},{"id":"http://arxiv.org/abs/2204.08975v2","updated":"2023-09-08T09:11:02Z","published":"2022-04-19T16:23:07Z","title":"Detecting Text Formality: A Study of Text Classification Approaches","summary":" Formality is one of the important characteristics of text documents. The\nautomatic detection of the formality level of a text is potentially beneficial\nfor various natural language processing tasks. Before, two large-scale datasets\nwere introduced for multiple languages featuring formality annotation -- GYAFC\nand X-FORMAL. However, they were primarily used for the training of style\ntransfer models. At the same time, the detection of text formality on its own\nmay also be a useful application. This work proposes the first to our knowledge\nsystematic study of formality detection methods based on statistical,\nneural-based, and Transformer-based machine learning methods and delivers the\nbest-performing models for public usage. We conducted three types of\nexperiments -- monolingual, multilingual, and cross-lingual. 
The study shows\nthe overcome of Char BiLSTM model over Transformer-based ones for the\nmonolingual and multilingual formality classification task, while\nTransformer-based classifiers are more stable to cross-lingual knowledge\ntransfer.\n","authors":["Daryna Dementieva","Nikolay Babakov","Alexander Panchenko"],"pdf_url":"https://arxiv.org/pdf/2204.08975v2.pdf","comment":"Published at RANLP2023"},{"id":"http://arxiv.org/abs/2309.04213v1","updated":"2023-09-08T08:54:55Z","published":"2023-09-08T08:54:55Z","title":"UQ at #SMM4H 2023: ALEX for Public Health Analysis with Social Media","summary":" As social media becomes increasingly popular, more and more activities\nrelated to public health emerge. Current techniques for public health analysis\ninvolve popular models such as BERT and large language models (LLMs). However,\nthe costs of training in-domain LLMs for public health are especially\nexpensive. Furthermore, such kinds of in-domain datasets from social media are\ngenerally imbalanced. To tackle these challenges, the data imbalance issue can\nbe overcome by data augmentation and balanced training. Moreover, the ability\nof the LLMs can be effectively utilized by prompting the model properly. In\nthis paper, a novel ALEX framework is proposed to improve the performance of\npublic health analysis on social media by adopting an LLMs explanation\nmechanism. Results show that our ALEX model got the best performance among all\nsubmissions in both Task 2 and Task 4 with a high score in Task 1 in Social\nMedia Mining for Health 2023 (SMM4H)[1]. Our code has been released at https://\ngithub.com/YanJiangJerry/ALEX.\n","authors":["Yan Jiang","Ruihong Qiu","Yi Zhang","Zi Huang"],"pdf_url":"https://arxiv.org/pdf/2309.04213v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.04198v1","updated":"2023-09-08T08:20:46Z","published":"2023-09-08T08:20:46Z","title":"The CALLA Dataset: Probing LLMs' Interactive Knowledge Acquisition from\n Chinese Medical Literature","summary":" The application of Large Language Models (LLMs) to the medical domain has\nstimulated the interest of researchers. Recent studies have focused on\nconstructing Instruction Fine-Tuning (IFT) data through medical knowledge\ngraphs to enrich the interactive medical knowledge of LLMs. However, the\nmedical literature serving as a rich source of medical knowledge remains\nunexplored. Our work introduces the CALLA dataset to probe LLMs' interactive\nknowledge acquisition from Chinese medical literature. It assesses the\nproficiency of LLMs in mastering medical knowledge through a free-dialogue\nfact-checking task. We identify a phenomenon called the ``fact-following\nresponse``, where LLMs tend to affirm facts mentioned in questions and display\na reluctance to challenge them. To eliminate the inaccurate evaluation caused\nby this phenomenon, for the golden fact, we artificially construct test data\nfrom two perspectives: one consistent with the fact and one inconsistent with\nthe fact. Drawing from the probing experiment on the CALLA dataset, we conclude\nthat IFT data highly correlated with the medical literature corpus serves as a\npotent catalyst for LLMs, enabling themselves to skillfully employ the medical\nknowledge acquired during the pre-training phase within interactive scenarios,\nenhancing accuracy. 
Furthermore, we design a framework for automatically\nconstructing IFT data based on medical literature and discuss some real-world\napplications.\n","authors":["Yanrui Du","Sendong Zhao","Yuhan Chen","Rai Bai","Jing Liu","Hua Wu","Haifeng Wang","Bing Qin"],"pdf_url":"https://arxiv.org/pdf/2309.04198v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.04175v1","updated":"2023-09-08T07:42:57Z","published":"2023-09-08T07:42:57Z","title":"Knowledge-tuning Large Language Models with Structured Medical Knowledge\n Bases for Reliable Response Generation in Chinese","summary":" Large Language Models (LLMs) have demonstrated remarkable success in diverse\nnatural language processing (NLP) tasks in general domains. However, LLMs\nsometimes generate responses with the hallucination about medical facts due to\nlimited domain knowledge. Such shortcomings pose potential risks in the\nutilization of LLMs within medical contexts. To address this challenge, we\npropose knowledge-tuning, which leverages structured medical knowledge bases\nfor the LLMs to grasp domain knowledge efficiently and facilitate reliable\nresponse generation. We also release cMedKnowQA, a Chinese medical knowledge\nquestion-answering dataset constructed from medical knowledge bases to assess\nthe medical knowledge proficiency of LLMs. Experimental results show that the\nLLMs which are knowledge-tuned with cMedKnowQA, can exhibit higher levels of\naccuracy in response generation compared with vanilla instruction-tuning and\noffer a new reliable way for the domain adaptation of LLMs.\n","authors":["Haochun Wang","Sendong Zhao","Zewen Qiang","Zijian Li","Nuwa Xi","Yanrui Du","MuZhen Cai","Haoqiang Guo","Yuhan Chen","Haoming Xu","Bing Qin","Ting Liu"],"pdf_url":"https://arxiv.org/pdf/2309.04175v1.pdf","comment":"11 pages, 5 figures"},{"id":"http://arxiv.org/abs/2309.04174v1","updated":"2023-09-08T07:42:29Z","published":"2023-09-08T07:42:29Z","title":"Manifold-based Verbalizer Space Re-embedding for Tuning-free\n Prompt-based Classification","summary":" Prompt-based classification adapts tasks to a cloze question format utilizing\nthe [MASK] token and the filled tokens are then mapped to labels through\npre-defined verbalizers. Recent studies have explored the use of verbalizer\nembeddings to reduce labor in this process. However, all existing studies\nrequire a tuning process for either the pre-trained models or additional\ntrainable embeddings. Meanwhile, the distance between high-dimensional\nverbalizer embeddings should not be measured by Euclidean distance due to the\npotential for non-linear manifolds in the representation space. In this study,\nwe propose a tuning-free manifold-based space re-embedding method called\nLocally Linear Embedding with Intra-class Neighborhood Constraint (LLE-INC) for\nverbalizer embeddings, which preserves local properties within the same class\nas guidance for classification. Experimental results indicate that even without\ntuning any parameters, our LLE-INC is on par with automated verbalizers with\nparameter tuning. And with the parameter updating, our approach further\nenhances prompt-based tuning by up to 3.2%. 
Furthermore, experiments with the\nLLaMA-7B&13B indicate that LLE-INC is an efficient tuning-free classification\napproach for the hyper-scale language models.\n","authors":["Haochun Wang","Sendong Zhao","Chi Liu","Nuwa Xi","Muzhen Cai","Bing Qin","Ting Liu"],"pdf_url":"https://arxiv.org/pdf/2309.04174v1.pdf","comment":"11 pages, 3 figures"},{"id":"http://arxiv.org/abs/2309.04162v1","updated":"2023-09-08T07:10:57Z","published":"2023-09-08T07:10:57Z","title":"GLS-CSC: A Simple but Effective Strategy to Mitigate Chinese STM Models'\n Over-Reliance on Superficial Clue","summary":" Pre-trained models have achieved success in Chinese Short Text Matching (STM)\ntasks, but they often rely on superficial clues, leading to a lack of robust\npredictions. To address this issue, it is crucial to analyze and mitigate the\ninfluence of superficial clues on STM models. Our study aims to investigate\ntheir over-reliance on the edit distance feature, commonly used to measure the\nsemantic similarity of Chinese text pairs, which can be considered a\nsuperficial clue. To mitigate STM models' over-reliance on superficial clues,\nwe propose a novel resampling training strategy called Gradually Learn Samples\nContaining Superficial Clue (GLS-CSC). Through comprehensive evaluations of\nIn-Domain (I.D.), Robustness (Rob.), and Out-Of-Domain (O.O.D.) test sets, we\ndemonstrate that GLS-CSC outperforms existing methods in terms of enhancing the\nrobustness and generalization of Chinese STM models. Moreover, we conduct a\ndetailed analysis of existing methods and reveal their commonality.\n","authors":["Yanrui Du","Sendong Zhao","Yuhan Chen","Rai Bai","Jing Liu","Hua Wu","Haifeng Wang","Bing Qin"],"pdf_url":"https://arxiv.org/pdf/2309.04162v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.04156v1","updated":"2023-09-08T06:48:41Z","published":"2023-09-08T06:48:41Z","title":"Cross-Utterance Conditioned VAE for Speech Generation","summary":" Speech synthesis systems powered by neural networks hold promise for\nmultimedia production, but frequently face issues with producing expressive\nspeech and seamless editing. In response, we present the Cross-Utterance\nConditioned Variational Autoencoder speech synthesis (CUC-VAE S2) framework to\nenhance prosody and ensure natural speech generation. This framework leverages\nthe powerful representational capabilities of pre-trained language models and\nthe re-expression abilities of variational autoencoders (VAEs). The core\ncomponent of the CUC-VAE S2 framework is the cross-utterance CVAE, which\nextracts acoustic, speaker, and textual features from surrounding sentences to\ngenerate context-sensitive prosodic features, more accurately emulating human\nprosody generation. We further propose two practical algorithms tailored for\ndistinct speech synthesis applications: CUC-VAE TTS for text-to-speech and\nCUC-VAE SE for speech editing. The CUC-VAE TTS is a direct application of the\nframework, designed to generate audio with contextual prosody derived from\nsurrounding texts. 
On the other hand, the CUC-VAE SE algorithm leverages real\nmel spectrogram sampling conditioned on contextual information, producing audio\nthat closely mirrors real sound and thereby facilitating flexible speech\nediting based on text such as deletion, insertion, and replacement.\nExperimental results on the LibriTTS datasets demonstrate that our proposed\nmodels significantly enhance speech synthesis and editing, producing more\nnatural and expressive speech.\n","authors":["Yang Li","Cheng Yu","Guangzhi Sun","Weiqin Zu","Zheng Tian","Ying Wen","Wei Pan","Chao Zhang","Jun Wang","Yang Yang","Fanglei Sun"],"pdf_url":"https://arxiv.org/pdf/2309.04156v1.pdf","comment":"13 pages;"},{"id":"http://arxiv.org/abs/2309.04146v1","updated":"2023-09-08T06:23:25Z","published":"2023-09-08T06:23:25Z","title":"NESTLE: a No-Code Tool for Statistical Analysis of Legal Corpus","summary":" The statistical analysis of large scale legal corpus can provide valuable\nlegal insights. For such analysis one needs to (1) select a subset of the\ncorpus using document retrieval tools, (2) structuralize text using information\nextraction (IE) systems, and (3) visualize the data for the statistical\nanalysis. Each process demands either specialized tools or programming skills\nwhereas no comprehensive unified \"no-code\" tools have been available.\nEspecially for IE, if the target information is not predefined in the ontology\nof the IE system, one needs to build their own system. Here we provide NESTLE,\na no code tool for large-scale statistical analysis of legal corpus. With\nNESTLE, users can search target documents, extract information, and visualize\nthe structured data all via the chat interface with accompanying auxiliary GUI\nfor the fine-level control. NESTLE consists of three main components: a search\nengine, an end-to-end IE system, and a Large Language Model (LLM) that glues\nthe whole components together and provides the chat interface. Powered by LLM\nand the end-to-end IE system, NESTLE can extract any type of information that\nhas not been predefined in the IE system opening up the possibility of\nunlimited customizable statistical analysis of the corpus without writing a\nsingle line of code. The use of the custom end-to-end IE system also enables\nfaster and low-cost IE on large scale corpus. We validate our system on 15\nKorean precedent IE tasks and 3 legal text classification tasks from LEXGLUE.\nThe comprehensive experiments reveal NESTLE can achieve GPT-4 comparable\nperformance by training the internal IE module with 4 human-labeled, and 192\nLLM-labeled examples. The detailed analysis provides the insight on the\ntrade-off between accuracy, time, and cost in building such system.\n","authors":["Kyoungyeon Cho","Seungkum Han","Wonseok Hwang"],"pdf_url":"https://arxiv.org/pdf/2309.04146v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.04141v1","updated":"2023-09-08T05:50:27Z","published":"2023-09-08T05:50:27Z","title":"RST-style Discourse Parsing Guided by Document-level Content Structures","summary":" Rhetorical Structure Theory based Discourse Parsing (RST-DP) explores how\nclauses, sentences, and large text spans compose a whole discourse and presents\nthe rhetorical structure as a hierarchical tree. Existing RST parsing pipelines\nconstruct rhetorical structures without the knowledge of document-level content\nstructures, which causes relatively low performance when predicting the\ndiscourse relations for large text spans. 
Recognizing the value of high-level\ncontent-related information in facilitating discourse relation recognition, we\npropose a novel pipeline for RST-DP that incorporates structure-aware news\ncontent sentence representations derived from the task of News Discourse\nProfiling. By incorporating only a few additional layers, this enhanced\npipeline exhibits promising performance across various RST parsing metrics.\n","authors":["Ming Li","Ruihong Huang"],"pdf_url":"https://arxiv.org/pdf/2309.04141v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2210.09537v2","updated":"2023-09-08T05:37:35Z","published":"2022-10-18T02:07:09Z","title":"Less is More: A Lightweight and Robust Neural Architecture for Discourse\n Parsing","summary":" Complex feature extractors are widely employed for text representation\nbuilding. However, these complex feature extractors make the NLP systems prone\nto overfitting especially when the downstream training datasets are relatively\nsmall, which is the case for several discourse parsing tasks. Thus, we propose\nan alternative lightweight neural architecture that removes multiple complex\nfeature extractors and only utilizes learnable self-attention modules to\nindirectly exploit pretrained neural language models, in order to maximally\npreserve the generalizability of pre-trained language models. Experiments on\nthree common discourse parsing tasks show that powered by recent pretrained\nlanguage models, the lightweight architecture consisting of only two\nself-attention layers obtains much better generalizability and robustness.\nMeanwhile, it achieves comparable or even better system performance with fewer\nlearnable parameters and less processing time.\n","authors":["Ming Li","Ruihong Huang"],"pdf_url":"https://arxiv.org/pdf/2210.09537v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.12032v2","updated":"2023-09-08T05:11:01Z","published":"2023-08-23T09:45:29Z","title":"From Quantity to Quality: Boosting LLM Performance with Self-Guided Data\n Selection for Instruction Tuning","summary":" In the realm of Large Language Models, the balance between instruction data\nquality and quantity has become a focal point. Recognizing this, we introduce a\nself-guided methodology for LLMs to autonomously discern and select cherry\nsamples from vast open-source datasets, effectively minimizing manual curation\nand potential cost for instruction tuning an LLM. Our key innovation, the\nInstruction-Following Difficulty (IFD) metric, emerges as a pivotal tool to\nidentify discrepancies between a model's expected responses and its autonomous\ngeneration prowess. Through the adept application of IFD, cherry samples are\npinpointed, leading to a marked uptick in model training efficiency. Empirical\nvalidations on renowned datasets like Alpaca and WizardLM underpin our\nfindings; with a mere 10% of conventional data input, our strategy showcases\nimproved results. 
This synthesis of self-guided cherry-picking and the IFD\nmetric signifies a transformative leap in the optimization of LLMs, promising\nboth efficiency and resource-conscious advancements.\n","authors":["Ming Li","Yong Zhang","Zhitao Li","Jiuhai Chen","Lichang Chen","Ning Cheng","Jianzong Wang","Tianyi Zhou","Jing Xiao"],"pdf_url":"https://arxiv.org/pdf/2308.12032v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.04106v1","updated":"2023-09-08T03:58:05Z","published":"2023-09-08T03:58:05Z","title":"Meta predictive learning model of natural languages","summary":" Large language models based on self-attention mechanisms have achieved\nastonishing performances not only in natural language itself, but also in a\nvariety of tasks of different nature. However, regarding processing language,\nour human brain may not operate using the same principle. Then, a debate is\nestablished on the connection between brain computation and artificial\nself-supervision adopted in large language models. One of most influential\nhypothesis in brain computation is the predictive coding framework, which\nproposes to minimize the prediction error by local learning. However, the role\nof predictive coding and the associated credit assignment in language\nprocessing remains unknown. Here, we propose a mean-field learning model within\nthe predictive coding framework, assuming that the synaptic weight of each\nconnection follows a spike and slab distribution, and only the distribution is\ntrained. This meta predictive learning is successfully validated on classifying\nhandwritten digits where pixels are input to the network in sequence, and on\nthe toy and real language corpus. Our model reveals that most of the\nconnections become deterministic after learning, while the output connections\nhave a higher level of variability. The performance of the resulting network\nensemble changes continuously with data load, further improving with more\ntraining data, in analogy with the emergent behavior of large language models.\nTherefore, our model provides a starting point to investigate the physics and\nbiology correspondences of the language processing and the unexpected general\nintelligence.\n","authors":["Chan Li","Junbin Qiu","Haiping Huang"],"pdf_url":"https://arxiv.org/pdf/2309.04106v1.pdf","comment":"23 pages, 6 figures, codes are available in the main text with the\n link"},{"id":"http://arxiv.org/abs/2309.03667v2","updated":"2023-09-08T03:46:33Z","published":"2023-09-07T12:10:47Z","title":"Exploring an LM to generate Prolog Predicates from Mathematics Questions","summary":" Recently, there has been a surge in interest in NLP driven by ChatGPT.\nChatGPT, a transformer-based generative language model of substantial scale,\nexhibits versatility in performing various tasks based on natural language.\nNevertheless, large language models often exhibit poor performance in solving\nmathematics questions that require reasoning. Prior research has demonstrated\nthe effectiveness of chain-of-thought prompting in enhancing reasoning\ncapabilities. Now, we aim to investigate whether fine-tuning a model for the\ngeneration of Prolog codes, a logic language, and subsequently passing these\ncodes to a compiler can further improve accuracy. Consequently, we employ\nchain-of-thought to fine-tune LLaMA7B as a baseline model and develop other\nfine-tuned LLaMA7B models for the generation of Prolog code, Prolog code +\nchain-of-thought, and chain-of-thought + Prolog code, respectively. 
The results\nreveal that the Prolog generation model surpasses the baseline in performance,\nwhile the combination generation models do not yield significant improvements.\nThe Prolog corpus based on GSM8K and the correspondingly finetuned Prolog\ngeneration model based on LLaMA7B are released to the research community.\n","authors":["Xiaocheng Yang","Yik-Cheung Tam"],"pdf_url":"https://arxiv.org/pdf/2309.03667v2.pdf","comment":"6 pages, 3 figures"},{"id":"http://arxiv.org/abs/2309.04087v1","updated":"2023-09-08T02:56:30Z","published":"2023-09-08T02:56:30Z","title":"Unsupervised Multi-document Summarization with Holistic Inference","summary":" Multi-document summarization aims to obtain core information from a\ncollection of documents written on the same topic. This paper proposes a new\nholistic framework for unsupervised multi-document extractive summarization.\nOur method incorporates the holistic beam search inference method associated\nwith the holistic measurements, named Subset Representative Index (SRI). SRI\nbalances the importance and diversity of a subset of sentences from the source\ndocuments and can be calculated in unsupervised and adaptive manners. To\ndemonstrate the effectiveness of our method, we conduct extensive experiments\non both small and large-scale multi-document summarization datasets under both\nunsupervised and adaptive settings. The proposed method outperforms strong\nbaselines by a significant margin, as indicated by the resulting ROUGE scores\nand diversity measures. Our findings also suggest that diversity is essential\nfor improving multi-document summary performance.\n","authors":["Haopeng Zhang","Sangwoo Cho","Kaiqiang Song","Xiaoyang Wang","Hongwei Wang","Jiawei Zhang","Dong Yu"],"pdf_url":"https://arxiv.org/pdf/2309.04087v1.pdf","comment":"Findings of IJCNLP-AACL 2023"},{"id":"http://arxiv.org/abs/2308.15452v2","updated":"2023-09-08T02:31:35Z","published":"2023-08-29T17:22:39Z","title":"When Do Program-of-Thoughts Work for Reasoning?","summary":" The reasoning capabilities of Large Language Models (LLMs) play a pivotal\nrole in the realm of embodied artificial intelligence. Although there are\neffective methods like program-of-thought prompting for LLMs which uses\nprogramming language to tackle complex reasoning tasks, the specific impact of\ncode data on the improvement of reasoning capabilities remains under-explored.\nTo address this gap, we propose complexity-impacted reasoning score (CIRS),\nwhich combines structural and logical attributes, to measure the correlation\nbetween code and reasoning abilities. Specifically, we use the abstract syntax\ntree to encode the structural information and calculate logical complexity by\nconsidering the difficulty and the cyclomatic complexity. Through an empirical\nanalysis, we find not all code data of complexity can be learned or understood\nby LLMs. Optimal level of complexity is critical to the improvement of\nreasoning abilities by program-aided prompting. Then we design an\nauto-synthesizing and stratifying algorithm, and apply it to instruction\ngeneration for mathematical reasoning and code data filtering for code\ngeneration tasks. Extensive results demonstrates the effectiveness of our\nproposed approach. 
Code will be integrated into the EasyInstruct framework at\nhttps://github.com/zjunlp/EasyInstruct.\n","authors":["Zhen Bi","Ningyu Zhang","Yinuo Jiang","Shumin Deng","Guozhou Zheng","Huajun Chen"],"pdf_url":"https://arxiv.org/pdf/2308.15452v2.pdf","comment":"Work in progress"},{"id":"http://arxiv.org/abs/2309.03563v2","updated":"2023-09-08T01:00:52Z","published":"2023-09-07T08:50:45Z","title":"All Labels Together: Low-shot Intent Detection with an Efficient Label\n Semantic Encoding Paradigm","summary":" In intent detection tasks, leveraging meaningful semantic information from\nintent labels can be particularly beneficial for few-shot scenarios. However,\nexisting few-shot intent detection methods either ignore the intent labels,\n(e.g. treating intents as indices) or do not fully utilize this information\n(e.g. only using part of the intent labels). In this work, we present an\nend-to-end One-to-All system that enables the comparison of an input utterance\nwith all label candidates. The system can then fully utilize label semantics in\nthis way. Experiments on three few-shot intent detection tasks demonstrate that\nOne-to-All is especially effective when the training resource is extremely\nscarce, achieving state-of-the-art performance in 1-, 3- and 5-shot settings.\nMoreover, we present a novel pretraining strategy for our model that utilizes\nindirect supervision from paraphrasing, enabling zero-shot cross-domain\ngeneralization on intent detection tasks. Our code is at\nhttps://github.com/jiangshdd/AllLablesTogether.\n","authors":["Jiangshu Du","Congying Xia","Wenpeng Yin","Tingting Liang","Philip S. Yu"],"pdf_url":"https://arxiv.org/pdf/2309.03563v2.pdf","comment":"Accepted by IJCNLP-AACL 2023"},{"id":"http://arxiv.org/abs/2309.04635v1","updated":"2023-09-08T23:12:03Z","published":"2023-09-08T23:12:03Z","title":"Can NLP Models 'Identify', 'Distinguish', and 'Justify' Questions that\n Don't have a Definitive Answer?","summary":" Though state-of-the-art (SOTA) NLP systems have achieved remarkable\nperformance on a variety of language understanding tasks, they primarily focus\non questions that have a correct and a definitive answer. However, in\nreal-world applications, users often ask questions that don't have a definitive\nanswer. Incorrectly answering such questions certainly hampers a system's\nreliability and trustworthiness. Can SOTA models accurately identify such\nquestions and provide a reasonable response?\n To investigate the above question, we introduce QnotA, a dataset consisting\nof five different categories of questions that don't have definitive answers.\nFurthermore, for each QnotA instance, we also provide a corresponding QA\ninstance i.e. an alternate question that ''can be'' answered. With this data,\nwe formulate three evaluation tasks that test a system's ability to 'identify',\n'distinguish', and 'justify' QnotA questions. Through comprehensive\nexperiments, we show that even SOTA models including GPT-3 and Flan T5 do not\nfare well on these tasks and lack considerably behind the human performance\nbaseline. We conduct a thorough analysis which further leads to several\ninteresting findings. 
Overall, we believe our work and findings will encourage\nand facilitate further research in this important area and help develop more\nrobust models.\n","authors":["Ayushi Agarwal","Nisarg Patel","Neeraj Varshney","Mihir Parmar","Pavan Mallina","Aryan Bhavin Shah","Srihari Raju Sangaraju","Tirth Patel","Nihar Thakkar","Chitta Baral"],"pdf_url":"https://arxiv.org/pdf/2309.04635v1.pdf","comment":"TrustNLP Workshop at ACL 2023"},{"id":"http://arxiv.org/abs/2305.15070v2","updated":"2023-09-08T22:17:17Z","published":"2023-05-24T11:54:46Z","title":"Annotation Imputation to Individualize Predictions: Initial Studies on\n Distribution Dynamics and Model Predictions","summary":" Annotating data via crowdsourcing is time-consuming and expensive. Due to\nthese costs, dataset creators often have each annotator label only a small\nsubset of the data. This leads to sparse datasets with examples that are marked\nby few annotators. The downside of this process is that if an annotator doesn't\nget to label a particular example, their perspective on it is missed. This is\nespecially concerning for subjective NLP datasets where there is no single\ncorrect label: people may have different valid opinions. Thus, we propose using\nimputation methods to generate the opinions of all annotators for all examples,\ncreating a dataset that does not leave out any annotator's view. We then train\nand prompt models, using data from the imputed dataset, to make predictions\nabout the distribution of responses and individual annotations.\n In our analysis of the results, we found that the choice of imputation method\nsignificantly impacts soft label changes and distribution. While the imputation\nintroduces noise in the prediction of the original dataset, it has shown\npotential in enhancing shots for prompts, particularly for low-response-rate\nannotators. We have made all of our code and data publicly available.\n","authors":["London Lowmanstone","Ruyuan Wan","Risako Owan","Jaehyung Kim","Dongyeop Kang"],"pdf_url":"https://arxiv.org/pdf/2305.15070v2.pdf","comment":"NLPerspectives 2023 Conference, 39 pages, 13 figures, 13 tables"},{"id":"http://arxiv.org/abs/2309.04607v1","updated":"2023-09-08T21:50:10Z","published":"2023-09-08T21:50:10Z","title":"Linking Symptom Inventories using Semantic Textual Similarity","summary":" An extensive library of symptom inventories has been developed over time to\nmeasure clinical symptoms, but this variety has led to several long standing\nissues. Most notably, results drawn from different settings and studies are not\ncomparable, which limits reproducibility. Here, we present an artificial\nintelligence (AI) approach using semantic textual similarity (STS) to link\nsymptoms and scores across previously incongruous symptom inventories. We\ntested the ability of four pre-trained STS models to screen thousands of\nsymptom description pairs for related content - a challenging task typically\nrequiring expert panels. Models were tasked to predict symptom severity across\nfour different inventories for 6,607 participants drawn from 16 international\ndata sources. The STS approach achieved 74.8% accuracy across five tasks,\noutperforming other models tested. 
This work suggests that incorporating\ncontextual, semantic information can assist expert decision-making processes,\nyielding gains for both general and disease-specific clinical assessment.\n","authors":["Eamonn Kennedy","Shashank Vadlamani","Hannah M Lindsey","Kelly S Peterson","Kristen Dams OConnor","Kenton Murray","Ronak Agarwal","Houshang H Amiri","Raeda K Andersen","Talin Babikian","David A Baron","Erin D Bigler","Karen Caeyenberghs","Lisa Delano-Wood","Seth G Disner","Ekaterina Dobryakova","Blessen C Eapen","Rachel M Edelstein","Carrie Esopenko","Helen M Genova","Elbert Geuze","Naomi J Goodrich-Hunsaker","Jordan Grafman","Asta K Haberg","Cooper B Hodges","Kristen R Hoskinson","Elizabeth S Hovenden","Andrei Irimia","Neda Jahanshad","Ruchira M Jha","Finian Keleher","Kimbra Kenney","Inga K Koerte","Spencer W Liebel","Abigail Livny","Marianne Lovstad","Sarah L Martindale","Jeffrey E Max","Andrew R Mayer","Timothy B Meier","Deleene S Menefee","Abdalla Z Mohamed","Stefania Mondello","Martin M Monti","Rajendra A Morey","Virginia Newcombe","Mary R Newsome","Alexander Olsen","Nicholas J Pastorek","Mary Jo Pugh","Adeel Razi","Jacob E Resch","Jared A Rowland","Kelly Russell","Nicholas P Ryan","Randall S Scheibel","Adam T Schmidt","Gershon Spitz","Jaclyn A Stephens","Assaf Tal","Leah D Talbert","Maria Carmela Tartaglia","Brian A Taylor","Sophia I Thomopoulos","Maya Troyanskaya","Eve M Valera","Harm Jan van der Horn","John D Van Horn","Ragini Verma","Benjamin SC Wade","Willian SC Walker","Ashley L Ware","J Kent Werner Jr","Keith Owen Yeates","Ross D Zafonte","Michael M Zeineh","Brandon Zielinski","Paul M Thompson","Frank G Hillary","David F Tate","Elisabeth A Wilde","Emily L Dennis"],"pdf_url":"https://arxiv.org/pdf/2309.04607v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2005.12522v3","updated":"2023-09-08T21:44:52Z","published":"2020-05-26T05:41:58Z","title":"What Are People Asking About COVID-19? A Question Classification Dataset","summary":" We present COVID-Q, a set of 1,690 questions about COVID-19 from 13 sources,\nwhich we annotate into 15 question categories and 207 question clusters. The\nmost common questions in our dataset asked about transmission, prevention, and\nsocietal effects of COVID, and we found that many questions that appeared in\nmultiple sources were not answered by any FAQ websites of reputable\norganizations such as the CDC and FDA. We post our dataset publicly at\nhttps://github.com/JerryWeiAI/COVID-Q. For classifying questions into 15\ncategories, a BERT baseline scored 58.1% accuracy when trained on 20 examples\nper category, and for a question clustering task, a BERT + triplet loss\nbaseline achieved 49.5% accuracy. We hope COVID-Q can help either for direct\nuse in developing applied systems or as a domain-specific resource for model\nevaluation.\n","authors":["Jerry Wei","Chengyu Huang","Soroush Vosoughi","Jason Wei"],"pdf_url":"https://arxiv.org/pdf/2005.12522v3.pdf","comment":"Published in Proceedings of the 1st Workshop on NLP for COVID-19 at\n ACL 2020"},{"id":"http://arxiv.org/abs/2301.02275v2","updated":"2023-09-08T21:03:14Z","published":"2023-01-05T19:35:30Z","title":"Language as a Latent Sequence: deep latent variable models for\n semi-supervised paraphrase generation","summary":" This paper explores deep latent variable models for semi-supervised\nparaphrase generation, where the missing target pair for unlabelled data is\nmodelled as a latent paraphrase sequence. 
We present a novel unsupervised model\nnamed variational sequence auto-encoding reconstruction (VSAR), which performs\nlatent sequence inference given an observed text. To leverage information from\ntext pairs, we additionally introduce a novel supervised model we call dual\ndirectional learning (DDL), which is designed to integrate with our proposed\nVSAR model. Combining VSAR with DDL (DDL+VSAR) enables us to conduct\nsemi-supervised learning. Still, the combined model suffers from a cold-start\nproblem. To further combat this issue, we propose an improved weight\ninitialisation solution, leading to a novel two-stage training scheme we call\nknowledge-reinforced-learning (KRL). Our empirical evaluations suggest that the\ncombined model yields competitive performance against the state-of-the-art\nsupervised baselines on complete data. Furthermore, in scenarios where only a\nfraction of the labelled pairs are available, our combined model consistently\noutperforms the strong supervised model baseline (DDL) by a significant margin\n(p <.05; Wilcoxon test). Our code is publicly available at\n\"https://github.com/jialin-yu/latent-sequence-paraphrase\".\n","authors":["Jialin Yu","Alexandra I. Cristea","Anoushka Harit","Zhongtian Sun","Olanrewaju Tahir Aduragba","Lei Shi","Noura Al Moubayed"],"pdf_url":"https://arxiv.org/pdf/2301.02275v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.04564v1","updated":"2023-09-08T19:34:05Z","published":"2023-09-08T19:34:05Z","title":"When Less is More: Investigating Data Pruning for Pretraining LLMs at\n Scale","summary":" Large volumes of text data have contributed significantly to the development\nof large language models (LLMs) in recent years. This data is typically\nacquired by scraping the internet, leading to pretraining datasets comprised of\nnoisy web text. To date, efforts to prune these datasets down to a higher\nquality subset have relied on hand-crafted heuristics encoded as rule-based\nfilters. In this work, we take a wider view and explore scalable estimates of\ndata quality that can be used to systematically measure the quality of\npretraining data. We perform a rigorous comparison at scale of the simple data\nquality estimator of perplexity, as well as more sophisticated and\ncomputationally intensive estimates of the Error L2-Norm and memorization.\nThese metrics are used to rank and prune pretraining corpora, and we\nsubsequently compare LLMs trained on these pruned datasets. Surprisingly, we\nfind that the simple technique of perplexity outperforms our more\ncomputationally expensive scoring methods. We improve over our no-pruning\nbaseline while training on as little as 30% of the original training dataset.\nOur work sets the foundation for unexplored strategies in automatically\ncurating high quality corpora and suggests the majority of pretraining data can\nbe removed while retaining performance.\n","authors":["Max Marion","Ahmet Üstün","Luiza Pozzobon","Alex Wang","Marzieh Fadaee","Sara Hooker"],"pdf_url":"https://arxiv.org/pdf/2309.04564v1.pdf","comment":"14 pages, 8 figures"},{"id":"http://arxiv.org/abs/2309.04561v1","updated":"2023-09-08T19:27:01Z","published":"2023-09-08T19:27:01Z","title":"Three Ways to Improve Verbo-visual Fusion for Dense 3D Visual Grounding","summary":" 3D visual grounding is the task of localizing the object in a 3D scene which\nis referred by a description in natural language. With a wide range of\napplications ranging from autonomous indoor robotics to AR/VR, the task has\nrecently risen in popularity. 
A common formulation to tackle 3D visual\ngrounding is grounding-by-detection, where localization is done via bounding\nboxes. However, for real-life applications that require physical interactions,\na bounding box insufficiently describes the geometry of an object. We therefore\ntackle the problem of dense 3D visual grounding, i.e. referral-based 3D\ninstance segmentation. We propose a dense 3D grounding network ConcreteNet,\nfeaturing three novel stand-alone modules which aim to improve grounding\nperformance for challenging repetitive instances, i.e. instances with\ndistractors of the same semantic class. First, we introduce a bottom-up\nattentive fusion module that aims to disambiguate inter-instance relational\ncues, next we construct a contrastive training scheme to induce separation in\nthe latent space, and finally we resolve view-dependent utterances via a\nlearned global camera token. ConcreteNet ranks 1st on the challenging ScanRefer\nonline benchmark by a considerable +9.43% accuracy at 50% IoU and has won the\nICCV 3rd Workshop on Language for 3D Scenes \"3D Object Localization\" challenge.\n","authors":["Ozan Unal","Christos Sakaridis","Suman Saha","Fisher Yu","Luc Van Gool"],"pdf_url":"https://arxiv.org/pdf/2309.04561v1.pdf","comment":"Winner of the ICCV 2023 ScanRefer Challenge. This work has been\n submitted to the IEEE for possible publication. Copyright may be transferred\n without notice, after which this version may no longer be accessible"},{"id":"http://arxiv.org/abs/2309.04550v1","updated":"2023-09-08T18:44:47Z","published":"2023-09-08T18:44:47Z","title":"Retrieving Evidence from EHRs with LLMs: Possibilities and Challenges","summary":" Unstructured Electronic Health Record (EHR) data often contains critical\ninformation complementary to imaging data that would inform radiologists'\ndiagnoses. However, time constraints and the large volume of notes frequently\nassociated with individual patients renders manual perusal of such data to\nidentify relevant evidence infeasible in practice. Modern Large Language Models\n(LLMs) provide a flexible means of interacting with unstructured EHR data, and\nmay provide a mechanism to efficiently retrieve and summarize unstructured\nevidence relevant to a given query. In this work, we propose and evaluate an\nLLM (Flan-T5 XXL) for this purpose. Specifically, in a zero-shot setting we\ntask the LLM to infer whether a patient has or is at risk of a particular\ncondition; if so, we prompt the model to summarize the supporting evidence.\nEnlisting radiologists for manual evaluation, we find that this LLM-based\napproach provides outputs consistently preferred to a standard information\nretrieval baseline, but we also highlight the key outstanding challenge: LLMs\nare prone to hallucinating evidence. However, we provide results indicating\nthat model confidence in outputs might indicate when LLMs are hallucinating,\npotentially providing a means to address this.\n","authors":["Hiba Ahsan","Denis Jered McInerney","Jisoo Kim","Christopher Potter","Geoffrey Young","Silvio Amir","Byron C. Wallace"],"pdf_url":"https://arxiv.org/pdf/2309.04550v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.03941v2","updated":"2023-09-08T14:51:10Z","published":"2023-07-08T09:28:50Z","title":"Right to be Forgotten in the Era of Large Language Models: Implications,\n Challenges, and Solutions","summary":" The Right to be Forgotten (RTBF) was first established as the result of the\nruling of Google Spain SL, Google Inc. 
v AEPD, Mario Costeja Gonz\\'alez, and\nwas later included as the Right to Erasure under the General Data Protection\nRegulation (GDPR) of European Union to allow individuals the right to request\npersonal data be deleted by organizations. Specifically for search engines,\nindividuals can send requests to organizations to exclude their information\nfrom the query results. It was a significant emergent right as the result of\nthe evolution of technology. With the recent development of Large Language\nModels (LLMs) and their use in chatbots, LLM-enabled software systems have\nbecome popular. But they are not excluded from the RTBF. Compared with the\nindexing approach used by search engines, LLMs store, and process information\nin a completely different way. This poses new challenges for compliance with\nthe RTBF. In this paper, we explore these challenges and provide our insights\non how to implement technical solutions for the RTBF, including the use of\ndifferential privacy, machine unlearning, model editing, and prompt\nengineering. With the rapid advancement of AI and the increasing need of\nregulating this powerful technology, learning from the case of RTBF can provide\nvaluable lessons for technical practitioners, legal experts, organizations, and\nauthorities.\n","authors":["Dawen Zhang","Pamela Finckenberg-Broman","Thong Hoang","Shidong Pan","Zhenchang Xing","Mark Staples","Xiwei Xu"],"pdf_url":"https://arxiv.org/pdf/2307.03941v2.pdf","comment":"The new version made the following changes: 1. added an \"on-going\n discussion\" section and relevant references 2. added a stream of solutions\n (privacy-preserving machine learning) to technical solutions section 3. made\n minor changes on descriptions of certain technical terms 4. added references\n to some recent law proposals and court rulings"}],"Computer Vision and Pattern Recognition":[{"id":"http://arxiv.org/abs/2309.04462v1","updated":"2023-09-08T17:50:55Z","published":"2023-09-08T17:50:55Z","title":"Generalized Cross-domain Multi-label Few-shot Learning for Chest X-rays","summary":" Real-world application of chest X-ray abnormality classification requires\ndealing with several challenges: (i) limited training data; (ii) training and\nevaluation sets that are derived from different domains; and (iii) classes that\nappear during training may have partial overlap with classes of interest during\nevaluation. To address these challenges, we present an integrated framework\ncalled Generalized Cross-Domain Multi-Label Few-Shot Learning (GenCDML-FSL).\nThe framework supports overlap in classes during training and evaluation,\ncross-domain transfer, adopts meta-learning to learn using few training\nsamples, and assumes each chest X-ray image is either normal or associated with\none or more abnormalities. Furthermore, we propose Generalized Episodic\nTraining (GenET), a training strategy that equips models to operate with\nmultiple challenges observed in the GenCDML-FSL scenario. Comparisons with\nwell-established methods such as transfer learning, hybrid transfer learning,\nand multi-label meta-learning on multiple datasets show the superiority of our\napproach.\n","authors":["Aroof Aimen","Arsh Verma","Makarand Tapaswi","Narayanan C. 
Krishnan"],"pdf_url":"https://arxiv.org/pdf/2309.04462v1.pdf","comment":"17 pages"},{"id":"http://arxiv.org/abs/2309.04461v1","updated":"2023-09-08T17:49:44Z","published":"2023-09-08T17:49:44Z","title":"Measuring and Improving Chain-of-Thought Reasoning in Vision-Language\n Models","summary":" Vision-language models (VLMs) have recently demonstrated strong efficacy as\nvisual assistants that can parse natural queries about the visual content and\ngenerate human-like outputs. In this work, we explore the ability of these\nmodels to demonstrate human-like reasoning based on the perceived information.\nTo address a crucial concern regarding the extent to which their reasoning\ncapabilities are fully consistent and grounded, we also measure the reasoning\nconsistency of these models. We achieve this by proposing a chain-of-thought\n(CoT) based consistency measure. However, such an evaluation requires a\nbenchmark that encompasses both high-level inference and detailed reasoning\nchains, which is costly. We tackle this challenge by proposing a\nLLM-Human-in-the-Loop pipeline, which notably reduces cost while simultaneously\nensuring the generation of a high-quality dataset. Based on this pipeline and\nthe existing coarse-grained annotated dataset, we build the CURE benchmark to\nmeasure both the zero-shot reasoning performance and consistency of VLMs. We\nevaluate existing state-of-the-art VLMs, and find that even the best-performing\nmodel is unable to demonstrate strong visual reasoning capabilities and\nconsistency, indicating that substantial efforts are required to enable VLMs to\nperform visual reasoning as systematically and consistently as humans. As an\nearly step, we propose a two-stage training framework aimed at improving both\nthe reasoning performance and consistency of VLMs. The first stage involves\nemploying supervised fine-tuning of VLMs using step-by-step reasoning samples\nautomatically generated by LLMs. In the second stage, we further augment the\ntraining process by incorporating feedback provided by LLMs to produce\nreasoning chains that are highly consistent and grounded. We empirically\nhighlight the effectiveness of our framework in both reasoning performance and\nconsistency.\n","authors":["Yangyi Chen","Karan Sikka","Michael Cogswell","Heng Ji","Ajay Divakaran"],"pdf_url":"https://arxiv.org/pdf/2309.04461v1.pdf","comment":"The data is released at\n \\url{https://github.com/Yangyi-Chen/CoTConsistency}"},{"id":"http://arxiv.org/abs/2309.04453v1","updated":"2023-09-08T17:22:26Z","published":"2023-09-08T17:22:26Z","title":"WiSARD: A Labeled Visual and Thermal Image Dataset for Wilderness Search\n and Rescue","summary":" Sensor-equipped unoccupied aerial vehicles (UAVs) have the potential to help\nreduce search times and alleviate safety risks for first responders carrying\nout Wilderness Search and Rescue (WiSAR) operations, the process of finding and\nrescuing person(s) lost in wilderness areas. Unfortunately, visual sensors\nalone do not address the need for robustness across all the possible terrains,\nweather, and lighting conditions that WiSAR operations can be conducted in. The\nuse of multi-modal sensors, specifically visual-thermal cameras, is critical in\nenabling WiSAR UAVs to perform in diverse operating conditions. However, due to\nthe unique challenges posed by the wilderness context, existing dataset\nbenchmarks are inadequate for developing vision-based algorithms for autonomous\nWiSAR UAVs. 
To this end, we present WiSARD, a dataset with roughly 56,000\nlabeled visual and thermal images collected from UAV flights in various\nterrains, seasons, weather, and lighting conditions. To the best of our\nknowledge, WiSARD is the first large-scale dataset collected with multi-modal\nsensors for autonomous WiSAR operations. We envision that our dataset will\nprovide researchers with a diverse and challenging benchmark that can test the\nrobustness of their algorithms when applied to real-world (life-saving)\napplications.\n","authors":["Daniel Broyles","Christopher R. Hayner","Karen Leung"],"pdf_url":"https://arxiv.org/pdf/2309.04453v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2306.02074v2","updated":"2023-09-08T17:15:37Z","published":"2023-06-03T10:35:04Z","title":"A Conditional Generative Chatbot using Transformer Model","summary":" A Chatbot serves as a communication tool between a human user and a machine\nto achieve an appropriate answer based on the human input. In more recent\napproaches, a combination of Natural Language Processing and sequential models\nare used to build a generative Chatbot. The main challenge of these models is\ntheir sequential nature, which leads to less accurate results. To tackle this\nchallenge, in this paper, a novel architecture is proposed using conditional\nWasserstein Generative Adversarial Networks and a transformer model for answer\ngeneration in Chatbots. While the generator of the proposed model consists of a\nfull transformer model to generate an answer, the discriminator includes only\nthe encoder part of a transformer model followed by a classifier. To the best\nof our knowledge, this is the first time that a generative Chatbot is proposed\nusing the embedded transformer in both generator and discriminator models.\nRelying on the parallel computing of the transformer model, the results of the\nproposed model on the Cornell Movie-Dialog corpus and the Chit-Chat datasets\nconfirm the superiority of the proposed model compared to state-of-the-art\nalternatives using different evaluation metrics.\n","authors":["Nura Esfandiari","Kourosh Kiani","Razieh Rastgoo"],"pdf_url":"https://arxiv.org/pdf/2306.02074v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.04447v1","updated":"2023-09-08T17:13:22Z","published":"2023-09-08T17:13:22Z","title":"Demographic Disparities in 1-to-Many Facial Identification","summary":" Most studies to date that have examined demographic variations in face\nrecognition accuracy have analyzed 1-to-1 matching accuracy, using images that\ncould be described as \"government ID quality\". This paper analyzes the accuracy\nof 1-to-many facial identification across demographic groups, and in the\npresence of blur and reduced resolution in the probe image as might occur in\n\"surveillance camera quality\" images. Cumulative match characteristic\ncurves(CMC) are not appropriate for comparing propensity for rank-one\nrecognition errors across demographics, and so we introduce three metrics for\nthis: (1) d' metric between mated and non-mated score distributions, (2)\nabsolute score difference between thresholds in the high-similarity tail of the\nnon-mated and the low-similarity tail of the mated distribution, and (3)\ndistribution of (mated - non-mated rank one scores) across the set of probe\nimages. We find that demographic variation in 1-to-many accuracy does not\nentirely follow what has been observed in 1-to-1 matching accuracy. 
Also,\ndifferent from 1-to-1 accuracy, demographic comparison of 1-to-many accuracy\ncan be affected by different numbers of identities and images across\ndemographics. Finally, we show that increased blur in the probe image, or\nreduced resolution of the face in the probe image, can significantly increase\nthe false positive identification rate. And we show that the demographic\nvariation in these high blur or low resolution conditions is much larger for\nmale/ female than for African-American / Caucasian. The point that 1-to-many\naccuracy can potentially collapse in the context of processing \"surveillance\ncamera quality\" probe images against a \"government ID quality\" gallery is an\nimportant one.\n","authors":["Aman Bhatta","Gabriella Pangelinan","Micheal C. King","Kevin W. Bowyer"],"pdf_url":"https://arxiv.org/pdf/2309.04447v1.pdf","comment":"9 pages, 8 figures, Conference submission"},{"id":"http://arxiv.org/abs/2309.04441v1","updated":"2023-09-08T17:05:24Z","published":"2023-09-08T17:05:24Z","title":"Comparative Study of Visual SLAM-Based Mobile Robot Localization Using\n Fiducial Markers","summary":" This paper presents a comparative study of three modes for mobile robot\nlocalization based on visual SLAM using fiducial markers (i.e., square-shaped\nartificial landmarks with a black-and-white grid pattern): SLAM, SLAM with a\nprior map, and localization with a prior map. The reason for comparing the\nSLAM-based approaches leveraging fiducial markers is because previous work has\nshown their superior performance over feature-only methods, with less\ncomputational burden compared to methods that use both feature and marker\ndetection without compromising the localization performance. The evaluation is\nconducted using indoor image sequences captured with a hand-held camera\ncontaining multiple fiducial markers in the environment. The performance\nmetrics include absolute trajectory error and runtime for the optimization\nprocess per frame. In particular, for the last two modes (SLAM and localization\nwith a prior map), we evaluate their performances by perturbing the quality of\nprior map to study the extent to which each mode is tolerant to such\nperturbations. Hardware experiments show consistent trajectory error levels\nacross the three modes, with the localization mode exhibiting the shortest\nruntime among them. Yet, with map perturbations, SLAM with a prior map\nmaintains performance, while localization mode degrades in both aspects.\n","authors":["Jongwon Lee","Su Yeon Choi","David Hanley","Timothy Bretl"],"pdf_url":"https://arxiv.org/pdf/2309.04441v1.pdf","comment":"IEEE 2023 IROS Workshop \"Closing the Loop on Localization\". For more\n information, see https://oravus.github.io/vpr-workshop/index"},{"id":"http://arxiv.org/abs/2309.04437v1","updated":"2023-09-08T17:01:34Z","published":"2023-09-08T17:01:34Z","title":"Single View Refractive Index Tomography with Neural Fields","summary":" Refractive Index Tomography is an inverse problem in which we seek to\nreconstruct a scene's 3D refractive field from 2D projected image measurements.\nThe refractive field is not visible itself, but instead affects how the path of\na light ray is continuously curved as it travels through space. Refractive\nfields appear across a wide variety of scientific applications, from\ntranslucent cell samples in microscopy to fields of dark matter bending light\nfrom faraway galaxies. 
This problem poses a unique challenge because the\nrefractive field directly affects the path that light takes, making its\nrecovery a non-linear problem. In addition, in contrast with traditional\ntomography, we seek to recover the refractive field using a projected image\nfrom only a single viewpoint by leveraging knowledge of light sources scattered\nthroughout the medium. In this work, we introduce a method that uses a\ncoordinate-based neural network to model the underlying continuous refractive\nfield in a scene. We then use explicit modeling of rays' 3D spatial curvature\nto optimize the parameters of this network, reconstructing refractive fields\nwith an analysis-by-synthesis approach. The efficacy of our approach is\ndemonstrated by recovering refractive fields in simulation, and analyzing how\nrecovery is affected by the light source distribution. We then test our method\non a simulated dark matter mapping problem, where we recover the refractive\nfield underlying a realistic simulated dark matter distribution.\n","authors":["Brandon Zhao","Aviad Levis","Liam Connor","Pratul P. Srinivasan","Katherine L. Bouman"],"pdf_url":"https://arxiv.org/pdf/2309.04437v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.04430v1","updated":"2023-09-08T16:45:56Z","published":"2023-09-08T16:45:56Z","title":"Create Your World: Lifelong Text-to-Image Diffusion","summary":" Text-to-image generative models can produce diverse high-quality images of\nconcepts with a text prompt, which have demonstrated excellent ability in image\ngeneration, image translation, etc. We in this work study the problem of\nsynthesizing instantiations of a use's own concepts in a never-ending manner,\ni.e., create your world, where the new concepts from user are quickly learned\nwith a few examples. To achieve this goal, we propose a Lifelong text-to-image\nDiffusion Model (L2DM), which intends to overcome knowledge \"catastrophic\nforgetting\" for the past encountered concepts, and semantic \"catastrophic\nneglecting\" for one or more concepts in the text prompt. In respect of\nknowledge \"catastrophic forgetting\", our L2DM framework devises a task-aware\nmemory enhancement module and a elastic-concept distillation module, which\ncould respectively safeguard the knowledge of both prior concepts and each past\npersonalized concept. When generating images with a user text prompt, the\nsolution to semantic \"catastrophic neglecting\" is that a concept attention\nartist module can alleviate the semantic neglecting from concept aspect, and an\northogonal attention module can reduce the semantic binding from attribute\naspect. To the end, our model can generate more faithful image across a range\nof continual text prompts in terms of both qualitative and quantitative\nmetrics, when comparing with the related state-of-the-art models. The code will\nbe released at https://wenqiliang.github.io/.\n","authors":["Gan Sun","Wenqi Liang","Jiahua Dong","Jun Li","Zhengming Ding","Yang Cong"],"pdf_url":"https://arxiv.org/pdf/2309.04430v1.pdf","comment":"15 pages,10 figures"},{"id":"http://arxiv.org/abs/2309.04422v1","updated":"2023-09-08T16:33:27Z","published":"2023-09-08T16:33:27Z","title":"Video Task Decathlon: Unifying Image and Video Tasks in Autonomous\n Driving","summary":" Performing multiple heterogeneous visual tasks in dynamic scenes is a\nhallmark of human perception capability. 
Despite remarkable progress in image\nand video recognition via representation learning, current research still\nfocuses on designing specialized networks for singular, homogeneous, or simple\ncombination of tasks. We instead explore the construction of a unified model\nfor major image and video recognition tasks in autonomous driving with diverse\ninput and output structures. To enable such an investigation, we design a new\nchallenge, Video Task Decathlon (VTD), which includes ten representative image\nand video tasks spanning classification, segmentation, localization, and\nassociation of objects and pixels. On VTD, we develop our unified network,\nVTDNet, that uses a single structure and a single set of weights for all ten\ntasks. VTDNet groups similar tasks and employs task interaction stages to\nexchange information within and between task groups. Given the impracticality\nof labeling all tasks on all frames, and the performance degradation associated\nwith joint training of many tasks, we design a Curriculum training,\nPseudo-labeling, and Fine-tuning (CPF) scheme to successfully train VTDNet on\nall tasks and mitigate performance loss. Armed with CPF, VTDNet significantly\noutperforms its single-task counterparts on most tasks with only 20% overall\ncomputations. VTD is a promising new direction for exploring the unification of\nperception tasks in autonomous driving.\n","authors":["Thomas E. Huang","Yifan Liu","Luc Van Gool","Fisher Yu"],"pdf_url":"https://arxiv.org/pdf/2309.04422v1.pdf","comment":"ICCV 2023, project page at https://www.vis.xyz/pub/vtd"},{"id":"http://arxiv.org/abs/2309.04421v1","updated":"2023-09-08T16:32:56Z","published":"2023-09-08T16:32:56Z","title":"SynthoGestures: A Novel Framework for Synthetic Dynamic Hand Gesture\n Generation for Driving Scenarios","summary":" Creating a diverse and comprehensive dataset of hand gestures for dynamic\nhuman-machine interfaces in the automotive domain can be challenging and\ntime-consuming. To overcome this challenge, we propose using synthetic gesture\ndatasets generated by virtual 3D models. Our framework utilizes Unreal Engine\nto synthesize realistic hand gestures, offering customization options and\nreducing the risk of overfitting. Multiple variants, including gesture speed,\nperformance, and hand shape, are generated to improve generalizability. In\naddition, we simulate different camera locations and types, such as RGB,\ninfrared, and depth cameras, without incurring additional time and cost to\nobtain these cameras. Experimental results demonstrate that our proposed\nframework,\nSynthoGestures\\footnote{\\url{https://github.com/amrgomaaelhady/SynthoGestures}},\nimproves gesture recognition accuracy and can replace or augment real-hand\ndatasets. By saving time and effort in the creation of the data set, our tool\naccelerates the development of gesture recognition systems for automotive\napplications.\n","authors":["Amr Gomaa","Robin Zitt","Guillermo Reyes","Antonio Krüger"],"pdf_url":"https://arxiv.org/pdf/2309.04421v1.pdf","comment":"Shorter versions are accepted as AutomotiveUI2023 Work in Progress\n and UIST2023 Poster Papers"},{"id":"http://arxiv.org/abs/2111.14093v3","updated":"2023-09-08T16:23:49Z","published":"2021-11-28T10:16:38Z","title":"Adaptive Reordering Sampler with Neurally Guided MAGSAC","summary":" We propose a new sampler for robust estimators that always selects the sample\nwith the highest probability of consisting only of inliers. 
After every\nunsuccessful iteration, the inlier probabilities are updated in a principled\nway via a Bayesian approach. The probabilities obtained by the deep network are\nused as prior (so-called neural guidance) inside the sampler. Moreover, we\nintroduce a new loss that exploits, in a geometrically justifiable manner, the\norientation and scale that can be estimated for any type of feature, e.g., SIFT\nor SuperPoint, to estimate two-view geometry. The new loss helps to learn\nhigher-order information about the underlying scene geometry. Benefiting from\nthe new sampler and the proposed loss, we combine the neural guidance with the\nstate-of-the-art MAGSAC++. Adaptive Reordering Sampler with Neurally Guided\nMAGSAC (ARS-MAGSAC) is superior to the state-of-the-art in terms of accuracy\nand run-time on the PhotoTourism and KITTI datasets for essential and\nfundamental matrix estimation. The code and trained models are available at\nhttps://github.com/weitong8591/ars_magsac.\n","authors":["Tong Wei","Jiri Matas","Daniel Barath"],"pdf_url":"https://arxiv.org/pdf/2111.14093v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.00359v2","updated":"2023-09-08T16:18:53Z","published":"2023-09-01T09:34:49Z","title":"Large Content And Behavior Models To Understand, Simulate, And Optimize\n Content And Behavior","summary":" Shannon, in his seminal paper introducing information theory, divided the\ncommunication into three levels: technical, semantic, and effectivenss. While\nthe technical level is concerned with accurate reconstruction of transmitted\nsymbols, the semantic and effectiveness levels deal with the inferred meaning\nand its effect on the receiver. Thanks to telecommunications, the first level\nproblem has produced great advances like the internet. Large Language Models\n(LLMs) make some progress towards the second goal, but the third level still\nremains largely untouched. The third problem deals with predicting and\noptimizing communication for desired receiver behavior. LLMs, while showing\nwide generalization capabilities across a wide range of tasks, are unable to\nsolve for this. One reason for the underperformance could be a lack of\n\"behavior tokens\" in LLMs' training corpora. Behavior tokens define receiver\nbehavior over a communication, such as shares, likes, clicks, purchases,\nretweets, etc. While preprocessing data for LLM training, behavior tokens are\noften removed from the corpora as noise. Therefore, in this paper, we make some\ninitial progress towards reintroducing behavior tokens in LLM training. The\ntrained models, other than showing similar performance to LLMs on content\nunderstanding tasks, show generalization capabilities on behavior simulation,\ncontent simulation, behavior understanding, and behavior domain adaptation.\nUsing a wide range of tasks on two corpora, we show results on all these\ncapabilities. 
We call these models Large Content and Behavior Models (LCBMs).\nFurther, to spur more research on LCBMs, we release our new Content Behavior\nCorpus (CBC), a repository containing communicator, message, and corresponding\nreceiver behavior.\n","authors":["Ashmit Khandelwal","Aditya Agrawal","Aanisha Bhattacharyya","Yaman K Singla","Somesh Singh","Uttaran Bhattacharya","Ishita Dasgupta","Stefano Petrangeli","Rajiv Ratn Shah","Changyou Chen","Balaji Krishnamurthy"],"pdf_url":"https://arxiv.org/pdf/2309.00359v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.04410v1","updated":"2023-09-08T16:17:45Z","published":"2023-09-08T16:17:45Z","title":"DeformToon3D: Deformable 3D Toonification from Neural Radiance Fields","summary":" In this paper, we address the challenging problem of 3D toonification, which\ninvolves transferring the style of an artistic domain onto a target 3D face\nwith stylized geometry and texture. Although fine-tuning a pre-trained 3D GAN\non the artistic domain can produce reasonable performance, this strategy has\nlimitations in the 3D domain. In particular, fine-tuning can deteriorate the\noriginal GAN latent space, which affects subsequent semantic editing, and\nrequires independent optimization and storage for each new style, limiting\nflexibility and efficient deployment. To overcome these challenges, we propose\nDeformToon3D, an effective toonification framework tailored for hierarchical 3D\nGAN. Our approach decomposes 3D toonification into subproblems of geometry and\ntexture stylization to better preserve the original latent space. Specifically,\nwe devise a novel StyleField that predicts conditional 3D deformation to align\na real-space NeRF to the style space for geometry stylization. Thanks to the\nStyleField formulation, which already handles geometry stylization well,\ntexture stylization can be achieved conveniently via adaptive style mixing that\ninjects information of the artistic domain into the decoder of the pre-trained\n3D GAN. Due to the unique design, our method enables flexible style degree\ncontrol and shape-texture-specific style swap. Furthermore, we achieve\nefficient training without any real-world 2D-3D training pairs but proxy\nsamples synthesized from off-the-shelf 2D toonification models.\n","authors":["Junzhe Zhang","Yushi Lan","Shuai Yang","Fangzhou Hong","Quan Wang","Chai Kiat Yeo","Ziwei Liu","Chen Change Loy"],"pdf_url":"https://arxiv.org/pdf/2309.04410v1.pdf","comment":"ICCV 2023. Code: https://github.com/junzhezhang/DeformToon3D Project\n page: https://www.mmlab-ntu.com/project/deformtoon3d/"},{"id":"http://arxiv.org/abs/2309.04399v1","updated":"2023-09-08T15:53:37Z","published":"2023-09-08T15:53:37Z","title":"MaskDiffusion: Boosting Text-to-Image Consistency with Conditional Mask","summary":" Recent advancements in diffusion models have showcased their impressive\ncapacity to generate visually striking images. Nevertheless, ensuring a close\nmatch between the generated image and the given prompt remains a persistent\nchallenge. In this work, we identify that a crucial factor leading to the\ntext-image mismatch issue is the inadequate cross-modality relation learning\nbetween the prompt and the output image. To better align the prompt and image\ncontent, we advance the cross-attention with an adaptive mask, which is\nconditioned on the attention maps and the prompt embeddings, to dynamically\nadjust the contribution of each text token to the image features. 
This\nmechanism explicitly diminishes the ambiguity in semantic information embedding\nfrom the text encoder, leading to a boost of text-to-image consistency in the\nsynthesized images. Our method, termed MaskDiffusion, is training-free and\nhot-pluggable for popular pre-trained diffusion models. When applied to the\nlatent diffusion models, our MaskDiffusion can significantly improve the\ntext-to-image consistency with negligible computation overhead compared to the\noriginal diffusion models.\n","authors":["Yupeng Zhou","Daquan Zhou","Zuo-Liang Zhu","Yaxing Wang","Qibin Hou","Jiashi Feng"],"pdf_url":"https://arxiv.org/pdf/2309.04399v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.03494v2","updated":"2023-09-08T15:38:47Z","published":"2023-09-07T06:09:12Z","title":"Evaluating Deep Learning-based Melanoma Classification using\n Immunohistochemistry and Routine Histology: A Three Center Study","summary":" Pathologists routinely use immunohistochemical (IHC)-stained tissue slides\nagainst MelanA in addition to hematoxylin and eosin (H&E)-stained slides to\nimprove their accuracy in diagnosing melanomas. The use of diagnostic Deep\nLearning (DL)-based support systems for automated examination of tissue\nmorphology and cellular composition has been well studied in standard\nH&E-stained tissue slides. In contrast, there are few studies that analyze IHC\nslides using DL. Therefore, we investigated the separate and joint performance\nof ResNets trained on MelanA and corresponding H&E-stained slides. The MelanA\nclassifier achieved an area under receiver operating characteristics curve\n(AUROC) of 0.82 and 0.74 on out of distribution (OOD)-datasets, similar to the\nH&E-based benchmark classification of 0.81 and 0.75, respectively. A combined\nclassifier using MelanA and H&E achieved AUROCs of 0.85 and 0.81 on the OOD\ndatasets. DL MelanA-based assistance systems show the same performance as the\nbenchmark H&E classification and may be improved by multi stain classification\nto assist pathologists in their clinical routine.\n","authors":["Christoph Wies","Lucas Schneider","Sarah Haggenmueller","Tabea-Clara Bucher","Sarah Hobelsberger","Markus V. Heppt","Gerardo Ferrara","Eva I. Krieghoff-Henning","Titus J. Brinker"],"pdf_url":"https://arxiv.org/pdf/2309.03494v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2212.13185v3","updated":"2023-09-08T15:35:47Z","published":"2022-12-26T15:13:13Z","title":"Generalized Differentiable RANSAC","summary":" We propose $\\nabla$-RANSAC, a generalized differentiable RANSAC that allows\nlearning the entire randomized robust estimation pipeline. The proposed\napproach enables the use of relaxation techniques for estimating the gradients\nin the sampling distribution, which are then propagated through a\ndifferentiable solver. The trainable quality function marginalizes over the\nscores from all the models estimated within $\\nabla$-RANSAC to guide the\nnetwork learning accurate and useful inlier probabilities or to train feature\ndetection and matching networks. Our method directly maximizes the probability\nof drawing a good hypothesis, allowing us to learn better sampling\ndistributions. We test $\\nabla$-RANSAC on various real-world scenarios on\nfundamental and essential matrix estimation, and 3D point cloud registration,\noutdoors and indoors, with handcrafted and learning-based features. It is\nsuperior to the state-of-the-art in terms of accuracy while running at a\nsimilar speed to its less accurate alternatives. 
The code and trained models\nare available at https://github.com/weitong8591/differentiable_ransac.\n","authors":["Tong Wei","Yash Patel","Alexander Shekhovtsov","Jiri Matas","Daniel Barath"],"pdf_url":"https://arxiv.org/pdf/2212.13185v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.04379v1","updated":"2023-09-08T15:21:07Z","published":"2023-09-08T15:21:07Z","title":"Language Prompt for Autonomous Driving","summary":" A new trend in the computer vision community is to capture objects of\ninterest following flexible human command represented by a natural language\nprompt. However, the progress of using language prompts in driving scenarios is\nstuck in a bottleneck due to the scarcity of paired prompt-instance data. To\naddress this challenge, we propose the first object-centric language prompt set\nfor driving scenes within 3D, multi-view, and multi-frame space, named\nNuPrompt. It expands Nuscenes dataset by constructing a total of 35,367\nlanguage descriptions, each referring to an average of 5.3 object tracks. Based\non the object-text pairs from the new benchmark, we formulate a new\nprompt-based driving task, \\ie, employing a language prompt to predict the\ndescribed object trajectory across views and frames. Furthermore, we provide a\nsimple end-to-end baseline model based on Transformer, named PromptTrack.\nExperiments show that our PromptTrack achieves impressive performance on\nNuPrompt. We hope this work can provide more new insights for the autonomous\ndriving community. Dataset and Code will be made public at\n\\href{https://github.com/wudongming97/Prompt4Driving}{https://github.com/wudongming97/Prompt4Driving}.\n","authors":["Dongming Wu","Wencheng Han","Tiancai Wang","Yingfei Liu","Xiangyu Zhang","Jianbing Shen"],"pdf_url":"https://arxiv.org/pdf/2309.04379v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2303.12649v2","updated":"2023-09-08T15:17:05Z","published":"2023-03-22T15:30:44Z","title":"MI-SegNet: Mutual Information-Based US Segmentation for Unseen Domain\n Generalization","summary":" Generalization capabilities of learning-based medical image segmentation\nacross domains are currently limited by the performance degradation caused by\nthe domain shift, particularly for ultrasound (US) imaging. The quality of US\nimages heavily relies on carefully tuned acoustic parameters, which vary across\nsonographers, machines, and settings. To improve the generalizability on US\nimages across domains, we propose MI-SegNet, a novel mutual information (MI)\nbased framework to explicitly disentangle the anatomical and domain feature\nrepresentations; therefore, robust domain-independent segmentation can be\nexpected. Two encoders are employed to extract the relevant features for the\ndisentanglement. The segmentation only uses the anatomical feature map for its\nprediction. In order to force the encoders to learn meaningful feature\nrepresentations a cross-reconstruction method is used during training.\nTransformations, specific to either domain or anatomy are applied to guide the\nencoders in their respective feature extraction task. Additionally, any MI\npresent in both feature maps is punished to further promote separate feature\nspaces. 
We validate the generalizability of the proposed domain-independent\nsegmentation approach on several datasets with varying parameters and machines.\nFurthermore, we demonstrate the effectiveness of the proposed MI-SegNet serving\nas a pre-trained model by comparing it with state-of-the-art networks.\n","authors":["Yuan Bi","Zhongliang Jiang","Ricarda Clarenbach","Reza Ghotbi","Angelos Karlas","Nassir Navab"],"pdf_url":"https://arxiv.org/pdf/2303.12649v2.pdf","comment":"Accepted by MICCAI 2023"},{"id":"http://arxiv.org/abs/2309.04372v1","updated":"2023-09-08T15:06:05Z","published":"2023-09-08T15:06:05Z","title":"MoEController: Instruction-based Arbitrary Image Manipulation with\n Mixture-of-Expert Controllers","summary":" Diffusion-model-based text-guided image generation has recently made\nastounding progress, producing fascinating results in open-domain image\nmanipulation tasks. Few models, however, currently have complete zero-shot\ncapabilities for both global and local image editing due to the complexity and\ndiversity of image manipulation tasks. In this work, we propose a method with a\nmixture-of-expert (MOE) controllers to align the text-guided capacity of\ndiffusion models with different kinds of human instructions, enabling our model\nto handle various open-domain image manipulation tasks with natural language\ninstructions. First, we use large language models (ChatGPT) and conditional\nimage synthesis models (ControlNet) to generate a large number of global image\ntransfer dataset in addition to the instruction-based local image editing\ndataset. Then, using an MOE technique and task-specific adaptation training on\na large-scale dataset, our conditional diffusion model can edit images globally\nand locally. Extensive experiments demonstrate that our approach performs\nsurprisingly well on various image manipulation tasks when dealing with\nopen-domain images and arbitrary human instructions. Please refer to our\nproject page: [https://oppo-mente-lab.github.io/moe_controller/]\n","authors":["Sijia Li","Chen Chen","Haonan Lu"],"pdf_url":"https://arxiv.org/pdf/2309.04372v1.pdf","comment":"5 pages,6 figures"},{"id":"http://arxiv.org/abs/2309.04366v1","updated":"2023-09-08T14:53:00Z","published":"2023-09-08T14:53:00Z","title":"CNN Injected Transformer for Image Exposure Correction","summary":" Capturing images with incorrect exposure settings fails to deliver a\nsatisfactory visual experience. Only when the exposure is properly set, can the\ncolor and details of the images be appropriately preserved. Previous exposure\ncorrection methods based on convolutions often produce exposure deviation in\nimages as a consequence of the restricted receptive field of convolutional\nkernels. This issue arises because convolutions are not capable of capturing\nlong-range dependencies in images accurately. To overcome this challenge, we\ncan apply the Transformer to address the exposure correction problem,\nleveraging its capability in modeling long-range dependencies to capture global\nrepresentation. However, solely relying on the window-based Transformer leads\nto visually disturbing blocking artifacts due to the application of\nself-attention in small patches. In this paper, we propose a CNN Injected\nTransformer (CIT) to harness the individual strengths of CNN and Transformer\nsimultaneously. Specifically, we construct the CIT by utilizing a window-based\nTransformer to exploit the long-range interactions among different regions in\nthe entire image. 
Within each CIT block, we incorporate a channel attention\nblock (CAB) and a half-instance normalization block (HINB) to assist the\nwindow-based self-attention to acquire the global statistics and refine local\nfeatures. In addition to the hybrid architecture design for exposure\ncorrection, we apply a set of carefully formulated loss functions to improve\nthe spatial coherence and rectify potential color deviations. Extensive\nexperiments demonstrate that our image exposure correction method outperforms\nstate-of-the-art approaches in terms of both quantitative and qualitative\nmetrics.\n","authors":["Shuning Xu","Xiangyu Chen","Binbin Song","Jiantao Zhou"],"pdf_url":"https://arxiv.org/pdf/2309.04366v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.04357v1","updated":"2023-09-08T14:28:28Z","published":"2023-09-08T14:28:28Z","title":"SSIG: A Visually-Guided Graph Edit Distance for Floor Plan Similarity","summary":" We propose a simple yet effective metric that measures structural similarity\nbetween visual instances of architectural floor plans, without the need for\nlearning. Qualitatively, our experiments show that the retrieval results are\nsimilar to deeply learned methods. Effectively comparing instances of floor\nplan data is paramount to the success of machine understanding of floor plan\ndata, including the assessment of floor plan generative models and floor plan\nrecommendation systems. Comparing visual floor plan images goes beyond a sole\npixel-wise visual examination and is crucially about similarities and\ndifferences in the shapes and relations between subdivisions that compose the\nlayout. Currently, deep metric learning approaches are used to learn a\npair-wise vector representation space that closely mimics the structural\nsimilarity, in which the models are trained on similarity labels that are\nobtained by Intersection-over-Union (IoU). To compensate for the lack of\nstructural awareness in IoU, graph-based approaches such as Graph Matching\nNetworks (GMNs) are used, which require pairwise inference for comparing data\ninstances, making GMNs less practical for retrieval applications. In this\npaper, an effective evaluation metric for judging the structural similarity of\nfloor plans, coined SSIG (Structural Similarity by IoU and GED), is proposed\nbased on both image and graph distances. In addition, an efficient algorithm is\ndeveloped that uses SSIG to rank a large-scale floor plan database. Code will\nbe openly available.\n","authors":["Casper van Engelenburg","Seyran Khademi","Jan van Gemert"],"pdf_url":"https://arxiv.org/pdf/2309.04357v1.pdf","comment":"To be published in ICCVW 2023, 10 pages"},{"id":"http://arxiv.org/abs/2309.04354v1","updated":"2023-09-08T14:24:10Z","published":"2023-09-08T14:24:10Z","title":"Mobile V-MoEs: Scaling Down Vision Transformers via Sparse\n Mixture-of-Experts","summary":" Sparse Mixture-of-Experts models (MoEs) have recently gained popularity due\nto their ability to decouple model size from inference efficiency by only\nactivating a small subset of the model parameters for any given input token. As\nsuch, sparse MoEs have enabled unprecedented scalability, resulting in\ntremendous successes across domains such as natural language processing and\ncomputer vision. In this work, we instead explore the use of sparse MoEs to\nscale-down Vision Transformers (ViTs) to make them more attractive for\nresource-constrained vision applications. 
To this end, we propose a simplified\nand mobile-friendly MoE design where entire images rather than individual\npatches are routed to the experts. We also propose a stable MoE training\nprocedure that uses super-class information to guide the router. We empirically\nshow that our sparse Mobile Vision MoEs (V-MoEs) can achieve a better trade-off\nbetween performance and efficiency than the corresponding dense ViTs. For\nexample, for the ViT-Tiny model, our Mobile V-MoE outperforms its dense\ncounterpart by 3.39% on ImageNet-1k. For an even smaller ViT variant with only\n54M FLOPs inference cost, our MoE achieves an improvement of 4.66%.\n","authors":["Erik Daxberger","Floris Weers","Bowen Zhang","Tom Gunter","Ruoming Pang","Marcin Eichner","Michael Emmersberger","Yinfei Yang","Alexander Toshev","Xianzhi Du"],"pdf_url":"https://arxiv.org/pdf/2309.04354v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.00066v2","updated":"2023-09-08T14:15:50Z","published":"2023-08-31T18:13:01Z","title":"SoDaCam: Software-defined Cameras via Single-Photon Imaging","summary":" Reinterpretable cameras are defined by their post-processing capabilities\nthat exceed traditional imaging. We present \"SoDaCam\" that provides\nreinterpretable cameras at the granularity of photons, from photon-cubes\nacquired by single-photon devices. Photon-cubes represent the spatio-temporal\ndetections of photons as a sequence of binary frames, at frame-rates as high as\n100 kHz. We show that simple transformations of the photon-cube, or photon-cube\nprojections, provide the functionality of numerous imaging systems including:\nexposure bracketing, flutter shutter cameras, video compressive systems, event\ncameras, and even cameras that move during exposure. Our photon-cube\nprojections offer the flexibility of being software-defined constructs that are\nonly limited by what is computable, and shot-noise. We exploit this flexibility\nto provide new capabilities for the emulated cameras. As an added benefit, our\nprojections provide camera-dependent compression of photon-cubes, which we\ndemonstrate using an implementation of our projections on a novel compute\narchitecture that is designed for single-photon imaging.\n","authors":["Varun Sundar","Andrei Ardelean","Tristan Swedish","Claudio Bruschini","Edoardo Charbon","Mohit Gupta"],"pdf_url":"https://arxiv.org/pdf/2309.00066v2.pdf","comment":"Accepted at ICCV 2023 (oral). Project webpage can be found at\n https://wisionlab.com/project/sodacam/"},{"id":"http://arxiv.org/abs/2309.04342v1","updated":"2023-09-08T14:12:03Z","published":"2023-09-08T14:12:03Z","title":"Revealing the preference for correcting separated aberrations in joint\n optic-image design","summary":" The joint design of the optical system and the downstream algorithm is a\nchallenging and promising task. Due to the demand for balancing the global\noptimal of imaging systems and the computational cost of physical simulation,\nexisting methods cannot achieve efficient joint design of complex systems such\nas smartphones and drones. In this work, starting from the perspective of the\noptical design, we characterize the optics with separated aberrations.\nAdditionally, to bridge the hardware and software without gradients, an image\nsimulation system is presented to reproduce the genuine imaging procedure of\nlenses with large field-of-views. As for aberration correction, we propose a\nnetwork to perceive and correct the spatially varying aberrations and validate\nits superiority over state-of-the-art methods. 
Comprehensive experiments reveal\nthat the preference for correcting separated aberrations in joint design is as\nfollows: longitudinal chromatic aberration, lateral chromatic aberration,\nspherical aberration, field curvature, and coma, with astigmatism coming last.\nDrawing from the preference, a 10% reduction in the total track length of the\nconsumer-level mobile phone lens module is accomplished. Moreover, this\nprocedure spares more space for manufacturing deviations, realizing\nextreme-quality enhancement of computational photography. The optimization\nparadigm provides innovative insight into the practical joint design of\nsophisticated optical systems and post-processing algorithms.\n","authors":["Jingwen Zhou","Shiqi Chen","Zheng Ren","Wenguan Zhang","Jiapu Yan","Huajun Feng","Qi Li","Yueting Chen"],"pdf_url":"https://arxiv.org/pdf/2309.04342v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.04331v1","updated":"2023-09-08T13:55:16Z","published":"2023-09-08T13:55:16Z","title":"Leveraging Model Fusion for Improved License Plate Recognition","summary":" License Plate Recognition (LPR) plays a critical role in various\napplications, such as toll collection, parking management, and traffic law\nenforcement. Although LPR has witnessed significant advancements through the\ndevelopment of deep learning, there has been a noticeable lack of studies\nexploring the potential improvements in results by fusing the outputs from\nmultiple recognition models. This research aims to fill this gap by\ninvestigating the combination of up to 12 different models using\nstraightforward approaches, such as selecting the most confident prediction or\nemploying majority vote-based strategies. Our experiments encompass a wide\nrange of datasets, revealing substantial benefits of fusion approaches in both\nintra- and cross-dataset setups. Essentially, fusing multiple models reduces\nconsiderably the likelihood of obtaining subpar performance on a particular\ndataset/scenario. We also found that combining models based on their speed is\nan appealing approach. Specifically, for applications where the recognition\ntask can tolerate some additional time, though not excessively, an effective\nstrategy is to combine 4-6 models. These models may not be the most accurate\nindividually, but their fusion strikes an optimal balance between accuracy and\nspeed.\n","authors":["Rayson Laroca","Luiz A. Zanlorensi","Valter Estevam","Rodrigo Minetto","David Menotti"],"pdf_url":"https://arxiv.org/pdf/2309.04331v1.pdf","comment":"Accepted for presentation at the Iberoamerican Congress on Pattern\n Recognition (CIARP) 2023"},{"id":"http://arxiv.org/abs/2309.04312v1","updated":"2023-09-08T13:18:10Z","published":"2023-09-08T13:18:10Z","title":"AMLP:Adaptive Masking Lesion Patches for Self-supervised Medical Image\n Segmentation","summary":" Self-supervised masked image modeling has shown promising results on natural\nimages. However, directly applying such methods to medical images remains\nchallenging. This difficulty stems from the complexity and distinct\ncharacteristics of lesions compared to natural images, which impedes effective\nrepresentation learning. Additionally, conventional high fixed masking ratios\nrestrict reconstructing fine lesion details, limiting the scope of learnable\ninformation. 
To tackle these limitations, we propose a novel self-supervised\nmedical image segmentation framework, Adaptive Masking Lesion Patches (AMLP).\nSpecifically, we design a Masked Patch Selection (MPS) strategy to identify and\nfocus learning on patches containing lesions. Lesion regions are scarce yet\ncritical, making their precise reconstruction vital. To reduce\nmisclassification of lesion and background patches caused by unsupervised\nclustering in MPS, we introduce an Attention Reconstruction Loss (ARL) to focus\non hard-to-reconstruct patches likely depicting lesions. We further propose a\nCategory Consistency Loss (CCL) to refine patch categorization based on\nreconstruction difficulty, strengthening distinction between lesions and\nbackground. Moreover, we develop an Adaptive Masking Ratio (AMR) strategy that\ngradually increases the masking ratio to expand reconstructible information and\nimprove learning. Extensive experiments on two medical segmentation datasets\ndemonstrate AMLP's superior performance compared to existing self-supervised\napproaches. The proposed strategies effectively address limitations in applying\nmasked modeling to medical images, tailored to capturing fine lesion details\nvital for segmentation tasks.\n","authors":["Xiangtao Wang","Ruizhi Wang","Jie Zhou","Thomas Lukasiewicz","Zhenghua Xu"],"pdf_url":"https://arxiv.org/pdf/2309.04312v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.06603v2","updated":"2023-09-08T13:03:24Z","published":"2023-08-12T16:14:44Z","title":"LadleNet: Translating Thermal Infrared Images to Visible Light Images\n Using A Scalable Two-stage U-Net","summary":" The translation of thermal infrared (TIR) images to visible light (VI) images\npresents a challenging task with potential applications spanning various\ndomains such as TIR-VI image registration and fusion. Leveraging supplementary\ninformation derived from TIR image conversions can significantly enhance model\nperformance and generalization across these applications. However, prevailing\nissues within this field include suboptimal image fidelity and limited model\nscalability. In this paper, we introduce an algorithm, LadleNet, based on the\nU-Net architecture. LadleNet employs a two-stage U-Net concatenation structure,\naugmented with skip connections and refined feature aggregation techniques,\nresulting in a substantial enhancement in model performance. Comprising\n'Handle' and 'Bowl' modules, LadleNet's Handle module facilitates the\nconstruction of an abstract semantic space, while the Bowl module decodes this\nsemantic space to yield mapped VI images. The Handle module exhibits\nextensibility by allowing the substitution of its network architecture with\nsemantic segmentation networks, thereby establishing more abstract semantic\nspaces to bolster model performance. Consequently, we propose LadleNet+, which\nreplaces LadleNet's Handle module with the pre-trained DeepLabv3+ network,\nthereby endowing the model with enhanced semantic space construction\ncapabilities. The proposed method is evaluated and tested on the KAIST dataset,\naccompanied by quantitative and qualitative analyses. Compared to existing\nmethodologies, our approach achieves state-of-the-art performance in terms of\nimage clarity and perceptual quality. 
The source code will be made available at\nhttps://github.com/Ach-1914/LadleNet/tree/main/.\n","authors":["Tonghui Zou","Lei Chen"],"pdf_url":"https://arxiv.org/pdf/2308.06603v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.04302v1","updated":"2023-09-08T13:02:36Z","published":"2023-09-08T13:02:36Z","title":"Have We Ever Encountered This Before? Retrieving Out-of-Distribution\n Road Obstacles from Driving Scenes","summary":" In the life cycle of highly automated systems operating in an open and\ndynamic environment, the ability to adjust to emerging challenges is crucial.\nFor systems integrating data-driven AI-based components, rapid responses to\ndeployment issues require fast access to related data for testing and\nreconfiguration. In the context of automated driving, this especially applies\nto road obstacles that were not included in the training data, commonly\nreferred to as out-of-distribution (OoD) road obstacles. Given the availability\nof large uncurated recordings of driving scenes, a pragmatic approach is to\nquery a database to retrieve similar scenarios featuring the same safety\nconcerns due to OoD road obstacles. In this work, we extend beyond identifying\nOoD road obstacles in video streams and offer a comprehensive approach to\nextract sequences of OoD road obstacles using text queries, thereby proposing a\nway of curating a collection of OoD data for subsequent analysis. Our proposed\nmethod leverages the recent advances in OoD segmentation and multi-modal\nfoundation models to identify and efficiently extract safety-relevant scenes\nfrom unlabeled videos. We present a first approach for the novel task of\ntext-based OoD object retrieval, which addresses the question ''Have we ever\nencountered this before?''.\n","authors":["Youssef Shoeb","Robin Chan","Gesina Schwalbe","Azarm Nowzard","Fatma Güney","Hanno Gottschalk"],"pdf_url":"https://arxiv.org/pdf/2309.04302v1.pdf","comment":"11 pages, 7 figures, and 3 tables"},{"id":"http://arxiv.org/abs/2301.02667v2","updated":"2023-09-08T12:52:09Z","published":"2023-01-09T18:59:16Z","title":"Locomotion-Action-Manipulation: Synthesizing Human-Scene Interactions in\n Complex 3D Environments","summary":" Synthesizing interaction-involved human motions has been challenging due to\nthe high complexity of 3D environments and the diversity of possible human\nbehaviors within. We present LAMA, Locomotion-Action-MAnipulation, to\nsynthesize natural and plausible long-term human movements in complex indoor\nenvironments. The key motivation of LAMA is to build a unified framework to\nencompass a series of everyday motions including locomotion, scene interaction,\nand object manipulation. Unlike existing methods that require motion data\n\"paired\" with scanned 3D scenes for supervision, we formulate the problem as a\ntest-time optimization by using human motion capture data only for synthesis.\nLAMA leverages a reinforcement learning framework coupled with a motion\nmatching algorithm for optimization, and further exploits a motion editing\nframework via manifold learning to cover possible variations in interaction and\nmanipulation. Throughout extensive experiments, we demonstrate that LAMA\noutperforms previous approaches in synthesizing realistic motions in various\nchallenging scenarios. 
Project page: https://jiyewise.github.io/projects/LAMA/ .\n","authors":["Jiye Lee","Hanbyul Joo"],"pdf_url":"https://arxiv.org/pdf/2301.02667v2.pdf","comment":"Accepted to ICCV 2023"},{"id":"http://arxiv.org/abs/2302.10798v3","updated":"2023-09-08T12:36:08Z","published":"2023-02-17T09:37:17Z","title":"Learning a Consensus Sub-Network with Polarization Regularization and\n One Pass Training","summary":" The subject of green AI has been gaining attention within the deep learning\ncommunity given the recent trend of ever larger and more complex neural network\nmodels. Existing solutions for reducing the computational load of training at\ninference time usually involve pruning the network parameters. Pruning schemes\noften create extra overhead either by iterative training and fine-tuning for\nstatic pruning or repeated computation of a dynamic pruning graph. We propose a\nnew parameter pruning strategy for learning a lighter-weight sub-network that\nminimizes the energy cost while maintaining comparable performance to the fully\nparameterised network on given downstream tasks. Our proposed pruning scheme is\ngreen-oriented, as it only requires a one-off training to discover the optimal\nstatic sub-networks by dynamic pruning methods. The pruning scheme consists of\na binary gating module and a novel loss function to uncover sub-networks with\nuser-defined sparsity. Our method enables pruning and training simultaneously,\nwhich saves energy in both the training and inference phases and avoids extra\ncomputational overhead from gating modules at inference time. Our results on\nCIFAR-10 and CIFAR-100 suggest that our scheme can remove 50% of connections in\ndeep networks with less than 1% reduction in classification accuracy. Compared\nto other related pruning methods, our method demonstrates a lower drop in\naccuracy for equivalent reductions in computational cost.\n","authors":["Xiaoying Zhi","Varun Babbar","Pheobe Sun","Fran Silavong","Ruibo Shi","Sean Moran"],"pdf_url":"https://arxiv.org/pdf/2302.10798v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.04293v1","updated":"2023-09-08T12:28:40Z","published":"2023-09-08T12:28:40Z","title":"How Can We Tame the Long-Tail of Chest X-ray Datasets?","summary":" Chest X-rays (CXRs) are a medical imaging modality that is used to infer a\nlarge number of abnormalities. While it is hard to define an exhaustive list of\nthese abnormalities, which may co-occur on a chest X-ray, few of them are quite\ncommonly observed and are abundantly represented in CXR datasets used to train\ndeep learning models for automated inference. However, it is challenging for\ncurrent models to learn independent discriminatory features for labels that are\nrare but may be of high significance. Prior works focus on the combination of\nmulti-label and long tail problems by introducing novel loss functions or some\nmechanism of re-sampling or re-weighting the data. Instead, we propose that it\nis possible to achieve significant performance gains merely by choosing an\ninitialization for a model that is closer to the domain of the target dataset.\nThis method can complement the techniques proposed in existing literature, and\ncan easily be scaled to new labels. 
Finally, we also examine the veracity of\nsynthetically generated data to augment the tail labels and analyse its\ncontribution to improving model performance.\n","authors":["Arsh Verma"],"pdf_url":"https://arxiv.org/pdf/2309.04293v1.pdf","comment":"Extended Abstract presented at Computer Vision for Automated Medical\n Diagnosis Workshop at the International Conference on Computer Vision 2023,\n October 2nd 2023, Paris, France, & Virtual, https://cvamd2023.github.io, 7\n pages"},{"id":"http://arxiv.org/abs/2305.02086v2","updated":"2023-09-08T11:58:41Z","published":"2023-05-03T12:44:20Z","title":"Revisiting the Encoding of Satellite Image Time Series","summary":" Satellite Image Time Series (SITS) representation learning is complex due to\nhigh spatiotemporal resolutions, irregular acquisition times, and intricate\nspatiotemporal interactions. These challenges result in specialized neural\nnetwork architectures tailored for SITS analysis. The field has witnessed\npromising results achieved by pioneering researchers, but transferring the\nlatest advances or established paradigms from Computer Vision (CV) to SITS is\nstill highly challenging due to the existing suboptimal representation learning\nframework. In this paper, we develop a novel perspective of SITS processing as\na direct set prediction problem, inspired by the recent trend in adopting\nquery-based transformer decoders to streamline the object detection or image\nsegmentation pipeline. We further propose to decompose the representation\nlearning process of SITS into three explicit steps: collect-update-distribute,\nwhich is computationally efficient and suits for irregularly-sampled and\nasynchronous temporal satellite observations. Facilitated by the unique\nreformulation, our proposed temporal learning backbone of SITS, initially\npre-trained on the resource efficient pixel-set format and then fine-tuned on\nthe downstream dense prediction tasks, has attained new state-of-the-art (SOTA)\nresults on the PASTIS benchmark dataset. Specifically, the clear separation\nbetween temporal and spatial components in the semantic/panoptic segmentation\npipeline of SITS makes us leverage the latest advances in CV, such as the\nuniversal image segmentation architecture, resulting in a noticeable 2.5 points\nincrease in mIoU and 8.8 points increase in PQ, respectively, compared to the\nbest scores reported so far.\n","authors":["Xin Cai","Yaxin Bi","Peter Nicholl","Roy Sterritt"],"pdf_url":"https://arxiv.org/pdf/2305.02086v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.12900v3","updated":"2023-09-08T11:55:57Z","published":"2023-07-24T15:47:21Z","title":"Automotive Object Detection via Learning Sparse Events by Spiking\n Neurons","summary":" Event-based sensors, distinguished by their high temporal resolution of\n1$\\mathrm{\\mu s}$ and a dynamic range of 120$\\mathrm{dB}$, stand out as ideal\ntools for deployment in fast-paced settings like vehicles and drones.\nTraditional object detection techniques that utilize Artificial Neural Networks\n(ANNs) face challenges due to the sparse and asynchronous nature of the events\nthese sensors capture. In contrast, Spiking Neural Networks (SNNs) offer a\npromising alternative, providing a temporal representation that is inherently\naligned with event-based data. This paper explores the unique membrane\npotential dynamics of SNNs and their ability to modulate sparse events. We\nintroduce an innovative spike-triggered adaptive threshold mechanism designed\nfor stable training. 
Building on these insights, we present a specialized\nspiking feature pyramid network (SpikeFPN) optimized for automotive event-based\nobject detection. Comprehensive evaluations demonstrate that SpikeFPN surpasses\nboth traditional SNNs and advanced ANNs enhanced with attention mechanisms.\nEvidently, SpikeFPN achieves a mean Average Precision (mAP) of 0.477 on the\n{GEN1 Automotive Detection (GAD)} benchmark dataset, marking a significant\nincrease of 9.7\\% over the previous best SNN. Moreover, the efficient design of\nSpikeFPN ensures robust performance while optimizing computational resources,\nattributed to its innate sparse computation capabilities.\n","authors":["Hu Zhang","Yanchen Li","Luziwei Leng","Kaiwei Che","Qian Liu","Qinghai Guo","Jianxing Liao","Ran Cheng"],"pdf_url":"https://arxiv.org/pdf/2307.12900v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.04247v1","updated":"2023-09-08T10:26:29Z","published":"2023-09-08T10:26:29Z","title":"Towards Practical Capture of High-Fidelity Relightable Avatars","summary":" In this paper, we propose a novel framework, Tracking-free Relightable Avatar\n(TRAvatar), for capturing and reconstructing high-fidelity 3D avatars. Compared\nto previous methods, TRAvatar works in a more practical and efficient setting.\nSpecifically, TRAvatar is trained with dynamic image sequences captured in a\nLight Stage under varying lighting conditions, enabling realistic relighting\nand real-time animation for avatars in diverse scenes. Additionally, TRAvatar\nallows for tracking-free avatar capture and obviates the need for accurate\nsurface tracking under varying illumination conditions. Our contributions are\ntwo-fold: First, we propose a novel network architecture that explicitly builds\non and ensures the satisfaction of the linear nature of lighting. Trained on\nsimple group light captures, TRAvatar can predict the appearance in real-time\nwith a single forward pass, achieving high-quality relighting effects under\nilluminations of arbitrary environment maps. Second, we jointly optimize the\nfacial geometry and relightable appearance from scratch based on image\nsequences, where the tracking is implicitly learned. This tracking-free\napproach brings robustness for establishing temporal correspondences between\nframes under different lighting conditions. Extensive qualitative and\nquantitative experiments demonstrate that our framework achieves superior\nperformance for photorealistic avatar animation and relighting.\n","authors":["Haotian Yang","Mingwu Zheng","Wanquan Feng","Haibin Huang","Yu-Kun Lai","Pengfei Wan","Zhongyuan Wang","Chongyang Ma"],"pdf_url":"https://arxiv.org/pdf/2309.04247v1.pdf","comment":"Accepted to SIGGRAPH Asia 2023 (Conference); Project page:\n https://travatar-paper.github.io/"},{"id":"http://arxiv.org/abs/2212.07207v4","updated":"2023-09-08T10:10:23Z","published":"2022-12-14T13:10:27Z","title":"MAELi: Masked Autoencoder for Large-Scale LiDAR Point Clouds","summary":" The sensing process of large-scale LiDAR point clouds inevitably causes large\nblind spots, i.e. regions not visible to the sensor. We demonstrate how these\ninherent sampling properties can be effectively utilized for self-supervised\nrepresentation learning by designing a highly effective pre-training framework\nthat considerably reduces the need for tedious 3D annotations to train\nstate-of-the-art object detectors. 
Our Masked AutoEncoder for LiDAR point\nclouds (MAELi) intuitively leverages the sparsity of LiDAR point clouds in both\nthe encoder and decoder during reconstruction. This results in more expressive\nand useful initialization, which can be directly applied to downstream\nperception tasks, such as 3D object detection or semantic segmentation for\nautonomous driving. In a novel reconstruction approach, MAELi distinguishes\nbetween empty and occluded space and employs a new masking strategy that\ntargets the LiDAR's inherent spherical projection. Thereby, without any ground\ntruth whatsoever and trained on single frames only, MAELi obtains an\nunderstanding of the underlying 3D scene geometry and semantics. To demonstrate\nthe potential of MAELi, we pre-train backbones in an end-to-end manner and show\nthe effectiveness of our unsupervised pre-trained weights on the tasks of 3D\nobject detection and semantic segmentation.\n","authors":["Georg Krispel","David Schinagl","Christian Fruhwirth-Reisinger","Horst Possegger","Horst Bischof"],"pdf_url":"https://arxiv.org/pdf/2212.07207v4.pdf","comment":"16 pages"},{"id":"http://arxiv.org/abs/2309.04228v1","updated":"2023-09-08T09:34:48Z","published":"2023-09-08T09:34:48Z","title":"FIVA: Facial Image and Video Anonymization and Anonymization Defense","summary":" In this paper, we present a new approach for facial anonymization in images\nand videos, abbreviated as FIVA. Our proposed method is able to maintain the\nsame face anonymization consistently over frames with our suggested\nidentity-tracking and guarantees a strong difference from the original face.\nFIVA allows for 0 true positives for a false acceptance rate of 0.001. Our work\nconsiders the important security issue of reconstruction attacks and\ninvestigates adversarial noise, uniform noise, and parameter noise to disrupt\nreconstruction attacks. In this regard, we apply different defense and\nprotection methods against these privacy threats to demonstrate the scalability\nof FIVA. On top of this, we also show that reconstruction attack models can be\nused for detection of deep fakes. Last but not least, we provide experimental\nresults showing how FIVA can even enable face swapping, which is purely trained\non a single target image.\n","authors":["Felix Rosberg","Eren Erdal Aksoy","Cristofer Englund","Fernando Alonso-Fernandez"],"pdf_url":"https://arxiv.org/pdf/2309.04228v1.pdf","comment":"Accepted to ICCVW 2023 - DFAD 2023"},{"id":"http://arxiv.org/abs/2309.04225v1","updated":"2023-09-08T09:19:18Z","published":"2023-09-08T09:19:18Z","title":"Long-Range Correlation Supervision for Land-Cover Classification from\n Remote Sensing Images","summary":" Long-range dependency modeling has been widely considered in modern deep\nlearning based semantic segmentation methods, especially those designed for\nlarge-size remote sensing images, to compensate the intrinsic locality of\nstandard convolutions. However, in previous studies, the long-range dependency,\nmodeled with an attention mechanism or transformer model, has been based on\nunsupervised learning, instead of explicit supervision from the objective\nground truth. In this paper, we propose a novel supervised long-range\ncorrelation method for land-cover classification, called the supervised\nlong-range correlation network (SLCNet), which is shown to be superior to the\ncurrently used unsupervised strategies. 
In SLCNet, pixels sharing the same\ncategory are considered highly correlated and those having different categories\nare less relevant, which can be easily supervised by the category consistency\ninformation available in the ground truth semantic segmentation map. Under such\nsupervision, the recalibrated features are more consistent for pixels of the\nsame category and more discriminative for pixels of other categories,\nregardless of their proximity. To complement the detailed information lacking\nin the global long-range correlation, we introduce an auxiliary adaptive\nreceptive field feature extraction module, parallel to the long-range\ncorrelation module in the encoder, to capture finely detailed feature\nrepresentations for multi-size objects in multi-scale remote sensing images. In\naddition, we apply multi-scale side-output supervision and a hybrid loss\nfunction as local and global constraints to further boost the segmentation\naccuracy. Experiments were conducted on three remote sensing datasets. Compared\nwith the advanced segmentation methods from the computer vision, medicine, and\nremote sensing communities, the SLCNet achieved a state-of-the-art performance\non all the datasets.\n","authors":["Dawen Yu","Shunping Ji"],"pdf_url":"https://arxiv.org/pdf/2309.04225v1.pdf","comment":"14 pages, 11 figures"},{"id":"http://arxiv.org/abs/2304.01534v2","updated":"2023-09-08T09:10:03Z","published":"2023-04-04T05:14:25Z","title":"FedBEVT: Federated Learning Bird's Eye View Perception Transformer in\n Road Traffic Systems","summary":" Bird's eye view (BEV) perception is becoming increasingly important in the\nfield of autonomous driving. It uses multi-view camera data to learn a\ntransformer model that directly projects the perception of the road environment\nonto the BEV perspective. However, training a transformer model often requires\na large amount of data, and as camera data for road traffic are often private,\nthey are typically not shared. Federated learning offers a solution that\nenables clients to collaborate and train models without exchanging data but\nmodel parameters. In this paper, we introduce FedBEVT, a federated transformer\nlearning approach for BEV perception. In order to address two common data\nheterogeneity issues in FedBEVT: (i) diverse sensor poses, and (ii) varying\nsensor numbers in perception systems, we propose two approaches -- Federated\nLearning with Camera-Attentive Personalization (FedCaP) and Adaptive\nMulti-Camera Masking (AMCM), respectively. To evaluate our method in real-world\nsettings, we create a dataset consisting of four typical federated use cases.\nOur findings suggest that FedBEVT outperforms the baseline approaches in all\nfour use cases, demonstrating the potential of our approach for improving BEV\nperception in autonomous driving.\n","authors":["Rui Song","Runsheng Xu","Andreas Festag","Jiaqi Ma","Alois Knoll"],"pdf_url":"https://arxiv.org/pdf/2304.01534v2.pdf","comment":"Accepted by IEEE T-IV. Code: https://github.com/rruisong/FedBEVT"},{"id":"http://arxiv.org/abs/2309.04220v1","updated":"2023-09-08T09:10:03Z","published":"2023-09-08T09:10:03Z","title":"Score-PA: Score-based 3D Part Assembly","summary":" Autonomous 3D part assembly is a challenging task in the areas of robotics\nand 3D computer vision. This task aims to assemble individual components into a\ncomplete shape without relying on predefined instructions. 
In this paper, we\nformulate this task from a novel generative perspective, introducing the\nScore-based 3D Part Assembly framework (Score-PA) for 3D part assembly. However,\nscore-based methods are typically time-consuming during the inference\nstage. To address this issue, we introduce a novel algorithm called the Fast\nPredictor-Corrector Sampler (FPC) that accelerates the sampling process within\nthe framework. We employ various metrics to assess assembly quality and\ndiversity, and our evaluation results demonstrate that our algorithm\noutperforms existing state-of-the-art approaches. We release our code at\nhttps://github.com/J-F-Cheng/Score-PA_Score-based-3D-Part-Assembly.\n","authors":["Junfeng Cheng","Mingdong Wu","Ruiyuan Zhang","Guanqi Zhan","Chao Wu","Hao Dong"],"pdf_url":"https://arxiv.org/pdf/2309.04220v1.pdf","comment":"BMVC 2023"},{"id":"http://arxiv.org/abs/2303.11219v4","updated":"2023-09-08T08:44:01Z","published":"2023-03-20T15:50:00Z","title":"NeTO:Neural Reconstruction of Transparent Objects with Self-Occlusion\n Aware Refraction-Tracing","summary":" We present a novel method, called NeTO, for capturing 3D geometry of solid\ntransparent objects from 2D images via volume rendering. Reconstructing\ntransparent objects is a very challenging task, for which general-purpose\nreconstruction techniques are ill-suited due to the specular light transport\nphenomena. Although existing refraction-tracing based methods, designed\nspecially for this task, achieve impressive results, they still suffer from\nunstable optimization and loss of fine details, since the explicit surface\nrepresentation they adopted is difficult to optimize, and the\nself-occlusion problem is ignored in refraction-tracing. In this paper, we\npropose to leverage an implicit Signed Distance Function (SDF) as the surface\nrepresentation, and optimize the SDF field via volume rendering with a\nself-occlusion aware refractive ray tracing. The implicit representation\nenables our method to produce high-quality reconstructions\neven with a limited set of images, and the self-occlusion aware strategy makes\nit possible for our method to accurately reconstruct the self-occluded regions.\nExperiments show that our method achieves faithful reconstruction results and\noutperforms prior works by a large margin. Visit our project page at\nhttps://www.xxlong.site/NeTO/\n","authors":["Zongcheng Li","Xiaoxiao Long","Yusen Wang","Tuo Cao","Wenping Wang","Fei Luo","Chunxia Xiao"],"pdf_url":"https://arxiv.org/pdf/2303.11219v4.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.04190v1","updated":"2023-09-08T08:03:42Z","published":"2023-09-08T08:03:42Z","title":"SegmentAnything helps microscopy images based automatic and quantitative\n organoid detection and analysis","summary":" Organoids are self-organized 3D cell clusters that closely mimic the\narchitecture and function of in vivo tissues and organs. Quantification of\norganoid morphology helps in studying organ development, drug discovery, and\ntoxicity assessment. Recent microscopy techniques provide a potent tool to\nacquire organoid morphology features, but manual image analysis remains a labor-\nand time-intensive process. Thus, this paper proposes a comprehensive pipeline\nfor microscopy analysis that leverages SegmentAnything to precisely\ndemarcate individual organoids. 
Additionally, we introduce a set of\nmorphological properties, including perimeter, area, radius, non-smoothness,\nand non-circularity, allowing researchers to analyze the organoid structures\nquantitatively and automatically. To validate the effectiveness of our\napproach, we conducted tests on bright-field images of human induced\npluripotent stem cell (iPSC)-derived neural-epithelial (NE) organoids. The\nresults obtained from our automatic pipeline closely align with manual organoid\ndetection and measurement, showcasing the capability of our proposed method in\naccelerating organoid morphology analysis.\n","authors":["Xiaodan Xing","Chunling Tang","Yunzhe Guo","Nicholas Kurniawan","Guang Yang"],"pdf_url":"https://arxiv.org/pdf/2309.04190v1.pdf","comment":"submitted to SPIE: Medical Imaging 2024"},{"id":"http://arxiv.org/abs/2309.04183v1","updated":"2023-09-08T07:53:58Z","published":"2023-09-08T07:53:58Z","title":"Stereo Matching in Time: 100+ FPS Video Stereo Matching for Extended\n Reality","summary":" Real-time Stereo Matching is a cornerstone algorithm for many Extended\nReality (XR) applications, such as indoor 3D understanding, video pass-through,\nand mixed-reality games. Despite significant advancements in deep stereo\nmethods, achieving real-time depth inference with high accuracy on a low-power\ndevice remains a major challenge. One of the major difficulties is the lack of\nhigh-quality indoor video stereo training datasets captured by head-mounted\nVR/AR glasses. To address this issue, we introduce a novel video stereo\nsynthetic dataset that comprises photorealistic renderings of various indoor\nscenes and realistic camera motion captured by a 6-DoF moving VR/AR\nhead-mounted display (HMD). This facilitates the evaluation of existing\napproaches and promotes further research on indoor augmented reality scenarios.\nOur newly proposed dataset enables us to develop a novel framework for\ncontinuous video-rate stereo matching.\n As another contribution, our dataset enables us to propose a new video-based\nstereo matching approach tailored for XR applications, which achieves real-time\ninference at an impressive 134fps on a standard desktop computer, or 30fps on a\nbattery-powered HMD. Our key insight is that disparity and contextual\ninformation are highly correlated and redundant between consecutive stereo\nframes. By unrolling an iterative cost aggregation in time (i.e. in the\ntemporal dimension), we are able to distribute and reuse the aggregated\nfeatures over time. This approach leads to a substantial reduction in\ncomputation without sacrificing accuracy. We conducted extensive evaluations\nand comparisons and demonstrated that our method achieves superior performance\ncompared to the current state-of-the-art, making it a strong contender for\nreal-time stereo matching in VR/AR applications.\n","authors":["Ziang Cheng","Jiayu Yang","Hongdong Li"],"pdf_url":"https://arxiv.org/pdf/2309.04183v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.03531v2","updated":"2023-09-08T07:43:50Z","published":"2023-09-07T07:26:27Z","title":"A Robust Negative Learning Approach to Partial Domain Adaptation Using\n Source Prototypes","summary":" This work proposes a robust Partial Domain Adaptation (PDA) framework that\nmitigates the negative transfer problem by incorporating a robust\ntarget-supervision strategy. It leverages ensemble learning and includes\ndiverse, complementary label feedback, alleviating the effect of incorrect\nfeedback and promoting pseudo-label refinement. 
Rather than relying exclusively\non first-order moments for distribution alignment, our approach offers explicit\nobjectives to optimize intra-class compactness and inter-class separation with\nthe inferred source prototypes and highly-confident target samples in a\ndomain-invariant fashion. Notably, we ensure source data privacy by eliminating\nthe need to access the source data during the adaptation phase through a priori\ninference of source prototypes. We conducted a series of comprehensive\nexperiments, including an ablation analysis, covering a range of partial domain\nadaptation tasks. Comprehensive evaluations on benchmark datasets corroborate\nour framework's enhanced robustness and generalization, demonstrating its\nsuperiority over existing state-of-the-art PDA approaches.\n","authors":["Sandipan Choudhuri","Suli Adeniye","Arunabha Sen"],"pdf_url":"https://arxiv.org/pdf/2309.03531v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.04172v1","updated":"2023-09-08T07:38:52Z","published":"2023-09-08T07:38:52Z","title":"Unsupervised Object Localization with Representer Point Selection","summary":" We propose a novel unsupervised object localization method that allows us to\nexplain the predictions of the model by utilizing self-supervised pre-trained\nmodels without additional finetuning. Existing unsupervised and self-supervised\nobject localization methods often utilize class-agnostic activation maps or\nself-similarity maps of a pre-trained model. Although these maps can offer\nvaluable information for localization, their limited ability to explain how the\nmodel makes predictions remains a challenge. In this paper, we propose a simple\nyet effective unsupervised object localization method based on representer\npoint selection, where the predictions of the model can be represented as a\nlinear combination of representer values of training points. By selecting\nrepresenter points, which are the most important examples for the model\npredictions, our model can provide insights into how the model predicts the\nforeground object by providing relevant examples as well as their importance.\nOur method outperforms the state-of-the-art unsupervised and self-supervised\nobject localization methods on various datasets with significant margins and\neven outperforms recent weakly supervised and few-shot methods.\n","authors":["Yeonghwan Song","Seokwoo Jang","Dina Katabi","Jeany Son"],"pdf_url":"https://arxiv.org/pdf/2309.04172v1.pdf","comment":"Accepted by ICCV 2023"},{"id":"http://arxiv.org/abs/2309.04171v1","updated":"2023-09-08T07:37:15Z","published":"2023-09-08T07:37:15Z","title":"PRISTA-Net: Deep Iterative Shrinkage Thresholding Network for Coded\n Diffraction Patterns Phase Retrieval","summary":" The problem of phase retrieval (PR) involves recovering an unknown image from\nlimited amplitude measurement data and is a challenging nonlinear inverse problem\nin computational imaging and image processing. However, many of the PR methods\nare based on black-box network models that lack interpretability, or on\nplug-and-play (PnP) frameworks that are computationally complex and require\ncareful parameter tuning. To address this, we have developed PRISTA-Net, a deep\nunfolding network (DUN) based on the first-order iterative shrinkage\nthresholding algorithm (ISTA). 
This network utilizes a learnable nonlinear\ntransformation to address the proximal-point mapping sub-problem associated\nwith the sparse priors, and an attention mechanism to focus on phase\ninformation containing image edges, textures, and structures. Additionally, the\nfast Fourier transform (FFT) is used to learn global features to enhance local\ninformation, and the designed logarithmic-based loss function leads to\nsignificant improvements when the noise level is low. All parameters in the\nproposed PRISTA-Net framework, including the nonlinear transformation,\nthreshold parameters, and step size, are learned end-to-end instead of being\nmanually set. This method combines the interpretability of traditional methods\nwith the fast inference ability of deep learning and is able to handle noise at\neach iteration during the unfolding stage, thus improving recovery quality.\nExperiments on Coded Diffraction Patterns (CDPs) measurements demonstrate that\nour approach outperforms the existing state-of-the-art methods in terms of\nqualitative and quantitative evaluations. Our source codes are available at\n\\emph{https://github.com/liuaxou/PRISTA-Net}.\n","authors":["Aoxu Liu","Xiaohong Fan","Yin Yang","Jianping Zhang"],"pdf_url":"https://arxiv.org/pdf/2309.04171v1.pdf","comment":"12 pages"},{"id":"http://arxiv.org/abs/2309.04169v1","updated":"2023-09-08T07:22:54Z","published":"2023-09-08T07:22:54Z","title":"Grouping Boundary Proposals for Fast Interactive Image Segmentation","summary":" Geodesic models are known as an efficient tool for solving various image\nsegmentation problems. Most of existing approaches only exploit local pointwise\nimage features to track geodesic paths for delineating the objective\nboundaries. However, such a segmentation strategy cannot take into account the\nconnectivity of the image edge features, increasing the risk of shortcut\nproblem, especially in the case of complicated scenario. In this work, we\nintroduce a new image segmentation model based on the minimal geodesic\nframework in conjunction with an adaptive cut-based circular optimal path\ncomputation scheme and a graph-based boundary proposals grouping scheme.\nSpecifically, the adaptive cut can disconnect the image domain such that the\ntarget contours are imposed to pass through this cut only once. The boundary\nproposals are comprised of precomputed image edge segments, providing the\nconnectivity information for our segmentation model. These boundary proposals\nare then incorporated into the proposed image segmentation model, such that the\ntarget segmentation contours are made up of a set of selected boundary\nproposals and the corresponding geodesic paths linking them. Experimental\nresults show that the proposed model indeed outperforms state-of-the-art\nminimal paths-based image segmentation approaches.\n","authors":["Li Liu","Da Chen","Minglei Shu","Laurent D. Cohen"],"pdf_url":"https://arxiv.org/pdf/2309.04169v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.10616v2","updated":"2023-09-08T07:19:22Z","published":"2023-07-20T06:32:14Z","title":"Heterogeneous Federated Learning: State-of-the-art and Research\n Challenges","summary":" Federated learning (FL) has drawn increasing attention owing to its potential\nuse in large-scale industrial applications. Existing federated learning works\nmainly focus on model homogeneous settings. 
However, practical federated\nlearning typically faces the heterogeneity of data distributions, model\narchitectures, network environments, and hardware devices among participant\nclients. Heterogeneous Federated Learning (HFL) is much more challenging, and\ncorresponding solutions are diverse and complex. Therefore, a systematic survey\non this topic about the research challenges and state-of-the-art is essential.\nIn this survey, we firstly summarize the various research challenges in HFL\nfrom five aspects: statistical heterogeneity, model heterogeneity,\ncommunication heterogeneity, device heterogeneity, and additional challenges.\nIn addition, recent advances in HFL are reviewed and a new taxonomy of existing\nHFL methods is proposed with an in-depth analysis of their pros and cons. We\nclassify existing methods from three different levels according to the HFL\nprocedure: data-level, model-level, and server-level. Finally, several critical\nand promising future research directions in HFL are discussed, which may\nfacilitate further developments in this field. A periodically updated\ncollection on HFL is available at https://github.com/marswhu/HFL_Survey.\n","authors":["Mang Ye","Xiuwen Fang","Bo Du","Pong C. Yuen","Dacheng Tao"],"pdf_url":"https://arxiv.org/pdf/2307.10616v2.pdf","comment":"42 pages, 11 figures, and 4 tables"},{"id":"http://arxiv.org/abs/2309.01361v2","updated":"2023-09-08T07:14:45Z","published":"2023-09-04T05:05:15Z","title":"High Frequency, High Accuracy Pointing onboard Nanosats using\n Neuromorphic Event Sensing and Piezoelectric Actuation","summary":" As satellites become smaller, the ability to maintain stable pointing\ndecreases as external forces acting on the satellite come into play. At the\nsame time, reaction wheels used in the attitude determination and control\nsystem (ADCS) introduce high frequency jitter which can disrupt pointing\nstability. For space domain awareness (SDA) tasks that track objects tens of\nthousands of kilometres away, the pointing accuracy offered by current\nnanosats, typically in the range of 10 to 100 arcseconds, is not sufficient. In\nthis work, we develop a novel payload that utilises a neuromorphic event sensor\n(for high frequency and highly accurate relative attitude estimation) paired in\na closed loop with a piezoelectric stage (for active attitude corrections) to\nprovide highly stable sensor-specific pointing. Event sensors are especially\nsuited for space applications due to their desirable characteristics of low\npower consumption, asynchronous operation, and high dynamic range. We use the\nevent sensor to first estimate a reference background star field from which\ninstantaneous relative attitude is estimated at high frequency. The\npiezoelectric stage works in a closed control loop with the event sensor to\nperform attitude corrections based on the discrepancy between the current and\ndesired attitude. Results in a controlled setting show that we can achieve a\npointing accuracy in the range of 1-5 arcseconds using our novel payload at an\noperating frequency of up to 50Hz using a prototype built from\ncommercial-off-the-shelf components. 
Further details can be found at\nhttps://ylatif.github.io/ultrafinestabilisation\n","authors":["Yasir Latif","Peter Anastasiou","Yonhon Ng","Zebb Prime","Tien-Fu Lu","Matthew Tetlow","Robert Mahony","Tat-Jun Chin"],"pdf_url":"https://arxiv.org/pdf/2309.01361v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.04158v1","updated":"2023-09-08T06:51:15Z","published":"2023-09-08T06:51:15Z","title":"Context-Aware Prompt Tuning for Vision-Language Model with\n Dual-Alignment","summary":" Large-scale vision-language models (VLMs), e.g., CLIP, learn broad visual\nconcepts from tedious training data, showing superb generalization ability.\nA number of prompt learning methods have been proposed to efficiently adapt the\nVLMs to downstream tasks with only a few training samples. We introduce a novel\nmethod to improve the prompt learning of vision-language models by\nincorporating pre-trained large language models (LLMs), called Dual-Aligned\nPrompt Tuning (DuAl-PT). Learnable prompts, like CoOp, implicitly model the\ncontext through end-to-end training, which makes them difficult to control and\ninterpret. While explicit context descriptions generated by LLMs, like GPT-3,\ncan be directly used for zero-shot classification, such prompts rely overly\non LLMs and remain underexplored in few-shot domains. With DuAl-PT, we\npropose to learn more context-aware prompts, benefiting from both explicit and\nimplicit context modeling. To achieve this, we introduce a pre-trained LLM to\ngenerate context descriptions, and we encourage the prompts to learn from the\nLLM's knowledge by alignment, as well as the alignment between prompts and\nlocal image features. Empirically, DuAl-PT achieves superior performance on 11\ndownstream datasets on few-shot recognition and base-to-new generalization.\nHopefully, DuAl-PT can serve as a strong baseline. Code will be available.\n","authors":["Hongyu Hu","Tiancheng Lin","Jie Wang","Zhenbang Sun","Yi Xu"],"pdf_url":"https://arxiv.org/pdf/2309.04158v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.04153v1","updated":"2023-09-08T06:37:25Z","published":"2023-09-08T06:37:25Z","title":"Mapping EEG Signals to Visual Stimuli: A Deep Learning Approach to Match\n vs. Mismatch Classification","summary":" Existing approaches to modeling associations between visual stimuli and brain\nresponses are facing difficulties in handling between-subject variance and\nmodel generalization. Inspired by the recent progress in modeling speech-brain\nresponse, we propose in this work a ``match-vs-mismatch'' deep learning model\nto classify whether a video clip induces excitatory responses in recorded EEG\nsignals and learn associations between the visual content and corresponding\nneural recordings. Using an exclusive experimental dataset, we demonstrate that\nthe proposed model is able to achieve the highest accuracy on unseen subjects\nas compared to other baseline models. Furthermore, we analyze the inter-subject\nnoise using a subject-level silhouette score in the embedding space and show\nthat the developed model is able to mitigate inter-subject noise and\nsignificantly reduce the silhouette score. Moreover, we examine the Grad-CAM\nactivation score and show that the brain regions associated with language\nprocessing contribute most to the model predictions, followed by regions\nassociated with visual processing. 
These results have the potential to\nfacilitate the development of neural recording-based video reconstruction and\nits related applications.\n","authors":["Yiqian Yang","Zhengqiao Zhao","Qian Wang","Yan Yang","Jingdong Chen"],"pdf_url":"https://arxiv.org/pdf/2309.04153v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.04148v1","updated":"2023-09-08T06:24:44Z","published":"2023-09-08T06:24:44Z","title":"Representation Synthesis by Probabilistic Many-Valued Logic Operation in\n Self-Supervised Learning","summary":" Self-supervised learning (SSL) using mixed images has been studied to learn\nvarious image representations. Existing methods using mixed images learn a\nrepresentation by maximizing the similarity between the representation of the\nmixed image and the synthesized representation of the original images. However,\nfew methods consider the synthesis of representations from the perspective of\nmathematical logic. In this study, we focused on a synthesis method of\nrepresentations. We proposed a new SSL with mixed images and a new\nrepresentation format based on many-valued logic. This format can indicate the\nfeature-possession degree, that is, how much of each image feature is possessed\nby a representation. This representation format and representation synthesis by\nlogic operation realize that the synthesized representation preserves the\nremarkable characteristics of the original representations. Our method\nperformed competitively with previous representation synthesis methods for\nimage classification tasks. We also examined the relationship between the\nfeature-possession degree and the number of classes of images in the multilabel\nimage classification dataset to verify that the intended learning was achieved.\nIn addition, we discussed image retrieval, which is an application of our\nproposed representation format using many-valued logic.\n","authors":["Hiroki Nakamura","Masashi Okada","Tadahiro Taniguchi"],"pdf_url":"https://arxiv.org/pdf/2309.04148v1.pdf","comment":"This work has been submitted to the IEEE for possible publication.\n Copyright may be transferred without notice, after which this version may no\n longer be accessible"},{"id":"http://arxiv.org/abs/2309.04147v1","updated":"2023-09-08T06:24:17Z","published":"2023-09-08T06:24:17Z","title":"Robot Localization and Mapping Final Report -- Sequential Adversarial\n Learning for Self-Supervised Deep Visual Odometry","summary":" Visual odometry (VO) and SLAM have been using multi-view geometry via local\nstructure from motion for decades. These methods have a slight disadvantage in\nchallenging scenarios such as low-texture images, dynamic scenarios, etc.\nMeanwhile, use of deep neural networks to extract high level features is\nubiquitous in computer vision. For VO, we can use these deep networks to\nextract depth and pose estimates using these high level features. The visual\nodometry task then can be modeled as an image generation task where the pose\nestimation is the by-product. This can also be achieved in a self-supervised\nmanner, thereby eliminating the data (supervised) intensive nature of training\ndeep neural networks. Although some works tried the similar approach [1], the\ndepth and pose estimation in the previous works are vague sometimes resulting\nin accumulation of error (drift) along the trajectory. The goal of this work is\nto tackle these limitations of past approaches and to develop a method that can\nprovide better depths and pose estimates. 
To address this, a couple of\napproaches are explored: 1) Modeling: Using optical flow and recurrent neural\nnetworks (RNN) in order to exploit spatio-temporal correlations which can\nprovide more information to estimate depth. 2) Loss function: Generative\nadversarial network (GAN) [2] is deployed to improve the depth estimation (and\nthereby pose too), as shown in Figure 1. This additional loss term improves the\nrealism in generated images and reduces artifacts.\n","authors":["Akankshya Kar","Sajal Maheshwari","Shamit Lal","Vinay Sameer Raja Kad"],"pdf_url":"https://arxiv.org/pdf/2309.04147v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.04145v1","updated":"2023-09-08T06:15:27Z","published":"2023-09-08T06:15:27Z","title":"Depth Completion with Multiple Balanced Bases and Confidence for Dense\n Monocular SLAM","summary":" Dense SLAM based on monocular cameras does indeed have immense application\nvalue in the field of AR/VR, especially when it is performed on a mobile\ndevice. In this paper, we propose a novel method that integrates a light-weight\ndepth completion network into a sparse SLAM system using a multi-basis depth\nrepresentation, so that dense mapping can be performed online even on a mobile\nphone. Specifically, we present a specifically optimized multi-basis depth\ncompletion network, called BBC-Net, tailored to the characteristics of\ntraditional sparse SLAM systems. BBC-Net can predict multiple balanced bases\nand a confidence map from a monocular image with sparse points generated by\noff-the-shelf keypoint-based SLAM systems. The final depth is a linear\ncombination of predicted depth bases that can be optimized by tuning the\ncorresponding weights. To seamlessly incorporate the weights into traditional\nSLAM optimization and ensure efficiency and robustness, we design a set of\ndepth weight factors, which makes our network a versatile plug-in module,\nfacilitating easy integration into various existing sparse SLAM systems and\nsignificantly enhancing global depth consistency through bundle adjustment. To\nverify the portability of our method, we integrate BBC-Net into two\nrepresentative SLAM systems. The experimental results on various datasets show\nthat the proposed method achieves better performance in monocular dense mapping\nthan the state-of-the-art methods. We provide an online demo running on a\nmobile phone, which verifies the efficiency and mapping quality of the proposed\nmethod in real-world scenarios.\n","authors":["Weijian Xie","Guanyi Chu","Quanhao Qian","Yihao Yu","Hai Li","Danpeng Chen","Shangjin Zhai","Nan Wang","Hujun Bao","Guofeng Zhang"],"pdf_url":"https://arxiv.org/pdf/2309.04145v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2203.11437v4","updated":"2023-09-08T05:41:47Z","published":"2022-03-22T03:17:15Z","title":"Representation Uncertainty in Self-Supervised Learning as Variational\n Inference","summary":" In this study, a novel self-supervised learning (SSL) method is proposed,\nwhich considers SSL in terms of variational inference to learn not only\nrepresentation but also representation uncertainties. SSL is a method of\nlearning representations without labels by maximizing the similarity between\nimage representations of different augmented views of an image. Meanwhile,\nvariational autoencoder (VAE) is an unsupervised representation learning method\nthat trains a probabilistic generative model with variational inference. 
Both\nVAE and SSL can learn representations without labels, but their relationship\nhas not been investigated in the past. Herein, the theoretical relationship\nbetween SSL and variational inference has been clarified. Furthermore, a novel\nmethod, namely variational inference SimSiam (VI-SimSiam), has been proposed.\nVI-SimSiam can predict the representation uncertainty by interpreting SimSiam\nwith variational inference and defining the latent space distribution. The\npresent experiments qualitatively show that VI-SimSiam could learn uncertainty\nby comparing input images and predicted uncertainties. Additionally, we\ndescribe a relationship between estimated uncertainty and classification\naccuracy.\n","authors":["Hiroki Nakamura","Masashi Okada","Tadahiro Taniguchi"],"pdf_url":"https://arxiv.org/pdf/2203.11437v4.pdf","comment":"Accepted to ICCV 2023"},{"id":"http://arxiv.org/abs/2309.04109v1","updated":"2023-09-08T04:10:01Z","published":"2023-09-08T04:10:01Z","title":"From Text to Mask: Localizing Entities Using the Attention of\n Text-to-Image Diffusion Models","summary":" Diffusion models have revolutionized the field of text-to-image generation\nrecently. The unique way of fusing text and image information contributes to\ntheir remarkable capability of generating highly text-related images. From\nanother perspective, these generative models imply clues about the precise\ncorrelation between words and pixels. In this work, a simple but effective\nmethod is proposed to utilize the attention mechanism in the denoising network\nof text-to-image diffusion models. Without re-training or inference-time\noptimization, the semantic grounding of phrases can be attained directly. We\nevaluate our method on Pascal VOC 2012 and Microsoft COCO 2014 under the\nweakly-supervised semantic segmentation setting and our method achieves\nsuperior performance to prior methods. In addition, the acquired word-pixel\ncorrelation is found to be generalizable to the learned text embedding of\ncustomized generation methods, requiring only a few modifications. To validate\nour discovery, we introduce a new practical task called \"personalized referring\nimage segmentation\" with a new dataset. Experiments in various situations\ndemonstrate the advantages of our method compared to strong baselines on this\ntask. In summary, our work reveals a novel way to extract the rich multi-modal\nknowledge hidden in diffusion models for segmentation.\n","authors":["Changming Xiao","Qi Yang","Feng Zhou","Changshui Zhang"],"pdf_url":"https://arxiv.org/pdf/2309.04109v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.04105v1","updated":"2023-09-08T03:56:34Z","published":"2023-09-08T03:56:34Z","title":"Weakly Supervised Point Clouds Transformer for 3D Object Detection","summary":" The annotation of 3D datasets is required for semantic segmentation and\nobject detection in scene understanding. In this paper we present a framework\nfor the weak supervision of a point cloud transformer that is used for 3D\nobject detection. The aim is to decrease the required amount of supervision\nneeded for training, as a result of the high cost of annotating 3D datasets.\nWe propose an Unsupervised Voting Proposal Module, which learns randomly preset\nanchor points and uses a voting network to select prepared anchor points of high\nquality. Then it distills information into the student and teacher networks. For\nthe student network, we apply a ResNet network to efficiently extract local\ncharacteristics. 
However, it also can lose much global information. To provide\nthe input which incorporates the global and local information as the input of\nstudent networks, we adopt the self-attention mechanism of transformer to\nextract global features, and the ResNet layers to extract region proposals. The\nteacher network supervises the classification and regression of the student\nnetwork using the pre-trained model on ImageNet. On the challenging KITTI\ndatasets, the experimental results have achieved the highest level of average\nprecision compared with the most recent weakly supervised 3D object detectors.\n","authors":["Zuojin Tang","Bo Sun","Tongwei Ma","Daosheng Li","Zhenhui Xu"],"pdf_url":"https://arxiv.org/pdf/2309.04105v1.pdf","comment":"International Conference on Intelligent Transportation Systems\n (ITSC), 2022"},{"id":"http://arxiv.org/abs/2309.04089v1","updated":"2023-09-08T02:58:17Z","published":"2023-09-08T02:58:17Z","title":"Toward Sufficient Spatial-Frequency Interaction for Gradient-aware\n Underwater Image Enhancement","summary":" Underwater images suffer from complex and diverse degradation, which\ninevitably affects the performance of underwater visual tasks. However, most\nexisting learning-based Underwater image enhancement (UIE) methods mainly\nrestore such degradations in the spatial domain, and rarely pay attention to\nthe fourier frequency information. In this paper, we develop a novel UIE\nframework based on spatial-frequency interaction and gradient maps, namely\nSFGNet, which consists of two stages. Specifically, in the first stage, we\npropose a dense spatial-frequency fusion network (DSFFNet), mainly including\nour designed dense fourier fusion block and dense spatial fusion block,\nachieving sufficient spatial-frequency interaction by cross connections between\nthese two blocks. In the second stage, we propose a gradient-aware corrector\n(GAC) to further enhance perceptual details and geometric structures of images\nby gradient map. Experimental results on two real-world underwater image\ndatasets show that our approach can successfully enhance underwater images, and\nachieves competitive performance in visual quality improvement.\n","authors":["Chen Zhao","Weiling Cai","Chenyu Dong","Ziqi Zeng"],"pdf_url":"https://arxiv.org/pdf/2309.04089v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.04084v1","updated":"2023-09-08T02:50:54Z","published":"2023-09-08T02:50:54Z","title":"Towards Efficient SDRTV-to-HDRTV by Learning from Image Formation","summary":" Modern displays are capable of rendering video content with high dynamic\nrange (HDR) and wide color gamut (WCG). However, the majority of available\nresources are still in standard dynamic range (SDR). As a result, there is\nsignificant value in transforming existing SDR content into the HDRTV standard.\nIn this paper, we define and analyze the SDRTV-to-HDRTV task by modeling the\nformation of SDRTV/HDRTV content. Our analysis and observations indicate that a\nnaive end-to-end supervised training pipeline suffers from severe gamut\ntransition errors. To address this issue, we propose a novel three-step\nsolution pipeline called HDRTVNet++, which includes adaptive global color\nmapping, local enhancement, and highlight refinement. The adaptive global color\nmapping step uses global statistics as guidance to perform image-adaptive color\nmapping. 
A local enhancement network is then deployed to enhance local details.\nFinally, we combine the two sub-networks above as a generator and achieve\nhighlight consistency through GAN-based joint training. Our method is primarily\ndesigned for ultra-high-definition TV content and is therefore effective and\nlightweight for processing 4K resolution images. We also construct a dataset\nusing HDR videos in the HDR10 standard, named HDRTV1K that contains 1235 and\n117 training images and 117 testing images, all in 4K resolution. Besides, we\nselect five metrics to evaluate the results of SDRTV-to-HDRTV algorithms. Our\nfinal results demonstrate state-of-the-art performance both quantitatively and\nvisually. The code, model and dataset are available at\nhttps://github.com/xiaom233/HDRTVNet-plus.\n","authors":["Xiangyu Chen","Zheyuan Li","Zhengwen Zhang","Jimmy S. Ren","Yihao Liu","Jingwen He","Yu Qiao","Jiantao Zhou","Chao Dong"],"pdf_url":"https://arxiv.org/pdf/2309.04084v1.pdf","comment":"Extended version of HDRTVNet"},{"id":"http://arxiv.org/abs/2305.03689v2","updated":"2023-09-08T02:46:19Z","published":"2023-05-05T17:00:16Z","title":"COLA: A Benchmark for Compositional Text-to-image Retrieval","summary":" Compositional reasoning is a hallmark of human visual intelligence; yet\ndespite the size of large vision-language models, they struggle to represent\nsimple compositions by combining objects with their attributes. To measure this\nlack of compositional capability, we design Cola, a text-to-image retrieval\nbenchmark to Compose Objects Localized with Attributes. To solve Cola, a model\nmust retrieve images with the correct configuration of attributes and objects,\nand avoid choosing a distractor image with the same objects and attributes but\nin the wrong configuration. Cola contains about 1.2k composed queries of 168\nobjects and 197 attributes on around 30K images. Our human evaluation finds\nthat Cola is 83.33% accurate, similar to contemporary compositionality\nbenchmarks. Using Cola as a testbed, we explore empirical modeling designs to\nadapt pre-trained vision-language models to reason compositionally. We explore\n6 adaptation strategies on 2 seminal vision-language models, using\ncompositionality-centric test benchmarks - Cola and CREPE. We find the optimal\nadaptation strategy is to train a multimodal attention layer that jointly\nattends over the frozen pre-trained image and language features. Surprisingly,\ntraining multimodal layers on CLIP performs better than tuning a larger FLAVA\nmodel with already pre-trained multimodal layers. Furthermore, our adaptation\nstrategy improves CLIP and FLAVA to comparable levels, suggesting that training\nmultimodal layers using contrastive attribute-object data is key, as opposed to\nusing them pre-trained. Lastly, we show that Cola is harder than a closely\nrelated contemporary benchmark, CREPE, since simpler fine-tuning strategies\nwithout multimodal layers suffice on CREPE, but not on Cola. However, we still\nsee a significant gap between our best adaptation and human accuracy,\nsuggesting considerable room for further research.\n","authors":["Arijit Ray","Filip Radenovic","Abhimanyu Dubey","Bryan A. Plummer","Ranjay Krishna","Kate Saenko"],"pdf_url":"https://arxiv.org/pdf/2305.03689v2.pdf","comment":"Under review. 
Webpage: https://github.com/arijitray1993/COLA"},{"id":"http://arxiv.org/abs/2309.04081v1","updated":"2023-09-08T02:42:40Z","published":"2023-09-08T02:42:40Z","title":"UER: A Heuristic Bias Addressing Approach for Online Continual Learning","summary":" Online continual learning aims to continuously train neural networks from a\ncontinuous data stream in a single pass over the data. As the most effective\napproach, rehearsal-based methods replay part of the previous data. Commonly\nused predictors in existing methods tend to generate biased dot-product logits\nthat favor the classes of the current data, which is known as a bias issue and\nis a manifestation of forgetting. Many approaches have been proposed to overcome the\nforgetting problem by correcting the bias; however, they still struggle in the\nonline setting. In this paper, we try to address the bias issue by\na more straightforward and more efficient method. By decomposing the\ndot-product logits into an angle factor and a norm factor, we empirically find\nthat the bias problem mainly occurs in the angle factor, which can be used to\nlearn novel knowledge as cosine logits. In contrast, the norm factor,\nwhich existing methods discard, helps retain historical knowledge. Based on\nthis observation, we intuitively propose to leverage the norm factor to balance\nthe new and old knowledge for addressing the bias. To this end, we develop a\nheuristic approach called unbias experience replay (UER). UER learns current\nsamples only by the angle factor and further replays previous samples by both\nthe norm and angle factors. Extensive experiments on three datasets show that\nUER achieves superior performance over various state-of-the-art methods. The\ncode is available at https://github.com/FelixHuiweiLin/UER.\n","authors":["Huiwei Lin","Shanshan Feng","Baoquan Zhang","Hongliang Qiao","Xutao Li","Yunming Ye"],"pdf_url":"https://arxiv.org/pdf/2309.04081v1.pdf","comment":"9 pages, 12 figures, ACM MM2023"},{"id":"http://arxiv.org/abs/2309.04071v1","updated":"2023-09-08T02:05:03Z","published":"2023-09-08T02:05:03Z","title":"Enhancing Hierarchical Transformers for Whole Brain Segmentation with\n Intracranial Measurements Integration","summary":" Whole brain segmentation with magnetic resonance imaging (MRI) enables the\nnon-invasive measurement of brain regions, including total intracranial volume\n(TICV) and posterior fossa volume (PFV). Enhancing the existing whole brain\nsegmentation methodology to incorporate intracranial measurements offers a\nheightened level of comprehensiveness in the analysis of brain structures.\nDespite its potential, the task of generalizing deep learning techniques for\nintracranial measurements faces data availability constraints due to limited\nmanually annotated atlases encompassing whole brain and TICV/PFV labels. In\nthis paper, we enhance the hierarchical transformer UNesT for whole brain\nsegmentation so that it segments the whole brain into 133 classes and estimates\nTICV/PFV simultaneously. To address the problem of data scarcity, the model is first\npretrained on 4859 T1-weighted (T1w) 3D volumes sourced from 8 different sites.\nThese volumes are processed through a multi-atlas segmentation pipeline for\nlabel generation, while TICV/PFV labels are unavailable. Subsequently, the\nmodel is finetuned with 45 T1w 3D volumes from the Open Access Series of Imaging\nStudies (OASIS) where both 133 whole brain classes and TICV/PFV labels are\navailable. We evaluate our method with the Dice similarity coefficient (DSC). 
We\nshow that our model is able to conduct precise TICV/PFV estimation while\nmaintaining the 132 brain regions performance at a comparable level. Code and\ntrained model are available at: https://github.com/MASILab/UNesT/wholebrainSeg.\n","authors":["Xin Yu","Yucheng Tang","Qi Yang","Ho Hin Lee","Shunxing Bao","Yuankai Huo","Bennett A. Landman"],"pdf_url":"https://arxiv.org/pdf/2309.04071v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2209.14378v2","updated":"2023-09-08T01:57:45Z","published":"2022-09-28T19:14:38Z","title":"UNesT: Local Spatial Representation Learning with Hierarchical\n Transformer for Efficient Medical Segmentation","summary":" Transformer-based models, capable of learning better global dependencies,\nhave recently demonstrated exceptional representation learning capabilities in\ncomputer vision and medical image analysis. Transformer reformats the image\ninto separate patches and realizes global communication via the self-attention\nmechanism. However, positional information between patches is hard to preserve\nin such 1D sequences, and loss of it can lead to sub-optimal performance when\ndealing with large amounts of heterogeneous tissues of various sizes in 3D\nmedical image segmentation. Additionally, current methods are not robust and\nefficient for heavy-duty medical segmentation tasks such as predicting a large\nnumber of tissue classes or modeling globally inter-connected tissue\nstructures. To address such challenges and inspired by the nested hierarchical\nstructures in vision transformer, we proposed a novel 3D medical image\nsegmentation method (UNesT), employing a simplified and faster-converging\ntransformer encoder design that achieves local communication among spatially\nadjacent patch sequences by aggregating them hierarchically. We extensively\nvalidate our method on multiple challenging datasets, consisting of multiple\nmodalities, anatomies, and a wide range of tissue classes, including 133\nstructures in the brain, 14 organs in the abdomen, 4 hierarchical components in\nthe kidneys, inter-connected kidney tumors and brain tumors. We show that UNesT\nconsistently achieves state-of-the-art performance and evaluate its\ngeneralizability and data efficiency. Particularly, the model achieves whole\nbrain segmentation task complete ROI with 133 tissue classes in a single\nnetwork, outperforming prior state-of-the-art method SLANT27 ensembled with 27\nnetworks.\n","authors":["Xin Yu","Qi Yang","Yinchi Zhou","Leon Y. Cai","Riqiang Gao","Ho Hin Lee","Thomas Li","Shunxing Bao","Zhoubing Xu","Thomas A. Lasko","Richard G. Abramson","Zizhao Zhang","Yuankai Huo","Bennett A. Landman","Yucheng Tang"],"pdf_url":"https://arxiv.org/pdf/2209.14378v2.pdf","comment":"19 pages, 17 figures. arXiv admin note: text overlap with\n arXiv:2203.02430"},{"id":"http://arxiv.org/abs/2309.04063v1","updated":"2023-09-08T01:41:35Z","published":"2023-09-08T01:41:35Z","title":"INSURE: An Information Theory Inspired Disentanglement and Purification\n Model for Domain Generalization","summary":" Domain Generalization (DG) aims to learn a generalizable model on the unseen\ntarget domain by only training on the multiple observed source domains.\nAlthough a variety of DG methods have focused on extracting domain-invariant\nfeatures, the domain-specific class-relevant features have attracted attention\nand been argued to benefit generalization to the unseen target domain. 
To take\ninto account the class-relevant domain-specific information, in this paper we\npropose an Information theory iNspired diSentanglement and pURification modEl\n(INSURE) to explicitly disentangle the latent features to obtain sufficient and\ncompact (necessary) class-relevant feature for generalization to the unseen\ndomain. Specifically, we first propose an information theory inspired loss\nfunction to ensure the disentangled class-relevant features contain sufficient\nclass label information and the other disentangled auxiliary feature has\nsufficient domain information. We further propose a paired purification loss\nfunction to let the auxiliary feature discard all the class-relevant\ninformation and thus the class-relevant feature will contain sufficient and\ncompact (necessary) class-relevant information. Moreover, instead of using\nmultiple encoders, we propose to use a learnable binary mask as our\ndisentangler to make the disentanglement more efficient and make the\ndisentangled features complementary to each other. We conduct extensive\nexperiments on four widely used DG benchmark datasets including PACS,\nOfficeHome, TerraIncognita, and DomainNet. The proposed INSURE outperforms the\nstate-of-the-art methods. We also empirically show that domain-specific\nclass-relevant features are beneficial for domain generalization.\n","authors":["Xi Yu","Huan-Hsin Tseng","Shinjae Yoo","Haibin Ling","Yuewei Lin"],"pdf_url":"https://arxiv.org/pdf/2309.04063v1.pdf","comment":"10 pages, 4 figures"},{"id":"http://arxiv.org/abs/2309.03744v2","updated":"2023-09-08T01:39:19Z","published":"2023-09-07T14:37:50Z","title":"Label-efficient Contrastive Learning-based model for nuclei detection\n and classification in 3D Cardiovascular Immunofluorescent Images","summary":" Recently, deep learning-based methods have achieved promising performance in\nnuclei detection and classification applications. However, training deep\nlearning-based methods requires a large amount of pixel-wise annotated data,\nwhich is time-consuming and labor-intensive, especially in 3D images. An\nalternative approach is to adapt weak-annotation methods, such as labeling each\nnucleus with a point, but this method does not extend from 2D histopathology\nimages (for which it was originally developed) to 3D immunofluorescent images.\nThe reason is that 3D images contain multiple channels (z-axis) for nuclei and\ndifferent markers separately, which makes training using point annotations\ndifficult. To address this challenge, we propose the Label-efficient\nContrastive learning-based (LECL) model to detect and classify various types of\nnuclei in 3D immunofluorescent images. Previous methods use Maximum Intensity\nProjection (MIP) to convert immunofluorescent images with multiple slices to 2D\nimages, which can cause signals from different z-stacks to falsely appear\nassociated with each other. To overcome this, we devised an Extended Maximum\nIntensity Projection (EMIP) approach that addresses the issues that arise when using MIP.\nFurthermore, we applied a Supervised Contrastive Learning (SCL) approach for\nweakly supervised settings. We conducted experiments on cardiovascular datasets\nand found that our proposed framework is effective and efficient in detecting\nand classifying various types of nuclei in 3D immunofluorescent images.\n","authors":["Nazanin Moradinasab","Rebecca A. Deaton","Laura S. Shankman","Gary K. Owens","Donald E. 
Brown"],"pdf_url":"https://arxiv.org/pdf/2309.03744v2.pdf","comment":"11 pages, 5 figures, MICCAI Workshop Conference 2023"},{"id":"http://arxiv.org/abs/2309.02185v2","updated":"2023-09-08T01:13:12Z","published":"2023-09-05T12:42:26Z","title":"BEVTrack: A Simple Baseline for 3D Single Object Tracking in\n Birds's-Eye-View","summary":" 3D single object tracking (SOT) in point clouds is still a challenging\nproblem due to appearance variation, distractors, and high sparsity of point\nclouds. Notably, in autonomous driving scenarios, the target object typically\nmaintains spatial adjacency across consecutive frames, predominantly moving\nhorizontally. This spatial continuity offers valuable prior knowledge for\ntarget localization. However, existing trackers, which often employ point-wise\nrepresentations, struggle to efficiently utilize this knowledge owing to the\nirregular format of such representations. Consequently, they require elaborate\ndesigns and solving multiple subtasks to establish spatial correspondence. In\nthis paper, we introduce BEVTrack, a simple yet strong baseline framework for\n3D SOT. After converting consecutive point clouds into the common\nBird's-Eye-View representation, BEVTrack inherently encodes spatial proximity\nand adeptly captures motion cues for tracking via a simple element-wise\noperation and convolutional layers. Additionally, to better deal with objects\nhaving diverse sizes and moving patterns, BEVTrack directly learns the\nunderlying motion distribution rather than making a fixed Laplacian or Gaussian\nassumption as in previous works. Without bells and whistles, BEVTrack achieves\nstate-of-the-art performance on KITTI and NuScenes datasets while maintaining a\nhigh inference speed of 122 FPS. The code will be released at\nhttps://github.com/xmm-prio/BEVTrack.\n","authors":["Yuxiang Yang","Yingqi Deng","Jiahao Nie","Jing Zhang"],"pdf_url":"https://arxiv.org/pdf/2309.02185v2.pdf","comment":"Technical report. Work in progress. The code will be released at\n https://github.com/xmm-prio/BEVTrack"},{"id":"http://arxiv.org/abs/2303.11444v2","updated":"2023-09-08T00:36:02Z","published":"2023-03-15T22:26:09Z","title":"Aerial Diffusion: Text Guided Ground-to-Aerial View Translation from a\n Single Image using Diffusion Models","summary":" We present a novel method, Aerial Diffusion, for generating aerial views from\na single ground-view image using text guidance. Aerial Diffusion leverages a\npretrained text-image diffusion model for prior knowledge. We address two main\nchallenges corresponding to domain gap between the ground-view and the aerial\nview and the two views being far apart in the text-image embedding manifold.\nOur approach uses a homography inspired by inverse perspective mapping prior to\nfinetuning the pretrained diffusion model. Additionally, using the text\ncorresponding to the ground-view to finetune the model helps us capture the\ndetails in the ground-view image at a relatively low bias towards the\nground-view image. Aerial Diffusion uses an alternating sampling strategy to\ncompute the optimal solution on complex high-dimensional manifold and generate\na high-fidelity (w.r.t. ground view) aerial image. We demonstrate the quality\nand versatility of Aerial Diffusion on a plethora of images from various\ndomains including nature, human actions, indoor scenes, etc. 
We qualitatively\nprove the effectiveness of our method with extensive ablations and comparisons.\nTo the best of our knowledge, Aerial Diffusion is the first approach that\nperforms ground-to-aerial translation in an unsupervised manner.\n","authors":["Divya Kothandaraman","Tianyi Zhou","Ming Lin","Dinesh Manocha"],"pdf_url":"https://arxiv.org/pdf/2303.11444v2.pdf","comment":"Code: https://github.com/divyakraman/AerialDiffusion"}],"Information Retrieval":[{"id":"http://arxiv.org/abs/2303.09999v2","updated":"2023-09-08T10:37:30Z","published":"2023-03-17T14:20:34Z","title":"STIXnet: A Novel and Modular Solution for Extracting All STIX Objects in\n CTI Reports","summary":" The automatic extraction of information from Cyber Threat Intelligence (CTI)\nreports is crucial in risk management. The increased frequency of the\npublications of these reports has led researchers to develop new systems for\nautomatically recovering different types of entities and relations from textual\ndata. Most state-of-the-art models leverage Natural Language Processing (NLP)\ntechniques, which perform greatly in extracting a few types of entities at a\ntime but cannot detect heterogeneous data or their relations. Furthermore,\nseveral paradigms, such as STIX, have become de facto standards in the CTI\ncommunity and dictate a formal categorization of different entities and\nrelations to enable organizations to share data consistently. This paper\npresents STIXnet, the first solution for the automated extraction of all STIX\nentities and relationships in CTI reports. Through the use of NLP techniques\nand an interactive Knowledge Base (KB) of entities, our approach obtains F1\nscores comparable to state-of-the-art models for entity extraction (0.916) and\nrelation extraction (0.724) while considering significantly more types of\nentities and relations. Moreover, STIXnet constitutes a modular and extensible\nframework that manages and coordinates different modules to merge their\ncontributions uniquely and exhaustively. With our approach, researchers and\norganizations can extend their Information Extraction (IE) capabilities by\nintegrating the efforts of several techniques without needing to develop new\ntools from scratch.\n","authors":["Francesco Marchiori","Mauro Conti","Nino Vincenzo Verde"],"pdf_url":"https://arxiv.org/pdf/2303.09999v2.pdf","comment":"11 pages, 3 figures"},{"id":"http://arxiv.org/abs/2309.04250v1","updated":"2023-09-08T10:28:41Z","published":"2023-09-08T10:28:41Z","title":"Provider Fairness and Beyond-Accuracy Trade-offs in Recommender Systems","summary":" Recommender systems, while transformative in online user experiences, have\nraised concerns over potential provider-side fairness issues. These systems may\ninadvertently favor popular items, thereby marginalizing less popular ones and\ncompromising provider fairness. While previous research has recognized\nprovider-side fairness issues, the investigation into how these biases affect\nbeyond-accuracy aspects of recommendation systems - such as diversity, novelty,\ncoverage, and serendipity - has been less emphasized. In this paper, we address\nthis gap by introducing a simple yet effective post-processing re-ranking model\nthat prioritizes provider fairness, while simultaneously maintaining user\nrelevance and recommendation quality. We then conduct an in-depth evaluation of\nthe model's impact on various aspects of recommendation quality across multiple\ndatasets. 
Specifically, we apply the post-processing algorithm to four distinct\nrecommendation models across four varied domain datasets, assessing the\nimprovement in each metric, encompassing both accuracy and beyond-accuracy\naspects. This comprehensive analysis allows us to gauge the effectiveness of\nour approach in mitigating provider biases. Our findings underscore the\neffectiveness of the adopted method in improving provider fairness and\nrecommendation quality. They also provide valuable insights into the trade-offs\ninvolved in achieving fairness in recommender systems, contributing to a more\nnuanced understanding of this complex issue.\n","authors":["Saeedeh Karimi","Hossein A. Rahmani","Mohammadmehdi Naghiaei","Leila Safari"],"pdf_url":"https://arxiv.org/pdf/2309.04250v1.pdf","comment":"FAccTRec at RecSys 2023"},{"id":"http://arxiv.org/abs/2309.04222v1","updated":"2023-09-08T09:11:26Z","published":"2023-09-08T09:11:26Z","title":"Offline Recommender System Evaluation under Unobserved Confounding","summary":" Off-Policy Estimation (OPE) methods allow us to learn and evaluate\ndecision-making policies from logged data. This makes them an attractive choice\nfor the offline evaluation of recommender systems, and several recent works\nhave reported successful adoption of OPE methods to this end. An important\nassumption that makes this work is the absence of unobserved confounders:\nrandom variables that influence both actions and rewards at data collection\ntime. Because the data collection policy is typically under the practitioner's\ncontrol, the unconfoundedness assumption is often left implicit, and its\nviolations are rarely dealt with in the existing literature.\n This work aims to highlight the problems that arise when performing\noff-policy estimation in the presence of unobserved confounders, specifically\nfocusing on a recommendation use-case. We focus on policy-based estimators,\nwhere the logging propensities are learned from logged data. We characterise\nthe statistical bias that arises due to confounding, and show how existing\ndiagnostics are unable to uncover such cases. Because the bias depends directly\non the true and unobserved logging propensities, it is non-identifiable. As the\nunconfoundedness assumption is famously untestable, this becomes especially\nproblematic. This paper emphasises this common, yet often overlooked issue.\nThrough synthetic data, we empirically show how na\\\"ive propensity estimation\nunder confounding can lead to severely biased metric estimates that are allowed\nto fly under the radar. We aim to cultivate an awareness among researchers and\npractitioners of this important problem, and touch upon potential research\ndirections towards mitigating its effects.\n","authors":["Olivier Jeunen","Ben London"],"pdf_url":"https://arxiv.org/pdf/2309.04222v1.pdf","comment":"Accepted at the CONSEQUENCES'23 workshop at RecSys '23"},{"id":"http://arxiv.org/abs/2309.04184v1","updated":"2023-09-08T07:54:11Z","published":"2023-09-08T07:54:11Z","title":"Receiving an algorithmic recommendation based on documentary filmmaking\n techniques","summary":" This article analyzes the reception of a novel algorithmic recommendation of\ndocumentary films by a panel of moviegoers of the T{\\\"e}nk platform. 
In order to propose an alternative to recommendations based on thematic\nclassification, the director, or the production period, a set of metadata was\ndeveloped as part of this experiment to characterize the great variety of\n``documentary filmmaking dispositifs''. The\ngoal is to investigate the different ways in which the platform's film lovers\nappropriate a personalized recommendation of 4 documentaries with similar\nfilmmaking dispositifs. To conclude, the contributions and limits of\nthis proof of concept are discussed in order to sketch out avenues for\nimproving the instrumented mediation of documentary films.\n","authors":["Samuel Gantier","Ève Givois","Bernard Jacquemin","Bouchra Atbane-El Houadi"],"pdf_url":"https://arxiv.org/pdf/2309.04184v1.pdf","comment":"in French language"},{"id":"http://arxiv.org/abs/2309.04182v1","updated":"2023-09-08T07:53:21Z","published":"2023-09-08T07:53:21Z","title":"A Long-Tail Friendly Representation Framework for Artist and Music\n Similarity","summary":" The investigation of the similarity between artists and music is crucial in\nmusic retrieval and recommendation, and addressing the challenge of the\nlong-tail phenomenon is increasingly important. This paper proposes a Long-Tail\nFriendly Representation Framework (LTFRF) that utilizes neural networks to\nmodel the similarity relationship. Our approach integrates music, user,\nmetadata, and relationship data into a unified metric learning framework, and\nemploys a meta-consistency relationship as a regularization term to introduce the\nMulti-Relationship Loss. Compared to the Graph Neural Network (GNN), our\nproposed framework improves the representation performance in long-tail\nscenarios, which are characterized by sparse relationships between artists and\nmusic. We conduct experiments and analysis on the AllMusic dataset, and the\nresults demonstrate that our framework provides a favorable generalization of\nartist and music representation. Specifically, on similar artist/music\nrecommendation tasks, the LTFRF outperforms the baseline by 9.69%/19.42% in Hit\nRatio@10, and in long-tail cases, the framework achieves results 11.05%/14.14% higher\nthan the baseline in Consistent@10.\n","authors":["Haoran Xiang","Junyu Dai","Xuchen Song","Furao Shen"],"pdf_url":"https://arxiv.org/pdf/2309.04182v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.04171v1","updated":"2023-09-08T07:37:15Z","published":"2023-09-08T07:37:15Z","title":"PRISTA-Net: Deep Iterative Shrinkage Thresholding Network for Coded\n Diffraction Patterns Phase Retrieval","summary":" The problem of phase retrieval (PR) involves recovering an unknown image from\nlimited amplitude measurement data and is a challenging nonlinear inverse problem\nin computational imaging and image processing. However, many PR methods\nare based either on black-box network models that lack interpretability or on\nplug-and-play (PnP) frameworks that are computationally complex and require\ncareful parameter tuning. To address this, we have developed PRISTA-Net, a deep\nunfolding network (DUN) based on the first-order iterative shrinkage\nthresholding algorithm (ISTA). This network utilizes a learnable nonlinear\ntransformation to address the proximal-point mapping sub-problem associated\nwith the sparse priors, and an attention mechanism to focus on phase\ninformation containing image edges, textures, and structures. 
Additionally, the\nfast Fourier transform (FFT) is used to learn global features to enhance local\ninformation, and the designed logarithmic-based loss function leads to\nsignificant improvements when the noise level is low. All parameters in the\nproposed PRISTA-Net framework, including the nonlinear transformation,\nthreshold parameters, and step size, are learned end-to-end instead of being\nmanually set. This method combines the interpretability of traditional methods\nwith the fast inference ability of deep learning and is able to handle noise at\neach iteration during the unfolding stage, thus improving recovery quality.\nExperiments on Coded Diffraction Patterns (CDPs) measurements demonstrate that\nour approach outperforms the existing state-of-the-art methods in terms of\nqualitative and quantitative evaluations. Our source codes are available at\n\\emph{https://github.com/liuaxou/PRISTA-Net}.\n","authors":["Aoxu Liu","Xiaohong Fan","Yin Yang","Jianping Zhang"],"pdf_url":"https://arxiv.org/pdf/2309.04171v1.pdf","comment":"12 pages"},{"id":"http://arxiv.org/abs/2309.03518v2","updated":"2023-09-08T03:56:01Z","published":"2023-09-07T06:58:34Z","title":"Learning Compact Compositional Embeddings via Regularized Pruning for\n Recommendation","summary":" Latent factor models are the dominant backbones of contemporary recommender\nsystems (RSs) given their performance advantages, where a unique vector\nembedding with a fixed dimensionality (e.g., 128) is required to represent each\nentity (commonly a user/item). Due to the large number of users and items on\ne-commerce sites, the embedding table is arguably the least memory-efficient\ncomponent of RSs. For any lightweight recommender that aims to efficiently\nscale with the growing size of users/items or to remain applicable in\nresource-constrained settings, existing solutions either reduce the number of\nembeddings needed via hashing, or sparsify the full embedding table to switch\noff selected embedding dimensions. However, as hash collision arises or\nembeddings become overly sparse, especially when adapting to a tighter memory\nbudget, those lightweight recommenders inevitably have to compromise their\naccuracy. To this end, we propose a novel compact embedding framework for RSs,\nnamely Compositional Embedding with Regularized Pruning (CERP). Specifically,\nCERP represents each entity by combining a pair of embeddings from two\nindependent, substantially smaller meta-embedding tables, which are then\njointly pruned via a learnable element-wise threshold. In addition, we\ninnovatively design a regularized pruning mechanism in CERP, such that the two\nsparsified meta-embedding tables are encouraged to encode information that is\nmutually complementary. Given the compatibility with agnostic latent factor\nmodels, we pair CERP with two popular recommendation models for extensive\nexperiments, where results on two real-world datasets under different memory\nbudgets demonstrate its superiority against state-of-the-art baselines. 
The\ncodebase of CERP is available at https://github.com/xurong-liang/CERP.\n","authors":["Xurong Liang","Tong Chen","Quoc Viet Hung Nguyen","Jianxin Li","Hongzhi Yin"],"pdf_url":"https://arxiv.org/pdf/2309.03518v2.pdf","comment":"Accepted by ICDM'23"},{"id":"http://arxiv.org/abs/2309.05671v1","updated":"2023-09-08T17:47:31Z","published":"2023-09-08T17:47:31Z","title":"tSPM+; a high-performance algorithm for mining transitive sequential\n patterns from clinical data","summary":" The increasing availability of large clinical datasets collected from\npatients can enable new avenues for computational characterization of complex\ndiseases using different analytic algorithms. One of the promising new methods\nfor extracting knowledge from large clinical datasets involves temporal pattern\nmining integrated with machine learning workflows. However, mining these\ntemporal patterns is a computationally intensive task and has significant memory\nrequirements. Current algorithms, such as the temporal sequence pattern mining\n(tSPM) algorithm, are already providing promising outcomes, but still leave\nroom for optimization. In this paper, we present the tSPM+ algorithm, a\nhigh-performance implementation of the tSPM algorithm, which adds a new\ndimension by incorporating the duration of the temporal patterns. We show that the\ntSPM+ algorithm provides a speed-up of up to a factor of 980 and an up to 48-fold\nimprovement in memory consumption. Moreover, we present a Docker container with\nan R package, and we provide vignettes for easy integration into existing\nmachine learning workflows. We use the mined temporal sequences to\nidentify post-COVID-19 patients and their symptoms according to the WHO\ndefinition.\n","authors":["Jonas Hügel","Ulrich Sax","Shawn N. Murphy","Hossein Estiri"],"pdf_url":"https://arxiv.org/pdf/2309.05671v1.pdf","comment":"Supplementary data: https://doi.org/10.5281/zenodo.8329519"},{"id":"http://arxiv.org/abs/2309.06375v1","updated":"2023-09-08T03:20:58Z","published":"2023-09-08T03:20:58Z","title":"Modeling Recommender Ecosystems: Research Challenges at the Intersection\n of Mechanism Design, Reinforcement Learning and Generative Models","summary":" Modern recommender systems lie at the heart of complex ecosystems that couple\nthe behavior of users, content providers, advertisers, and other actors.\nDespite this, the focus of the majority of recommender research -- and most\npractical recommenders of any import -- is on the local, myopic optimization of\nthe recommendations made to individual users. This comes at a significant cost\nto the long-term utility that recommenders could generate for their users. We\nargue that explicitly modeling the incentives and behaviors of all actors in\nthe system -- and the interactions among them induced by the recommender's\npolicy -- is strictly necessary if one is to maximize the value the system\nbrings to these actors and improve overall ecosystem \"health\". Doing so\nrequires: optimization over long horizons using techniques such as\nreinforcement learning; making inevitable tradeoffs in the utility that can be\ngenerated for different actors using the methods of social choice; reducing\ninformation asymmetry, while accounting for incentives and strategic behavior,\nusing the tools of mechanism design; better modeling of both user and\nitem-provider behaviors by incorporating notions from behavioral economics and\npsychology; and exploiting recent advances in generative and foundation models\nto make these mechanisms interpretable and actionable. 
We propose a conceptual\nframework that encompasses these elements, and articulate a number of research\nchallenges that emerge at the intersection of these different disciplines.\n","authors":["Craig Boutilier","Martin Mladenov","Guy Tennenholtz"],"pdf_url":"https://arxiv.org/pdf/2309.06375v1.pdf","comment":null}],"Machine Learning":[{"id":"http://arxiv.org/abs/2309.04470v1","updated":"2023-09-08T17:57:31Z","published":"2023-09-08T17:57:31Z","title":"On the Actionability of Outcome Prediction","summary":" Predicting future outcomes is a prevalent application of machine learning in\nsocial impact domains. Examples range from predicting student success in\neducation to predicting disease risk in healthcare. Practitioners recognize\nthat the ultimate goal is not just to predict but to act effectively.\nIncreasing evidence suggests that relying on outcome predictions for downstream\ninterventions may not have desired results.\n In most domains there exists a multitude of possible interventions for each\nindividual, making the challenge of taking effective action more acute. Even\nwhen causal mechanisms connecting the individual's latent states to outcomes is\nwell understood, in any given instance (a specific student or patient),\npractitioners still need to infer -- from budgeted measurements of latent\nstates -- which of many possible interventions will be most effective for this\nindividual. With this in mind, we ask: when are accurate predictors of outcomes\nhelpful for identifying the most suitable intervention?\n Through a simple model encompassing actions, latent states, and measurements,\nwe demonstrate that pure outcome prediction rarely results in the most\neffective policy for taking actions, even when combined with other\nmeasurements. We find that except in cases where there is a single decisive\naction for improving the outcome, outcome prediction never maximizes \"action\nvalue\", the utility of taking actions. Making measurements of actionable latent\nstates, where specific actions lead to desired outcomes, considerably enhances\nthe action value compared to outcome prediction, and the degree of improvement\ndepends on action costs and the outcome model. This analysis emphasizes the\nneed to go beyond generic outcome prediction in interventional settings by\nincorporating knowledge of plausible actions and latent states.\n","authors":["Lydia T. Liu","Solon Barocas","Jon Kleinberg","Karen Levy"],"pdf_url":"https://arxiv.org/pdf/2309.04470v1.pdf","comment":"14 pages, 3 figures"},{"id":"http://arxiv.org/abs/2309.04461v1","updated":"2023-09-08T17:49:44Z","published":"2023-09-08T17:49:44Z","title":"Measuring and Improving Chain-of-Thought Reasoning in Vision-Language\n Models","summary":" Vision-language models (VLMs) have recently demonstrated strong efficacy as\nvisual assistants that can parse natural queries about the visual content and\ngenerate human-like outputs. In this work, we explore the ability of these\nmodels to demonstrate human-like reasoning based on the perceived information.\nTo address a crucial concern regarding the extent to which their reasoning\ncapabilities are fully consistent and grounded, we also measure the reasoning\nconsistency of these models. We achieve this by proposing a chain-of-thought\n(CoT) based consistency measure. However, such an evaluation requires a\nbenchmark that encompasses both high-level inference and detailed reasoning\nchains, which is costly. 
We tackle this challenge by proposing a\nLLM-Human-in-the-Loop pipeline, which notably reduces cost while simultaneously\nensuring the generation of a high-quality dataset. Based on this pipeline and\nthe existing coarse-grained annotated dataset, we build the CURE benchmark to\nmeasure both the zero-shot reasoning performance and consistency of VLMs. We\nevaluate existing state-of-the-art VLMs, and find that even the best-performing\nmodel is unable to demonstrate strong visual reasoning capabilities and\nconsistency, indicating that substantial efforts are required to enable VLMs to\nperform visual reasoning as systematically and consistently as humans. As an\nearly step, we propose a two-stage training framework aimed at improving both\nthe reasoning performance and consistency of VLMs. The first stage involves\nemploying supervised fine-tuning of VLMs using step-by-step reasoning samples\nautomatically generated by LLMs. In the second stage, we further augment the\ntraining process by incorporating feedback provided by LLMs to produce\nreasoning chains that are highly consistent and grounded. We empirically\nhighlight the effectiveness of our framework in both reasoning performance and\nconsistency.\n","authors":["Yangyi Chen","Karan Sikka","Michael Cogswell","Heng Ji","Ajay Divakaran"],"pdf_url":"https://arxiv.org/pdf/2309.04461v1.pdf","comment":"The data is released at\n \\url{https://github.com/Yangyi-Chen/CoTConsistency}"},{"id":"http://arxiv.org/abs/2309.04459v1","updated":"2023-09-08T17:37:05Z","published":"2023-09-08T17:37:05Z","title":"Subwords as Skills: Tokenization for Sparse-Reward Reinforcement\n Learning","summary":" Exploration in sparse-reward reinforcement learning is difficult due to the\nrequirement of long, coordinated sequences of actions in order to achieve any\nreward. Moreover, in continuous action spaces there are an infinite number of\npossible actions, which only increases the difficulty of exploration. One class\nof methods designed to address these issues forms temporally extended actions,\noften called skills, from interaction data collected in the same domain, and\noptimizes a policy on top of this new action space. Typically such methods\nrequire a lengthy pretraining phase, especially in continuous action spaces, in\norder to form the skills before reinforcement learning can begin. Given prior\nevidence that the full range of the continuous action space is not required in\nsuch tasks, we propose a novel approach to skill-generation with two\ncomponents. First we discretize the action space through clustering, and second\nwe leverage a tokenization technique borrowed from natural language processing\nto generate temporally extended actions. Such a method outperforms baselines\nfor skill-generation in several challenging sparse-reward domains, and requires\norders-of-magnitude less computation in skill-generation and online rollouts.\n","authors":["David Yunis","Justin Jung","Falcon Dai","Matthew Walter"],"pdf_url":"https://arxiv.org/pdf/2309.04459v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2304.00163v2","updated":"2023-09-08T17:33:12Z","published":"2023-03-31T22:50:47Z","title":"Soft-Bellman Equilibrium in Affine Markov Games: Forward Solutions and\n Inverse Learning","summary":" Markov games model interactions among multiple players in a stochastic,\ndynamic environment. Each player in a Markov game maximizes its expected total\ndiscounted reward, which depends upon the policies of the other players. 
We\nformulate a class of Markov games, termed affine Markov games, where an affine\nreward function couples the players' actions. We introduce a novel solution\nconcept, the soft-Bellman equilibrium, where each player is boundedly rational\nand chooses a soft-Bellman policy rather than a purely rational policy as in\nthe well-known Nash equilibrium concept. We provide conditions for the\nexistence and uniqueness of the soft-Bellman equilibrium and propose a\nnonlinear least-squares algorithm to compute such an equilibrium in the forward\nproblem. We then solve the inverse game problem of inferring the players'\nreward parameters from observed state-action trajectories via a\nprojected-gradient algorithm. Experiments in a predator-prey OpenAI Gym\nenvironment show that the reward parameters inferred by the proposed algorithm\noutperform those inferred by a baseline algorithm: they reduce the\nKullback-Leibler divergence between the equilibrium policies and observed\npolicies by at least two orders of magnitude.\n","authors":["Shenghui Chen","Yue Yu","David Fridovich-Keil","Ufuk Topcu"],"pdf_url":"https://arxiv.org/pdf/2304.00163v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.04452v1","updated":"2023-09-08T17:20:51Z","published":"2023-09-08T17:20:51Z","title":"Postprocessing of Ensemble Weather Forecasts Using Permutation-invariant\n Neural Networks","summary":" Statistical postprocessing is used to translate ensembles of raw numerical\nweather forecasts into reliable probabilistic forecast distributions. In this\nstudy, we examine the use of permutation-invariant neural networks for this\ntask. In contrast to previous approaches, which often operate on ensemble\nsummary statistics and dismiss details of the ensemble distribution, we propose\nnetworks which treat forecast ensembles as a set of unordered member forecasts\nand learn link functions that are by design invariant to permutations of the\nmember ordering. We evaluate the quality of the obtained forecast distributions\nin terms of calibration and sharpness, and compare the models against classical\nand neural network-based benchmark methods. In case studies addressing the\npostprocessing of surface temperature and wind gust forecasts, we demonstrate\nstate-of-the-art prediction quality. To deepen the understanding of the learned\ninference process, we further propose a permutation-based importance analysis\nfor ensemble-valued predictors, which highlights specific aspects of the\nensemble forecast that are considered important by the trained postprocessing\nmodels. Our results suggest that most of the relevant information is contained\nin few ensemble-internal degrees of freedom, which may impact the design of\nfuture ensemble forecasting and postprocessing systems.\n","authors":["Kevin Höhlein","Benedikt Schulz","Rüdiger Westermann","Sebastian Lerch"],"pdf_url":"https://arxiv.org/pdf/2309.04452v1.pdf","comment":"Submitted to Artificial Intelligence for the Earth Systems"},{"id":"http://arxiv.org/abs/2309.01825v2","updated":"2023-09-08T17:06:36Z","published":"2023-09-04T21:30:15Z","title":"LoopTune: Optimizing Tensor Computations with Reinforcement Learning","summary":" Advanced compiler technology is crucial for enabling machine learning\napplications to run on novel hardware, but traditional compilers fail to\ndeliver performance, popular auto-tuners have long search times and\nexpert-optimized libraries introduce unsustainable costs. 
To address this, we\ndeveloped LoopTune, a deep reinforcement learning compiler that optimizes\ntensor computations in deep learning models for the CPU. LoopTune optimizes\ntensor traversal order while using the ultra-fast lightweight code generator\nLoopNest to perform hardware-specific optimizations. With a novel graph-based\nrepresentation and action space, LoopTune speeds up LoopNest by 3.2x,\ngenerating an order of magnitude faster code than TVM, 2.8x faster than\nMetaSchedule, and 1.08x faster than AutoTVM, consistently performing at the\nlevel of the hand-tuned library Numpy. Moreover, LoopTune tunes code in order\nof seconds.\n","authors":["Dejan Grubisic","Bram Wasti","Chris Cummins","John Mellor-Crummey","Aleksandar Zlateski"],"pdf_url":"https://arxiv.org/pdf/2309.01825v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.04433v1","updated":"2023-09-08T16:55:23Z","published":"2023-09-08T16:55:23Z","title":"Variations and Relaxations of Normalizing Flows","summary":" Normalizing Flows (NFs) describe a class of models that express a complex\ntarget distribution as the composition of a series of bijective transformations\nover a simpler base distribution. By limiting the space of candidate\ntransformations to diffeomorphisms, NFs enjoy efficient, exact sampling and\ndensity evaluation, enabling NFs to flexibly behave as both discriminative and\ngenerative models. Their restriction to diffeomorphisms, however, enforces that\ninput, output and all intermediary spaces share the same dimension, limiting\ntheir ability to effectively represent target distributions with complex\ntopologies. Additionally, in cases where the prior and target distributions are\nnot homeomorphic, Normalizing Flows can leak mass outside of the support of the\ntarget. This survey covers a selection of recent works that combine aspects of\nother generative model classes, such as VAEs and score-based diffusion, and in\ndoing so loosen the strict bijectivity constraints of NFs to achieve a balance\nof expressivity, training speed, sample efficiency and likelihood tractability.\n","authors":["Keegan Kelly","Lorena Piedras","Sukrit Rao","David Roth"],"pdf_url":"https://arxiv.org/pdf/2309.04433v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.04428v1","updated":"2023-09-08T16:41:26Z","published":"2023-09-08T16:41:26Z","title":"Soft Quantization using Entropic Regularization","summary":" The quantization problem aims to find the best possible approximation of\nprobability measures on ${\\mathbb{R}}^d$ using finite, discrete measures. The\nWasserstein distance is a typical choice to measure the quality of the\napproximation. This contribution investigates the properties and robustness of\nthe entropy-regularized quantization problem, which relaxes the standard\nquantization problem. The proposed approximation technique naturally adopts the\nsoftmin function, which is well known for its robustness in terms of\ntheoretical and practicability standpoints. Moreover, we use the\nentropy-regularized Wasserstein distance to evaluate the quality of the soft\nquantization problem's approximation, and we implement a stochastic gradient\napproach to achieve the optimal solutions. The control parameter in our\nproposed method allows for the adjustment of the optimization problem's\ndifficulty level, providing significant advantages when dealing with\nexceptionally challenging problems of interest. 
In addition, this contribution\nempirically illustrates the performance of the method in various settings.\n","authors":["Rajmadan Lakshmanan","Alois Pichler"],"pdf_url":"https://arxiv.org/pdf/2309.04428v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.04427v1","updated":"2023-09-08T16:41:25Z","published":"2023-09-08T16:41:25Z","title":"Robust Representation Learning for Privacy-Preserving Machine Learning:\n A Multi-Objective Autoencoder Approach","summary":" Several domains increasingly rely on machine learning in their applications.\nThe resulting heavy dependence on data has led to the emergence of various laws\nand regulations around data ethics and privacy and growing awareness of the\nneed for privacy-preserving machine learning (ppML). Current ppML techniques\nutilize methods that are either purely based on cryptography, such as\nhomomorphic encryption, or that introduce noise into the input, such as\ndifferential privacy. The main criticism of those techniques is\nthat they either are too slow or trade off a model's performance for\nimproved confidentiality. To address this performance reduction, we aim to\nleverage robust representation learning as a way of encoding our data while\noptimizing the privacy-utility trade-off. Our method centers on training\nautoencoders in a multi-objective manner and then concatenating the latent and\nlearned features from the encoding part as the encoded form of our data. Such a\ndeep learning-powered encoding can then safely be sent to a third party for\nintensive training and hyperparameter tuning. With our proposed framework, we\ncan share our data and use third-party tools without being under the threat of\nrevealing its original form. We empirically validate our results in unimodal\nand multimodal settings, the latter following a vertical splitting scheme, and\nshow improved performance over the state of the art.\n","authors":["Sofiane Ouaari","Ali Burak Ünal","Mete Akgün","Nico Pfeifer"],"pdf_url":"https://arxiv.org/pdf/2309.04427v1.pdf","comment":null},{"id":"http://arxiv.org/abs/1911.00567v7","updated":"2023-09-08T16:34:55Z","published":"2019-11-01T19:48:57Z","title":"Frequentist Regret Bounds for Randomized Least-Squares Value Iteration","summary":" We consider the exploration-exploitation dilemma in finite-horizon\nreinforcement learning (RL). When the state space is large or continuous,\ntraditional tabular approaches are infeasible and some form of function\napproximation is mandatory. In this paper, we introduce an\noptimistically-initialized variant of the popular randomized least-squares\nvalue iteration (RLSVI), a model-free algorithm where exploration is induced by\nperturbing the least-squares approximation of the action-value function. Under\nthe assumption that the Markov decision process has low-rank transition\ndynamics, we prove that the frequentist regret of RLSVI is upper-bounded by\n$\\widetilde O(d^2 H^2 \\sqrt{T})$, where $d$ is the feature dimension, $H$\nis the horizon, and $T$ is the total number of steps. 
To the best of our\nknowledge, this is the first frequentist regret analysis for randomized\nexploration with function approximation.\n","authors":["Andrea Zanette","David Brandfonbrener","Emma Brunskill","Matteo Pirotta","Alessandro Lazaric"],"pdf_url":"https://arxiv.org/pdf/1911.00567v7.pdf","comment":"Minor bug fixes"},{"id":"http://arxiv.org/abs/2309.04420v1","updated":"2023-09-08T16:32:47Z","published":"2023-09-08T16:32:47Z","title":"Parallel and Limited Data Voice Conversion Using Stochastic Variational\n Deep Kernel Learning","summary":" Typically, voice conversion is regarded as an engineering problem with\nlimited training data. The reliance on massive amounts of data hinders the\npractical applicability of deep learning approaches, which have been\nextensively researched in recent years. On the other hand, statistical methods\nare effective with limited data but have difficulties in modelling complex\nmapping functions. This paper proposes a voice conversion method that works\nwith limited data and is based on stochastic variational deep kernel learning\n(SVDKL). At the same time, SVDKL enables the use of deep neural networks'\nexpressive capability as well as the high flexibility of the Gaussian process\nas a Bayesian and non-parametric method. When the conventional kernel is\ncombined with the deep neural network, it is possible to estimate non-smooth\nand more complex functions. Furthermore, the model's sparse variational\nGaussian process solves the scalability problem and, unlike the exact Gaussian\nprocess, allows for the learning of a global mapping function for the entire\nacoustic space. One of the most important aspects of the proposed scheme is\nthat the model parameters are trained using marginal likelihood optimization,\nwhich considers both data fitting and model complexity. Considering the\ncomplexity of the model reduces the amount of training data by increasing the\nresistance to overfitting. To evaluate the proposed scheme, we examined the\nmodel's performance with approximately 80 seconds of training data. The results\nindicated that our method obtained a higher mean opinion score, smaller\nspectral distortion, and better preference tests than the compared methods.\n","authors":["Mohamadreza Jafaryani","Hamid Sheikhzadeh","Vahid Pourahmadi"],"pdf_url":"https://arxiv.org/pdf/2309.04420v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.04382v1","updated":"2023-09-08T15:24:55Z","published":"2023-09-08T15:24:55Z","title":"Emergent learning in physical systems as feedback-based aging in a\n glassy landscape","summary":" By training linear physical networks to learn linear transformations, we\ndiscern how their physical properties evolve due to weight update rules. Our\nfindings highlight a striking similarity between the learning behaviors of such\nnetworks and the processes of aging and memory formation in disordered and\nglassy systems. We show that the learning dynamics resembles an aging process,\nwhere the system relaxes in response to repeated application of the feedback\nboundary forces in presence of an input force, thus encoding a memory of the\ninput-output relationship. With this relaxation comes an increase in the\ncorrelation length, which is indicated by the two-point correlation function\nfor the components of the network. We also observe that the square root of the\nmean-squared error as a function of epoch takes on a non-exponential form,\nwhich is a typical feature of glassy systems. 
This physical interpretation\nsuggests that by encoding more detailed information into input and feedback\nboundary forces, the process of emergent learning can be rather ubiquitous and,\nthus, serve as a very early physical mechanism, from an evolutionary\nstandpoint, for learning in biological systems.\n","authors":["Vidyesh Rao Anisetti","Ananth Kandala","J. M. Schwarz"],"pdf_url":"https://arxiv.org/pdf/2309.04382v1.pdf","comment":"11 pages, 7 figures"},{"id":"http://arxiv.org/abs/2309.04381v1","updated":"2023-09-08T15:23:40Z","published":"2023-09-08T15:23:40Z","title":"Generalization Bounds: Perspectives from Information Theory and\n PAC-Bayes","summary":" A fundamental question in theoretical machine learning is generalization.\nOver the past decades, the PAC-Bayesian approach has been established as a\nflexible framework to address the generalization capabilities of machine\nlearning algorithms, and design new ones. Recently, it has garnered increased\ninterest due to its potential applicability for a variety of learning\nalgorithms, including deep neural networks. In parallel, an\ninformation-theoretic view of generalization has developed, wherein the\nrelation between generalization and various information measures has been\nestablished. This framework is intimately connected to the PAC-Bayesian\napproach, and a number of results have been independently discovered in both\nstrands. In this monograph, we highlight this strong connection and present a\nunified treatment of generalization. We present techniques and results that the\ntwo perspectives have in common, and discuss the approaches and interpretations\nthat differ. In particular, we demonstrate how many proofs in the area share a\nmodular structure, through which the underlying ideas can be intuited. We pay\nspecial attention to the conditional mutual information (CMI) framework;\nanalytical studies of the information complexity of learning algorithms; and\nthe application of the proposed methods to deep learning. This monograph is\nintended to provide a comprehensive introduction to information-theoretic\ngeneralization bounds and their connection to PAC-Bayes, serving as a\nfoundation from which the most recent developments are accessible. It is aimed\nbroadly towards researchers with an interest in generalization and theoretical\nmachine learning.\n","authors":["Fredrik Hellström","Giuseppe Durisi","Benjamin Guedj","Maxim Raginsky"],"pdf_url":"https://arxiv.org/pdf/2309.04381v1.pdf","comment":"222 pages"},{"id":"http://arxiv.org/abs/2309.04370v1","updated":"2023-09-08T15:02:46Z","published":"2023-09-08T15:02:46Z","title":"Seeing-Eye Quadruped Navigation with Force Responsive Locomotion Control","summary":" Seeing-eye robots are very useful tools for guiding visually impaired people,\npotentially producing a huge societal impact given the low availability and\nhigh cost of real guide dogs. Although a few seeing-eye robot systems have\nalready been demonstrated, none considered external tugs from humans, which\nfrequently occur in a real guide dog setting. In this paper, we simultaneously\ntrain a locomotion controller that is robust to external tugging forces via\nReinforcement Learning (RL), and an external force estimator via supervised\nlearning. The controller ensures stable walking, and the force estimator\nenables the robot to respond to the external forces from the human. 
These\nforces are used to guide the robot to the global goal, which is unknown to the\nrobot, while the robot guides the human around nearby obstacles via a local\nplanner. Experimental results in simulation and on hardware show that our\ncontroller is robust to external forces, and our seeing-eye system can\naccurately detect force direction. We demonstrate our full seeing-eye robot\nsystem on a real quadruped robot with a blindfolded human. The video can be\nseen at our project page: https://bu-air-lab.github.io/guide_dog/\n","authors":["David DeFazio","Eisuke Hirota","Shiqi Zhang"],"pdf_url":"https://arxiv.org/pdf/2309.04370v1.pdf","comment":"Accepted to CoRL 2023"},{"id":"http://arxiv.org/abs/2306.01726v3","updated":"2023-09-08T14:56:36Z","published":"2023-06-02T17:52:59Z","title":"Streaming algorithms for evaluating noisy judges on unlabeled data --\n binary classification","summary":" The evaluation of noisy binary classifiers on unlabeled data is treated as a\nstreaming task: given a data sketch of the decisions by an ensemble, estimate\nthe true prevalence of the labels as well as each classifier's accuracy on\nthem. Two fully algebraic evaluators are constructed to do this. Both are based\non the assumption that the classifiers make independent errors. The first is\nbased on majority voting. The second, the main contribution of the paper, is\nguaranteed to be correct. But how do we know the classifiers are independent on\nany given test? This principal/agent monitoring paradox is ameliorated by\nexploiting the failures of the independent evaluator to return sensible\nestimates. A search for nearly error independent trios is empirically carried\nout on the \\texttt{adult}, \\texttt{mushroom}, and \\texttt{two-norm} datasets by\nusing the algebraic failure modes to reject evaluation ensembles as too\ncorrelated. The searches are refined by constructing a surface in evaluation\nspace that contains the true value point. The algebra of arbitrarily correlated\nclassifiers permits the selection of a polynomial subset free of any\ncorrelation variables. Candidate evaluation ensembles are rejected if their\ndata sketches produce independent estimates too far from the constructed\nsurface. The results produced by the surviving ensembles can sometimes be as\ngood as 1\\%. But handling even small amounts of correlation remains a\nchallenge. A Taylor expansion of the estimates produced when independence is\nassumed but the classifiers are, in fact, slightly correlated helps clarify how\nthe independent evaluator has algebraic `blind spots'.\n","authors":["Andrés Corrada-Emmanuel"],"pdf_url":"https://arxiv.org/pdf/2306.01726v3.pdf","comment":"25 pages, 5 figures. Added extensive discussion about the Platanios\n agreement equations and how the independent solution from them is not correct"},{"id":"http://arxiv.org/abs/2309.04367v1","updated":"2023-09-08T14:56:22Z","published":"2023-09-08T14:56:22Z","title":"Active Learning for Classifying 2D Grid-Based Level Completability","summary":" Determining the completability of levels generated by procedural generators\nsuch as machine learning models can be challenging, as it can involve the use\nof solver agents that often require a significant amount of time to analyze and\nsolve levels. Active learning is not yet widely adopted in game evaluations,\nalthough it has been used successfully in natural language processing, image\nand speech recognition, and computer vision, where the availability of labeled\ndata is limited or expensive. 
In this paper, we propose the use of active\nlearning for learning level completability classification. Through an active\nlearning approach, we train deep-learning models to classify the completability\nof generated levels for Super Mario Bros., Kid Icarus, and a Zelda-like game.\nWe compare active learning for querying levels to label with completability\nagainst random queries. Our results show using an active learning approach to\nlabel levels results in better classifier performance with the same amount of\nlabeled data.\n","authors":["Mahsa Bazzaz","Seth Cooper"],"pdf_url":"https://arxiv.org/pdf/2309.04367v1.pdf","comment":"4 pages, 3 figures"},{"id":"http://arxiv.org/abs/2308.02060v2","updated":"2023-09-08T14:45:48Z","published":"2023-08-03T21:49:14Z","title":"Accurate Neural Network Pruning Requires Rethinking Sparse Optimization","summary":" Obtaining versions of deep neural networks that are both highly-accurate and\nhighly-sparse is one of the main challenges in the area of model compression,\nand several high-performance pruning techniques have been investigated by the\ncommunity. Yet, much less is known about the interaction between sparsity and\nthe standard stochastic optimization techniques used for training sparse\nnetworks, and most existing work uses standard dense schedules and\nhyperparameters for training sparse networks. In this work, we examine the\nimpact of high sparsity on model training using the standard computer vision\nand natural language processing sparsity benchmarks. We begin by showing that\nusing standard dense training recipes for sparse training is suboptimal, and\nresults in under-training. We provide new approaches for mitigating this issue\nfor both sparse pre-training of vision models (e.g. ResNet50/ImageNet) and\nsparse fine-tuning of language models (e.g. BERT/GLUE), achieving\nstate-of-the-art results in both settings in the high-sparsity regime, and\nproviding detailed analyses for the difficulty of sparse training in both\nscenarios. Our work sets a new threshold in terms of the accuracies that can be\nachieved under high sparsity, and should inspire further research into\nimproving sparse model training, to reach higher accuracies under high\nsparsity, but also to do so efficiently.\n","authors":["Denis Kuznedelev","Eldar Kurtic","Eugenia Iofinova","Elias Frantar","Alexandra Peste","Dan Alistarh"],"pdf_url":"https://arxiv.org/pdf/2308.02060v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.04361v1","updated":"2023-09-08T14:41:21Z","published":"2023-09-08T14:41:21Z","title":"Learning from Power Signals: An Automated Approach to Electrical\n Disturbance Identification Within a Power Transmission System","summary":" As power quality becomes a higher priority in the electric utility industry,\nthe amount of disturbance event data continues to grow. Utilities do not have\nthe required personnel to analyze each event by hand. This work presents an\nautomated approach for analyzing power quality events recorded by digital fault\nrecorders and power quality monitors operating within a power transmission\nsystem. The automated approach leverages rule-based analytics to examine the\ntime and frequency domain characteristics of the voltage and current signals.\nCustomizable thresholds are set to categorize each disturbance event. The\nevents analyzed within this work include various faults, motor starting, and\nincipient instrument transformer failure. Analytics for fourteen different\nevent types have been developed. 
The analytics were tested on 160 signal files\nand yielded an accuracy of ninety-nine percent. Continuous, nominal signal data\nanalysis is performed using an approach coined as the cyclic histogram. The\ncyclic histogram process will be integrated into the digital fault recorders\nthemselves to facilitate the detection of subtle signal variations that are too\nsmall to trigger a disturbance event and that can occur over hours or days. In\naddition to reducing memory requirements by a factor of 320, it is anticipated\nthat cyclic histogram processing will aid in identifying incipient events and\nidentifiers. This project is expected to save engineers time by automating the\nclassification of disturbance events and increase the reliability of the\ntransmission system by providing near real time detection and identification of\ndisturbances as well as prevention of problems before they occur.\n","authors":["Jonathan D. Boyd","Joshua H. Tyler","Anthony M. Murphy","Donald R. Reising"],"pdf_url":"https://arxiv.org/pdf/2309.04361v1.pdf","comment":"18 pages"},{"id":"http://arxiv.org/abs/2307.05551v2","updated":"2023-09-08T14:25:40Z","published":"2023-07-09T09:08:38Z","title":"Graph Neural Network-enabled Terahertz-based Flow-guided Nanoscale\n Localization","summary":" Scientific advancements in nanotechnology and advanced materials are paving\nthe way toward nanoscale devices for in-body precision medicine; comprising\nintegrated sensing, computing, communication, data and energy storage\ncapabilities. In the human cardiovascular system, such devices are envisioned\nto be passively flowing and continuously sensing for detecting events of\ndiagnostic interest. The diagnostic value of detecting such events can be\nenhanced by assigning to them their physical locations (e.g., body region),\nwhich is the main proposition of flow-guided localization. Current flow-guided\nlocalization approaches suffer from low localization accuracy and they are\nby-design unable to localize events within the entire cardiovascular system.\nToward addressing this issue, we propose the utilization of Graph Neural\nNetworks (GNNs) for this purpose, and demonstrate localization accuracy and\ncoverage enhancements of our proposal over the existing State of the Art (SotA)\napproaches. Based on our evaluation, we provide several design guidelines for\nGNN-enabled flow-guided localization.\n","authors":["Gerard Calvo Bartra","Filip Lemic","Jakob Struye","Sergi Abadal","Xavier Costa Perez"],"pdf_url":"https://arxiv.org/pdf/2307.05551v2.pdf","comment":"6 pages, 5 figures, 1 table, 15 references. arXiv admin note: text\n overlap with arXiv:2305.18493"},{"id":"http://arxiv.org/abs/2309.04355v1","updated":"2023-09-08T14:24:40Z","published":"2023-09-08T14:24:40Z","title":"Value-Compressed Sparse Column (VCSC): Sparse Matrix Storage for\n Redundant Data","summary":" Compressed Sparse Column (CSC) and Coordinate (COO) are popular compression\nformats for sparse matrices. However, both CSC and COO are general purpose and\ncannot take advantage of any of the properties of the data other than sparsity,\nsuch as data redundancy. Highly redundant sparse data is common in many machine\nlearning applications, such as genomics, and is often too large for in-core\ncomputation using conventional sparse storage formats. In this paper, we\npresent two extensions to CSC: (1) Value-Compressed Sparse Column (VCSC) and\n(2) Index- and Value-Compressed Sparse Column (IVCSC). 
VCSC takes advantage of\nhigh redundancy within a column to further compress data up to 3-fold over COO\nand 2.25-fold over CSC, without significant negative impact to performance\ncharacteristics. IVCSC extends VCSC by compressing index arrays through delta\nencoding and byte-packing, achieving a 10-fold decrease in memory usage over\nCOO and 7.5-fold decrease over CSC. Our benchmarks on simulated and real data\nshow that VCSC and IVCSC can be read in compressed form with little added\ncomputational cost. These two novel compression formats offer a broadly useful\nsolution to encoding and reading redundant sparse data.\n","authors":["Skyler Ruiter","Seth Wolfgang","Marc Tunnell","Timothy Triche Jr.","Erin Carrier","Zachary DeBruine"],"pdf_url":"https://arxiv.org/pdf/2309.04355v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.04354v1","updated":"2023-09-08T14:24:10Z","published":"2023-09-08T14:24:10Z","title":"Mobile V-MoEs: Scaling Down Vision Transformers via Sparse\n Mixture-of-Experts","summary":" Sparse Mixture-of-Experts models (MoEs) have recently gained popularity due\nto their ability to decouple model size from inference efficiency by only\nactivating a small subset of the model parameters for any given input token. As\nsuch, sparse MoEs have enabled unprecedented scalability, resulting in\ntremendous successes across domains such as natural language processing and\ncomputer vision. In this work, we instead explore the use of sparse MoEs to\nscale-down Vision Transformers (ViTs) to make them more attractive for\nresource-constrained vision applications. To this end, we propose a simplified\nand mobile-friendly MoE design where entire images rather than individual\npatches are routed to the experts. We also propose a stable MoE training\nprocedure that uses super-class information to guide the router. We empirically\nshow that our sparse Mobile Vision MoEs (V-MoEs) can achieve a better trade-off\nbetween performance and efficiency than the corresponding dense ViTs. For\nexample, for the ViT-Tiny model, our Mobile V-MoE outperforms its dense\ncounterpart by 3.39% on ImageNet-1k. For an even smaller ViT variant with only\n54M FLOPs inference cost, our MoE achieves an improvement of 4.66%.\n","authors":["Erik Daxberger","Floris Weers","Bowen Zhang","Tom Gunter","Ruoming Pang","Marcin Eichner","Michael Emmersberger","Yinfei Yang","Alexander Toshev","Xianzhi Du"],"pdf_url":"https://arxiv.org/pdf/2309.04354v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.04344v1","updated":"2023-09-08T14:15:47Z","published":"2023-09-08T14:15:47Z","title":"Zero-Shot Robustification of Zero-Shot Models With Foundation Models","summary":" Zero-shot inference is a powerful paradigm that enables the use of large\npretrained models for downstream classification tasks without further training.\nHowever, these models are vulnerable to inherited biases that can impact their\nperformance. The traditional solution is fine-tuning, but this undermines the\nkey advantage of pretrained models, which is their ability to be used\nout-of-the-box. We propose RoboShot, a method that improves the robustness of\npretrained model embeddings in a fully zero-shot fashion. First, we use\nzero-shot language models (LMs) to obtain useful insights from task\ndescriptions. These insights are embedded and used to remove harmful and boost\nuseful components in embeddings -- without any supervision. 
Theoretically, we\nprovide a simple and tractable model for biases in zero-shot embeddings and\ngive a result characterizing under what conditions our approach can boost\nperformance. Empirically, we evaluate RoboShot on nine image and NLP\nclassification tasks and show an average improvement of 15.98% over several\nzero-shot baselines. Additionally, we demonstrate that RoboShot is compatible\nwith a variety of pretrained and language models.\n","authors":["Dyah Adila","Changho Shin","Linrong Cai","Frederic Sala"],"pdf_url":"https://arxiv.org/pdf/2309.04344v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.04339v1","updated":"2023-09-08T14:08:19Z","published":"2023-09-08T14:08:19Z","title":"Online Submodular Maximization via Online Convex Optimization","summary":" We study monotone submodular maximization under general matroid constraints\nin the online setting. We prove that online optimization of a large class of\nsubmodular functions, namely, weighted threshold potential functions, reduces\nto online convex optimization (OCO). This is precisely because functions in\nthis class admit a concave relaxation; as a result, OCO policies, coupled with\nan appropriate rounding scheme, can be used to achieve sublinear regret in the\ncombinatorial setting. We show that our reduction extends to many different\nversions of the online learning problem, including the dynamic regret, bandit,\nand optimistic-learning settings.\n","authors":["T. Si-Salem","G. Özcan","I. Nikolaou","E. Terzi","S. Ioannidis"],"pdf_url":"https://arxiv.org/pdf/2309.04339v1.pdf","comment":"Under review"},{"id":"http://arxiv.org/abs/2309.04333v1","updated":"2023-09-08T14:00:29Z","published":"2023-09-08T14:00:29Z","title":"Encoding Multi-Domain Scientific Papers by Ensembling Multiple CLS\n Tokens","summary":" Many useful tasks on scientific documents, such as topic classification and\ncitation prediction, involve corpora that span multiple scientific domains.\nTypically, such tasks are accomplished by representing the text with a vector\nembedding obtained from a Transformer's single CLS token. In this paper, we\nargue that using multiple CLS tokens could make a Transformer better specialize\nto multiple scientific domains. We present Multi2SPE: it encourages each of\nmultiple CLS tokens to learn diverse ways of aggregating token embeddings, then\nsums them up together to create a single vector representation. We also propose\nour new multi-domain benchmark, Multi-SciDocs, to test scientific paper vector\nencoders under multi-domain settings. We show that Multi2SPE reduces error by\nup to 25 percent in multi-domain citation prediction, while requiring only a\nnegligible amount of computation in addition to one BERT forward pass.\n","authors":["Ronald Seoh","Haw-Shiuan Chang","Andrew McCallum"],"pdf_url":"https://arxiv.org/pdf/2309.04333v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.04332v1","updated":"2023-09-08T13:59:18Z","published":"2023-09-08T13:59:18Z","title":"Graph Neural Networks Use Graphs When They Shouldn't","summary":" Predictions over graphs play a crucial role in various domains, including\nsocial networks, molecular biology, medicine, and more. Graph Neural Networks\n(GNNs) have emerged as the dominant approach for learning on graph data.\nInstances of graph labeling problems consist of the graph-structure (i.e., the\nadjacency matrix), along with node-specific feature vectors. In some cases,\nthis graph-structure is non-informative for the predictive task. 
For instance,\nmolecular properties such as molar mass depend solely on the constituent atoms\n(node features), and not on the molecular structure. While GNNs have the\nability to ignore the graph-structure in such cases, it is not clear that they\nwill. In this work, we show that GNNs actually tend to overfit the\ngraph-structure in the sense that they use it even when a better solution can\nbe obtained by ignoring it. We examine this phenomenon with respect to\ndifferent graph distributions and find that regular graphs are more robust to\nthis overfitting. We then provide a theoretical explanation for this\nphenomenon, via analyzing the implicit bias of gradient-descent-based learning\nof GNNs in this setting. Finally, based on our empirical and theoretical\nfindings, we propose a graph-editing method to mitigate the tendency of GNNs to\noverfit graph-structures that should be ignored. We show that this method\nindeed improves the accuracy of GNNs across multiple benchmarks.\n","authors":["Maya Bechler-Speicher","Ido Amos","Ran Gilad-Bachrach","Amir Globerson"],"pdf_url":"https://arxiv.org/pdf/2309.04332v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2207.02760v4","updated":"2023-09-08T13:59:01Z","published":"2022-07-06T15:53:17Z","title":"TREE-G: Decision Trees Contesting Graph Neural Networks","summary":" When dealing with tabular data, models based on decision trees are a popular\nchoice due to their high accuracy on these data types, their ease of\napplication, and explainability properties. However, when it comes to\ngraph-structured data, it is not clear how to apply them effectively, in a way\nthat incorporates the topological information with the tabular data available\non the vertices of the graph. To address this challenge, we introduce TREE-G.\nTREE-G modifies standard decision trees, by introducing a novel split function\nthat is specialized for graph data. Not only does this split function\nincorporate the node features and the topological information, but it also uses\na novel pointer mechanism that allows split nodes to use information computed\nin previous splits. Therefore, the split function adapts to the predictive task\nand the graph at hand. We analyze the theoretical properties of TREE-G and\ndemonstrate its benefits empirically on multiple graph and vertex prediction\nbenchmarks. In these experiments, TREE-G consistently outperforms other\ntree-based models and often outperforms other graph-learning algorithms such as\nGraph Neural Networks (GNNs) and Graph Kernels, sometimes by large margins.\nMoreover, TREE-Gs models and their predictions can be explained and visualized\n","authors":["Maya Bechler-Speicher","Amir Globerson","Ran Gilad-Bachrach"],"pdf_url":"https://arxiv.org/pdf/2207.02760v4.pdf","comment":null},{"id":"http://arxiv.org/abs/2209.12693v3","updated":"2023-09-08T13:38:27Z","published":"2022-09-23T11:37:02Z","title":"Leveraging the Potential of Novel Data in Power Line Communication of\n Electricity Grids","summary":" Electricity grids have become an essential part of daily life, even if they\nare often not noticed in everyday life. We usually only become particularly\naware of this dependence by the time the electricity grid is no longer\navailable. However, significant changes, such as the transition to renewable\nenergy (photovoltaic, wind turbines, etc.) and an increasing number of energy\nconsumers with complex load profiles (electric vehicles, home battery systems,\netc.), pose new challenges for the electricity grid. 
To address these\nchallenges, we propose two first-of-its-kind datasets based on measurements in\na broadband powerline communications (PLC) infrastructure. Both datasets FiN-1\nand FiN-2, were collected during real practical use in a part of the German\nlow-voltage grid that supplies around 4.4 million people and show more than 13\nbillion datapoints collected by more than 5100 sensors. In addition, we present\ndifferent use cases in asset management, grid state visualization, forecasting,\npredictive maintenance, and novelty detection to highlight the benefits of\nthese types of data. For these applications, we particularly highlight the use\nof novel machine learning architectures to extract rich information from\nreal-world data that cannot be captured using traditional approaches. By\npublishing the first large-scale real-world dataset, we aim to shed light on\nthe previously largely unrecognized potential of PLC data and emphasize\nmachine-learning-based research in low-voltage distribution networks by\npresenting a variety of different use cases.\n","authors":["Christoph Balada","Max Bondorf","Sheraz Ahmed","Andreas Dengela","Markus Zdrallek"],"pdf_url":"https://arxiv.org/pdf/2209.12693v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.04318v1","updated":"2023-09-08T13:31:06Z","published":"2023-09-08T13:31:06Z","title":"Generating the Ground Truth: Synthetic Data for Label Noise Research","summary":" Most real-world classification tasks suffer from label noise to some extent.\nSuch noise in the data adversely affects the generalization error of learned\nmodels and complicates the evaluation of noise-handling methods, as their\nperformance cannot be accurately measured without clean labels. In label noise\nresearch, typically either noisy or incomplex simulated data are accepted as a\nbaseline, into which additional noise with known properties is injected. In\nthis paper, we propose SYNLABEL, a framework that aims to improve upon the\naforementioned methodologies. It allows for creating a noiseless dataset\ninformed by real data, by either pre-specifying or learning a function and\ndefining it as the ground truth function from which labels are generated.\nFurthermore, by resampling a number of values for selected features in the\nfunction domain, evaluating the function and aggregating the resulting labels,\neach data point can be assigned a soft label or label distribution. Such\ndistributions allow for direct injection and quantification of label noise. The\ngenerated datasets serve as a clean baseline of adjustable complexity into\nwhich different types of noise may be introduced. We illustrate how the\nframework can be applied, how it enables quantification of label noise and how\nit improves over existing methodologies.\n","authors":["Sjoerd de Vries","Dirk Thierens"],"pdf_url":"https://arxiv.org/pdf/2309.04318v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.04317v1","updated":"2023-09-08T13:29:57Z","published":"2023-09-08T13:29:57Z","title":"Actor critic learning algorithms for mean-field control with moment\n neural networks","summary":" We develop a new policy gradient and actor-critic algorithm for solving\nmean-field control problems within a continuous time reinforcement learning\nsetting. Our approach leverages a gradient-based representation of the value\nfunction, employing parametrized randomized policies. 
The learning for both the\nactor (policy) and critic (value function) is facilitated by a class of moment\nneural network functions on the Wasserstein space of probability measures, and\nthe key feature is to sample directly trajectories of distributions. A central\nchallenge addressed in this study pertains to the computational treatment of an\noperator specific to the mean-field framework. To illustrate the effectiveness\nof our methods, we provide a comprehensive set of numerical results. These\nencompass diverse examples, including multi-dimensional settings and nonlinear\nquadratic mean-field control problems with controlled volatility.\n","authors":["Huyên Pham","Xavier Warin"],"pdf_url":"https://arxiv.org/pdf/2309.04317v1.pdf","comment":"16 pages, 11 figures"},{"id":"http://arxiv.org/abs/2309.04311v1","updated":"2023-09-08T13:17:06Z","published":"2023-09-08T13:17:06Z","title":"Federated Learning for Early Dropout Prediction on Healthy Ageing\n Applications","summary":" The provision of social care applications is crucial for elderly people to\nimprove their quality of life and enables operators to provide early\ninterventions. Accurate predictions of user dropouts in healthy ageing\napplications are essential since they are directly related to individual health\nstatuses. Machine Learning (ML) algorithms have enabled highly accurate\npredictions, outperforming traditional statistical methods that struggle to\ncope with individual patterns. However, ML requires a substantial amount of\ndata for training, which is challenging due to the presence of personal\nidentifiable information (PII) and the fragmentation posed by regulations. In\nthis paper, we present a federated machine learning (FML) approach that\nminimizes privacy concerns and enables distributed training, without\ntransferring individual data. We employ collaborative training by considering\nindividuals and organizations under FML, which models both cross-device and\ncross-silo learning scenarios. Our approach is evaluated on a real-world\ndataset with non-independent and identically distributed (non-iid) data among\nclients, class imbalance and label ambiguity. Our results show that data\nselection and class imbalance handling techniques significantly improve the\npredictive accuracy of models trained under FML, demonstrating comparable or\nsuperior predictive performance than traditional ML models.\n","authors":["Christos Chrysanthos Nikolaidis","Vasileios Perifanis","Nikolaos Pavlidis","Pavlos S. Efraimidis"],"pdf_url":"https://arxiv.org/pdf/2309.04311v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.02603v2","updated":"2023-09-08T13:14:16Z","published":"2023-08-04T09:11:37Z","title":"Knowledge-Driven Multi-Agent Reinforcement Learning for Computation\n Offloading in Cybertwin-Enabled Internet of Vehicles","summary":" By offloading computation-intensive tasks of vehicles to roadside units\n(RSUs), mobile edge computing (MEC) in the Internet of Vehicles (IoV) can\nrelieve the onboard computation burden. However, existing model-based task\noffloading methods suffer from heavy computational complexity with the increase\nof vehicles and data-driven methods lack interpretability. To address these\nchallenges, in this paper, we propose a knowledge-driven multi-agent\nreinforcement learning (KMARL) approach to reduce the latency of task\noffloading in cybertwin-enabled IoV. 
Specifically, in the considered scenario,\nthe cybertwin serves as a communication agent for each vehicle to exchange\ninformation and make offloading decisions in the virtual space. To reduce the\nlatency of task offloading, a KMARL approach is proposed to select the optimal\noffloading option for each vehicle, where graph neural networks are employed by\nleveraging domain knowledge concerning graph-structure communication topology\nand permutation invariance into neural networks. Numerical results show that\nour proposed KMARL yields higher rewards and demonstrates improved scalability\ncompared with other methods, benefitting from the integration of domain\nknowledge.\n","authors":["Ruijin Sun","Xiao Yang","Nan Cheng","Xiucheng Wang","Changle Li"],"pdf_url":"https://arxiv.org/pdf/2308.02603v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.06603v2","updated":"2023-09-08T13:03:24Z","published":"2023-08-12T16:14:44Z","title":"LadleNet: Translating Thermal Infrared Images to Visible Light Images\n Using A Scalable Two-stage U-Net","summary":" The translation of thermal infrared (TIR) images to visible light (VI) images\npresents a challenging task with potential applications spanning various\ndomains such as TIR-VI image registration and fusion. Leveraging supplementary\ninformation derived from TIR image conversions can significantly enhance model\nperformance and generalization across these applications. However, prevailing\nissues within this field include suboptimal image fidelity and limited model\nscalability. In this paper, we introduce an algorithm, LadleNet, based on the\nU-Net architecture. LadleNet employs a two-stage U-Net concatenation structure,\naugmented with skip connections and refined feature aggregation techniques,\nresulting in a substantial enhancement in model performance. Comprising\n'Handle' and 'Bowl' modules, LadleNet's Handle module facilitates the\nconstruction of an abstract semantic space, while the Bowl module decodes this\nsemantic space to yield mapped VI images. The Handle module exhibits\nextensibility by allowing the substitution of its network architecture with\nsemantic segmentation networks, thereby establishing more abstract semantic\nspaces to bolster model performance. Consequently, we propose LadleNet+, which\nreplaces LadleNet's Handle module with the pre-trained DeepLabv3+ network,\nthereby endowing the model with enhanced semantic space construction\ncapabilities. The proposed method is evaluated and tested on the KAIST dataset,\naccompanied by quantitative and qualitative analyses. Compared to existing\nmethodologies, our approach achieves state-of-the-art performance in terms of\nimage clarity and perceptual quality. The source code will be made available at\nhttps://github.com/Ach-1914/LadleNet/tree/main/.\n","authors":["Tonghui Zou","Lei Chen"],"pdf_url":"https://arxiv.org/pdf/2308.06603v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2305.19659v2","updated":"2023-09-08T13:02:31Z","published":"2023-05-31T08:46:11Z","title":"Improving Expressivity of Graph Neural Networks using Localization","summary":" In this paper, we propose localized versions of Weisfeiler-Leman (WL)\nalgorithms in an effort to both increase the expressivity, as well as decrease\nthe computational overhead. We focus on the specific problem of subgraph\ncounting and give localized versions of $k-$WL for any $k$. We analyze the\npower of Local $k-$WL and prove that it is more expressive than $k-$WL and at\nmost as expressive as $(k+1)-$WL. 
We give a characterization of patterns whose\ncount as a subgraph and induced subgraph are invariant if two graphs are Local\n$k-$WL equivalent. We also introduce two variants of $k-$WL: Layer $k-$WL and\nrecursive $k-$WL. These methods are more time and space efficient than applying\n$k-$WL on the whole graph. We also propose a fragmentation technique that\nguarantees the exact count of all induced subgraphs of size at most 4 using\njust $1-$WL. The same idea can be extended further for larger patterns using\n$k>1$. We also compare the expressive power of Local $k-$WL with other GNN\nhierarchies and show that given a bound on the time-complexity, our methods are\nmore expressive than the ones mentioned in Papp and Wattenhofer[2022a].\n","authors":["Anant Kumar","Shrutimoy Das","Shubhajit Roy","Binita Maity","Anirban Dasgupta"],"pdf_url":"https://arxiv.org/pdf/2305.19659v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.04296v1","updated":"2023-09-08T12:36:49Z","published":"2023-09-08T12:36:49Z","title":"Navigating Out-of-Distribution Electricity Load Forecasting during\n COVID-19: A Continual Learning Approach Leveraging Human Mobility","summary":" In traditional deep learning algorithms, one of the key assumptions is that\nthe data distribution remains constant during both training and deployment.\nHowever, this assumption becomes problematic when faced with\nOut-of-Distribution periods, such as the COVID-19 lockdowns, where the data\ndistribution significantly deviates from what the model has seen during\ntraining. This paper employs a two-fold strategy: utilizing continual learning\ntechniques to update models with new data and harnessing human mobility data\ncollected from privacy-preserving pedestrian counters located outside\nbuildings. In contrast to online learning, which suffers from 'catastrophic\nforgetting' as newly acquired knowledge often erases prior information,\ncontinual learning offers a holistic approach by preserving past insights while\nintegrating new data. This research applies FSNet, a powerful continual\nlearning algorithm, to real-world data from 13 building complexes in Melbourne,\nAustralia, a city which had the second longest total lockdown duration globally\nduring the pandemic. Results underscore the crucial role of continual learning\nin accurate energy forecasting, particularly during Out-of-Distribution\nperiods. Secondary data such as mobility and temperature provided ancillary\nsupport to the primary forecasting model. More importantly, while traditional\nmethods struggled to adapt during lockdowns, models featuring at least online\nlearning demonstrated resilience, with lockdown periods posing fewer challenges\nonce armed with adaptive learning techniques. This study contributes valuable\nmethodologies and insights to the ongoing effort to improve energy load\nforecasting during future Out-of-Distribution periods.\n","authors":["Arian Prabowo","Kaixuan Chen","Hao Xue","Subbu Sethuvenkatraman","Flora D. Salim"],"pdf_url":"https://arxiv.org/pdf/2309.04296v1.pdf","comment":"10 pages, 2 figures, 5 tables, BuildSys '23"},{"id":"http://arxiv.org/abs/2302.10798v3","updated":"2023-09-08T12:36:08Z","published":"2023-02-17T09:37:17Z","title":"Learning a Consensus Sub-Network with Polarization Regularization and\n One Pass Training","summary":" The subject of green AI has been gaining attention within the deep learning\ncommunity given the recent trend of ever larger and more complex neural network\nmodels. 
Existing solutions for reducing the computational load of training at\ninference time usually involve pruning the network parameters. Pruning schemes\noften create extra overhead either by iterative training and fine-tuning for\nstatic pruning or repeated computation of a dynamic pruning graph. We propose a\nnew parameter pruning strategy for learning a lighter-weight sub-network that\nminimizes the energy cost while maintaining comparable performance to the fully\nparameterised network on given downstream tasks. Our proposed pruning scheme is\ngreen-oriented, as it only requires a one-off training to discover the optimal\nstatic sub-networks by dynamic pruning methods. The pruning scheme consists of\na binary gating module and a novel loss function to uncover sub-networks with\nuser-defined sparsity. Our method enables pruning and training simultaneously,\nwhich saves energy in both the training and inference phases and avoids extra\ncomputational overhead from gating modules at inference time. Our results on\nCIFAR-10 and CIFAR-100 suggest that our scheme can remove 50% of connections in\ndeep networks with less than 1% reduction in classification accuracy. Compared\nto other related pruning methods, our method demonstrates a lower drop in\naccuracy for equivalent reductions in computational cost.\n","authors":["Xiaoying Zhi","Varun Babbar","Pheobe Sun","Fran Silavong","Ruibo Shi","Sean Moran"],"pdf_url":"https://arxiv.org/pdf/2302.10798v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2303.14720v2","updated":"2023-09-08T12:20:23Z","published":"2023-03-26T13:15:44Z","title":"Driver Profiling and Bayesian Workload Estimation Using Naturalistic\n Peripheral Detection Study Data","summary":" Monitoring drivers' mental workload facilitates initiating and maintaining\nsafe interactions with in-vehicle information systems, and thus delivers\nadaptive human machine interaction with reduced impact on the primary task of\ndriving. In this paper, we tackle the problem of workload estimation from\ndriving performance data. First, we present a novel on-road study for\ncollecting subjective workload data via a modified peripheral detection task in\nnaturalistic settings. Key environmental factors that induce a high mental\nworkload are identified via video analysis, e.g. junctions and behaviour of\nvehicle in front. Second, a supervised learning framework using\nstate-of-the-art time series classifiers (e.g. convolutional neural network and\ntransform techniques) is introduced to profile drivers based on the average\nworkload they experience during a journey. A Bayesian filtering approach is\nthen proposed for sequentially estimating, in (near) real-time, the driver's\ninstantaneous workload. This computationally efficient and flexible method can\nbe easily personalised to a driver (e.g. incorporate their inferred average\nworkload profile), adapted to driving/environmental contexts (e.g. road type)\nand extended with data streams from new sources. The efficacy of the presented\nprofiling and instantaneous workload estimation approaches are demonstrated\nusing the on-road study data, showing $F_{1}$ scores of up to 92% and 81%,\nrespectively.\n","authors":["Nermin Caber","Bashar I. 
Ahmad","Jiaming Liang","Simon Godsill","Alexandra Bremers","Philip Thomas","David Oxtoby","Lee Skrypchuk"],"pdf_url":"https://arxiv.org/pdf/2303.14720v2.pdf","comment":"Accepted for IEEE Transactions on Intelligent Vehicles"},{"id":"http://arxiv.org/abs/2309.04284v1","updated":"2023-09-08T12:06:48Z","published":"2023-09-08T12:06:48Z","title":"Viewing the process of generating counterfactuals as a source of\n knowledge -- Application to the Naive Bayes classifier","summary":" There are now many comprehension algorithms for understanding the decisions\nof a machine learning algorithm. Among these are those based on the generation\nof counterfactual examples. This article proposes to view this generation\nprocess as a source of creating a certain amount of knowledge that can be\nstored to be used, later, in different ways. This process is illustrated in the\nadditive model and, more specifically, in the case of the naive Bayes\nclassifier, whose interesting properties for this purpose are shown.\n","authors":["Vincent Lemaire","Nathan Le Boudec","Françoise Fessant","Victor Guyomard"],"pdf_url":"https://arxiv.org/pdf/2309.04284v1.pdf","comment":"12 pages"},{"id":"http://arxiv.org/abs/2207.11987v2","updated":"2023-09-08T11:48:47Z","published":"2022-07-25T08:54:36Z","title":"Information Processing Equalities and the Information-Risk Bridge","summary":" We introduce two new classes of measures of information for statistical\nexperiments which generalise and subsume $\\phi$-divergences, integral\nprobability metrics, $\\mathfrak{N}$-distances (MMD), and $(f,\\Gamma)$\ndivergences between two or more distributions. This enables us to derive a\nsimple geometrical relationship between measures of information and the Bayes\nrisk of a statistical decision problem, thus extending the variational\n$\\phi$-divergence representation to multiple distributions in an entirely\nsymmetric manner. The new families of divergence are closed under the action of\nMarkov operators which yields an information processing equality which is a\nrefinement and generalisation of the classical data processing inequality. This\nequality gives insight into the significance of the choice of the hypothesis\nclass in classical risk minimization.\n","authors":["Robert C. Williamson","Zac Cranko"],"pdf_url":"https://arxiv.org/pdf/2207.11987v2.pdf","comment":"48 pages; corrected some typos and added a few additional\n explanations"},{"id":"http://arxiv.org/abs/2309.04272v1","updated":"2023-09-08T11:47:31Z","published":"2023-09-08T11:47:31Z","title":"Learning Zero-Sum Linear Quadratic Games with Improved Sample Complexity","summary":" Zero-sum Linear Quadratic (LQ) games are fundamental in optimal control and\ncan be used (i) as a dynamic game formulation for risk-sensitive or robust\ncontrol, or (ii) as a benchmark setting for multi-agent reinforcement learning\nwith two competing agents in continuous state-control spaces. In contrast to\nthe well-studied single-agent linear quadratic regulator problem, zero-sum LQ\ngames entail solving a challenging nonconvex-nonconcave min-max problem with an\nobjective function that lacks coercivity. Recently, Zhang et al. discovered an\nimplicit regularization property of natural policy gradient methods which is\ncrucial for safety-critical control systems since it preserves the robustness\nof the controller during learning. Moreover, in the model-free setting where\nthe knowledge of model parameters is not available, Zhang et al. 
proposed the\nfirst polynomial sample complexity algorithm to reach an\n$\\epsilon$-neighborhood of the Nash equilibrium while maintaining the desirable\nimplicit regularization property. In this work, we propose a simpler nested\nZeroth-Order (ZO) algorithm improving sample complexity by several orders of\nmagnitude. Our main result guarantees a\n$\\widetilde{\\mathcal{O}}(\\epsilon^{-3})$ sample complexity under the same\nassumptions using a single-point ZO estimator. Furthermore, when the estimator\nis replaced by a two-point estimator, our method enjoys a better\n$\\widetilde{\\mathcal{O}}(\\epsilon^{-2})$ sample complexity. Our key\nimprovements rely on a more sample-efficient nested algorithm design and finer\ncontrol of the ZO natural gradient estimation error.\n","authors":["Jiduan Wu","Anas Barakat","Ilyas Fatkhullin","Niao He"],"pdf_url":"https://arxiv.org/pdf/2309.04272v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2306.03600v2","updated":"2023-09-08T11:42:32Z","published":"2023-06-06T11:44:42Z","title":"Avoid Adversarial Adaption in Federated Learning by Multi-Metric\n Investigations","summary":" Federated Learning (FL) facilitates decentralized machine learning model\ntraining, preserving data privacy, lowering communication costs, and boosting\nmodel performance through diversified data sources. Yet, FL faces\nvulnerabilities such as poisoning attacks, undermining model integrity with\nboth untargeted performance degradation and targeted backdoor attacks.\nPreventing backdoors proves especially challenging due to their stealthy\nnature.\n Prominent mitigation techniques against poisoning attacks rely on monitoring\ncertain metrics and filtering malicious model updates. While shown effective in\nevaluations, we argue that previous works didn't consider realistic real-world\nadversaries and data distributions. We define a new notion of strong adaptive\nadversaries, capable of adapting to multiple objectives simultaneously. Through\nextensive empirical tests, we show that existing defense methods can be easily\ncircumvented in this adversary model. We also demonstrate, that existing\ndefenses have limited effectiveness when no assumptions are made about\nunderlying data distributions.\n We introduce Metric-Cascades (MESAS), a novel defense method for more\nrealistic scenarios and adversary models. MESAS employs multiple detection\nmetrics simultaneously to identify poisoned model updates, creating a complex\nmulti-objective optimization problem for adaptive attackers. In our extensive\nevaluation featuring nine backdoors and three datasets, MESAS consistently\ndetects even strong adaptive attackers. Furthermore, MESAS outperforms existing\ndefenses in distinguishing backdoors from data distribution-related distortions\nwithin and across clients. MESAS is the first defense robust against strong\nadaptive adversaries, effective in real-world data scenarios, with an average\noverhead of just 24.37 seconds.\n","authors":["Torsten Krauß","Alexandra Dmitrienko"],"pdf_url":"https://arxiv.org/pdf/2306.03600v2.pdf","comment":"25 pages, 14 figures, 23 tables, 11 equations"},{"id":"http://arxiv.org/abs/2309.04268v1","updated":"2023-09-08T11:29:05Z","published":"2023-09-08T11:29:05Z","title":"Optimal Rate of Kernel Regression in Large Dimensions","summary":" We perform a study on kernel regression for large-dimensional data (where the\nsample size $n$ is polynomially depending on the dimension $d$ of the samples,\ni.e., $n\\asymp d^{\\gamma}$ for some $\\gamma >0$ ). 
We first build a general\ntool to characterize the upper bound and the minimax lower bound of kernel\nregression for large dimensional data through the Mendelson complexity\n$\\varepsilon_{n}^{2}$ and the metric entropy $\\bar{\\varepsilon}_{n}^{2}$\nrespectively. When the target function falls into the RKHS associated with a\n(general) inner product model defined on $\\mathbb{S}^{d}$, we utilize the new\ntool to show that the minimax rate of the excess risk of kernel regression is\n$n^{-1/2}$ when $n\\asymp d^{\\gamma}$ for $\\gamma =2, 4, 6, 8, \\cdots$. We then\nfurther determine the optimal rate of the excess risk of kernel regression for\nall the $\\gamma>0$ and find that the curve of optimal rate varying along\n$\\gamma$ exhibits several new phenomena including the {\\it multiple descent\nbehavior} and the {\\it periodic plateau behavior}. As an application, For the\nneural tangent kernel (NTK), we also provide a similar explicit description of\nthe curve of optimal rate. As a direct corollary, we know these claims hold for\nwide neural networks as well.\n","authors":["Weihao Lu","Haobo Zhang","Yicheng Li","Manyun Xu","Qian Lin"],"pdf_url":"https://arxiv.org/pdf/2309.04268v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.14839v2","updated":"2023-09-08T11:22:30Z","published":"2023-07-27T13:18:52Z","title":"Kernelized Normalizing Flows","summary":" Normalising Flows are generative models characterised by their invertible\narchitecture. However, the requirement of invertibility imposes constraints on\ntheir expressiveness, necessitating a large number of parameters and innovative\narchitectural designs to achieve satisfactory outcomes. Whilst flow-based\nmodels predominantly rely on neural-network-based transformations for\nexpressive designs, alternative transformation methods have received limited\nattention. In this work, we present Ferumal flow, a novel kernelised\nnormalising flow paradigm that integrates kernels into the framework. Our\nresults demonstrate that a kernelised flow can yield competitive or superior\nresults compared to neural network-based flows whilst maintaining parameter\nefficiency. Kernelised flows excel especially in the low-data regime, enabling\nflexible non-parametric density estimation in applications with sparse data\navailability.\n","authors":["Eshant English","Matthias Kirchler","Christoph Lippert"],"pdf_url":"https://arxiv.org/pdf/2307.14839v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2305.03626v2","updated":"2023-09-08T10:13:25Z","published":"2023-05-05T15:37:23Z","title":"Verifiable Learning for Robust Tree Ensembles","summary":" Verifying the robustness of machine learning models against evasion attacks\nat test time is an important research problem. Unfortunately, prior work\nestablished that this problem is NP-hard for decision tree ensembles, hence\nbound to be intractable for specific inputs. In this paper, we identify a\nrestricted class of decision tree ensembles, called large-spread ensembles,\nwhich admit a security verification algorithm running in polynomial time. We\nthen propose a new approach called verifiable learning, which advocates the\ntraining of such restricted model classes which are amenable for efficient\nverification. 
We show the benefits of this idea by designing a new training\nalgorithm that automatically learns a large-spread decision tree ensemble from\nlabelled data, thus enabling its security verification in polynomial time.\nExperimental results on public datasets confirm that large-spread ensembles\ntrained using our algorithm can be verified in a matter of seconds, using\nstandard commercial hardware. Moreover, large-spread ensembles are more robust\nthan traditional ensembles against evasion attacks, at the cost of an\nacceptable loss of accuracy in the non-adversarial setting.\n","authors":["Stefano Calzavara","Lorenzo Cazzaro","Giulio Ermanno Pibiri","Nicola Prezza"],"pdf_url":"https://arxiv.org/pdf/2305.03626v2.pdf","comment":"19 pages, 5 figures; full version of the revised paper accepted at\n ACM CCS 2023"},{"id":"http://arxiv.org/abs/2308.15363v2","updated":"2023-09-08T10:13:16Z","published":"2023-08-29T14:59:54Z","title":"Text-to-SQL Empowered by Large Language Models: A Benchmark Evaluation","summary":" Large language models (LLMs) have emerged as a new paradigm for Text-to-SQL\ntask. However, the absence of a systematical benchmark inhibits the development\nof designing effective, efficient and economic LLM-based Text-to-SQL solutions.\nTo address this challenge, in this paper, we first conduct a systematical and\nextensive comparison over existing prompt engineering methods, including\nquestion representation, example selection and example organization, and with\nthese experimental results, we elaborate their pros and cons. Based on these\nfindings, we propose a new integrated solution, named DAIL-SQL, which refreshes\nthe Spider leaderboard with 86.6% execution accuracy and sets a new bar. To\nexplore the potential of open-source LLM, we investigate them in various\nscenarios, and further enhance their performance with supervised fine-tuning.\nOur explorations highlight open-source LLMs' potential in Text-to-SQL, as well\nas the advantages and disadvantages of the supervised fine-tuning.\nAdditionally, towards an efficient and economic LLM-based Text-to-SQL solution,\nwe emphasize the token efficiency in prompt engineering and compare the prior\nstudies under this metric. We hope that our work provides a deeper\nunderstanding of Text-to-SQL with LLMs, and inspires further investigations and\nbroad applications.\n","authors":["Dawei Gao","Haibin Wang","Yaliang Li","Xiuyu Sun","Yichen Qian","Bolin Ding","Jingren Zhou"],"pdf_url":"https://arxiv.org/pdf/2308.15363v2.pdf","comment":"We have released code on https://github.com/BeachWang/DAIL-SQL"},{"id":"http://arxiv.org/abs/2309.01706v2","updated":"2023-09-08T09:56:20Z","published":"2023-09-04T16:35:04Z","title":"On the Robustness of Post-hoc GNN Explainers to Label Noise","summary":" Proposed as a solution to the inherent black-box limitations of graph neural\nnetworks (GNNs), post-hoc GNN explainers aim to provide precise and insightful\nexplanations of the behaviours exhibited by trained GNNs. Despite their recent\nnotable advancements in academic and industrial contexts, the robustness of\npost-hoc GNN explainers remains unexplored when confronted with label noise. To\nbridge this gap, we conduct a systematic empirical investigation to evaluate\nthe efficacy of diverse post-hoc GNN explainers under varying degrees of label\nnoise. Our results reveal several key insights: Firstly, post-hoc GNN\nexplainers are susceptible to label perturbations. 
Secondly, even minor levels\nof label noise, inconsequential to GNN performance, harm the quality of\ngenerated explanations substantially. Lastly, we engage in a discourse\nregarding the progressive recovery of explanation effectiveness with escalating\nnoise levels.\n","authors":["Zhiqiang Zhong","Yangqianzi Jiang","Davide Mottin"],"pdf_url":"https://arxiv.org/pdf/2309.01706v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.04236v1","updated":"2023-09-08T09:54:36Z","published":"2023-09-08T09:54:36Z","title":"Adaptive Distributed Kernel Ridge Regression: A Feasible Distributed\n Learning Scheme for Data Silos","summary":" Data silos, mainly caused by privacy and interoperability, significantly\nconstrain collaborations among different organizations with similar data for\nthe same purpose. Distributed learning based on divide-and-conquer provides a\npromising way to settle the data silos, but it suffers from several challenges,\nincluding autonomy, privacy guarantees, and the necessity of collaborations.\nThis paper focuses on developing an adaptive distributed kernel ridge\nregression (AdaDKRR) by taking autonomy in parameter selection, privacy in\ncommunicating non-sensitive information, and the necessity of collaborations in\nperformance improvement into account. We provide both solid theoretical\nverification and comprehensive experiments for AdaDKRR to demonstrate its\nfeasibility and effectiveness. Theoretically, we prove that under some mild\nconditions, AdaDKRR performs similarly to running the optimal learning\nalgorithms on the whole data, verifying the necessity of collaborations and\nshowing that no other distributed learning scheme can essentially beat AdaDKRR\nunder the same conditions. Numerically, we test AdaDKRR on both toy simulations\nand two real-world applications to show that AdaDKRR is superior to other\nexisting distributed learning schemes. All these results show that AdaDKRR is a\nfeasible scheme to defend against data silos, which are highly desired in\nnumerous application regions such as intelligent decision-making, pricing\nforecasting, and performance prediction for products.\n","authors":["Di Wang","Xiaotong Liu","Shao-Bo Lin","Ding-Xuan Zhou"],"pdf_url":"https://arxiv.org/pdf/2309.04236v1.pdf","comment":"46pages, 13figures"},{"id":"http://arxiv.org/abs/2210.03516v4","updated":"2023-09-08T09:33:41Z","published":"2022-10-06T11:06:39Z","title":"Neuroevolution is a Competitive Alternative to Reinforcement Learning\n for Skill Discovery","summary":" Deep Reinforcement Learning (RL) has emerged as a powerful paradigm for\ntraining neural policies to solve complex control tasks. However, these\npolicies tend to be overfit to the exact specifications of the task and\nenvironment they were trained on, and thus do not perform well when conditions\ndeviate slightly or when composed hierarchically to solve even more complex\ntasks. Recent work has shown that training a mixture of policies, as opposed to\na single one, that are driven to explore different regions of the state-action\nspace can address this shortcoming by generating a diverse set of behaviors,\nreferred to as skills, that can be collectively used to great effect in\nadaptation tasks or for hierarchical planning. This is typically realized by\nincluding a diversity term - often derived from information theory - in the\nobjective function optimized by RL. However these approaches often require\ncareful hyperparameter tuning to be effective. 
In this work, we demonstrate\nthat less widely-used neuroevolution methods, specifically Quality Diversity\n(QD), are a competitive alternative to information-theory-augmented RL for\nskill discovery. Through an extensive empirical evaluation comparing eight\nstate-of-the-art algorithms (four flagship algorithms from each line of work)\non the basis of (i) metrics directly evaluating the skills' diversity, (ii) the\nskills' performance on adaptation tasks, and (iii) the skills' performance when\nused as primitives for hierarchical planning; QD methods are found to provide\nequal, and sometimes improved, performance whilst being less sensitive to\nhyperparameters and more scalable. As no single method is found to provide\nnear-optimal performance across all environments, there is a rich scope for\nfurther research which we support by proposing future directions and providing\noptimized open-source implementations.\n","authors":["Felix Chalumeau","Raphael Boige","Bryan Lim","Valentin Macé","Maxime Allard","Arthur Flajolet","Antoine Cully","Thomas Pierrot"],"pdf_url":"https://arxiv.org/pdf/2210.03516v4.pdf","comment":"Camera ready version for ICLR2023 (spotlight)"},{"id":"http://arxiv.org/abs/2309.04222v1","updated":"2023-09-08T09:11:26Z","published":"2023-09-08T09:11:26Z","title":"Offline Recommender System Evaluation under Unobserved Confounding","summary":" Off-Policy Estimation (OPE) methods allow us to learn and evaluate\ndecision-making policies from logged data. This makes them an attractive choice\nfor the offline evaluation of recommender systems, and several recent works\nhave reported successful adoption of OPE methods to this end. An important\nassumption that makes this work is the absence of unobserved confounders:\nrandom variables that influence both actions and rewards at data collection\ntime. Because the data collection policy is typically under the practitioner's\ncontrol, the unconfoundedness assumption is often left implicit, and its\nviolations are rarely dealt with in the existing literature.\n This work aims to highlight the problems that arise when performing\noff-policy estimation in the presence of unobserved confounders, specifically\nfocusing on a recommendation use-case. We focus on policy-based estimators,\nwhere the logging propensities are learned from logged data. We characterise\nthe statistical bias that arises due to confounding, and show how existing\ndiagnostics are unable to uncover such cases. Because the bias depends directly\non the true and unobserved logging propensities, it is non-identifiable. As the\nunconfoundedness assumption is famously untestable, this becomes especially\nproblematic. This paper emphasises this common, yet often overlooked issue.\nThrough synthetic data, we empirically show how na\\\"ive propensity estimation\nunder confounding can lead to severely biased metric estimates that are allowed\nto fly under the radar. 
We aim to cultivate an awareness among researchers and\npractitioners of this important problem, and touch upon potential research\ndirections towards mitigating its effects.\n","authors":["Olivier Jeunen","Ben London"],"pdf_url":"https://arxiv.org/pdf/2309.04222v1.pdf","comment":"Accepted at the CONSEQUENCES'23 workshop at RecSys '23"},{"id":"http://arxiv.org/abs/2309.04221v1","updated":"2023-09-08T09:11:12Z","published":"2023-09-08T09:11:12Z","title":"Concomitant Group Testing","summary":" In this paper, we introduce a variation of the group testing problem\ncapturing the idea that a positive test requires a combination of multiple\n``types'' of item. Specifically, we assume that there are multiple disjoint\n\\emph{semi-defective sets}, and a test is positive if and only if it contains\nat least one item from each of these sets. The goal is to reliably identify all\nof the semi-defective sets using as few tests as possible, and we refer to this\nproblem as \\textit{Concomitant Group Testing} (ConcGT). We derive a variety of\nalgorithms for this task, focusing primarily on the case that there are two\nsemi-defective sets. Our algorithms are distinguished by (i) whether they are\ndeterministic (zero-error) or randomized (small-error), and (ii) whether they\nare non-adaptive, fully adaptive, or have limited adaptivity (e.g., 2 or 3\nstages). Both our deterministic adaptive algorithm and our randomized\nalgorithms (non-adaptive or limited adaptivity) are order-optimal in broad\nscaling regimes of interest, and improve significantly over baseline results\nthat are based on solving a more general problem as an intermediate step (e.g.,\nhypergraph learning).\n","authors":["Thach V. Bui","Jonathan Scarlett"],"pdf_url":"https://arxiv.org/pdf/2309.04221v1.pdf","comment":"15 pages, 3 figures, 1 table"},{"id":"http://arxiv.org/abs/2309.04211v1","updated":"2023-09-08T08:47:23Z","published":"2023-09-08T08:47:23Z","title":"Counterfactual Explanations via Locally-guided Sequential Algorithmic\n Recourse","summary":" Counterfactuals operationalised through algorithmic recourse have become a\npowerful tool to make artificial intelligence systems explainable.\nConceptually, given an individual classified as y -- the factual -- we seek\nactions such that their prediction becomes the desired class y' -- the\ncounterfactual. This process offers algorithmic recourse that is (1) easy to\ncustomise and interpret, and (2) directly aligned with the goals of each\nindividual. However, the properties of a \"good\" counterfactual are still\nlargely debated; it remains an open challenge to effectively locate a\ncounterfactual along with its corresponding recourse. Some strategies use\ngradient-driven methods, but these offer no guarantees on the feasibility of\nthe recourse and are open to adversarial attacks on carefully created\nmanifolds. This can lead to unfairness and lack of robustness. Other methods\nare data-driven, which mostly addresses the feasibility problem at the expense\nof privacy, security and secrecy as they require access to the entire training\ndata set. Here, we introduce LocalFACE, a model-agnostic technique that\ncomposes feasible and actionable counterfactual explanations using\nlocally-acquired information at each step of the algorithmic recourse. 
Our\nexplainer preserves the privacy of users by only leveraging data that it\nspecifically requires to construct actionable algorithmic recourse, and\nprotects the model by offering transparency solely in the regions deemed\nnecessary for the intervention.\n","authors":["Edward A. Small","Jeffrey N. Clark","Christopher J. McWilliams","Kacper Sokol","Jeffrey Chan","Flora D. Salim","Raul Santos-Rodriguez"],"pdf_url":"https://arxiv.org/pdf/2309.04211v1.pdf","comment":"7 pages, 5 figures, 3 appendix pages"},{"id":"http://arxiv.org/abs/2309.04195v1","updated":"2023-09-08T08:12:29Z","published":"2023-09-08T08:12:29Z","title":"Towards Mitigating Architecture Overfitting in Dataset Distillation","summary":" Dataset distillation methods have demonstrated remarkable performance for\nneural networks trained with very limited training data. However, a significant\nchallenge arises in the form of architecture overfitting: the distilled\ntraining data synthesized by a specific network architecture (i.e., training\nnetwork) generates poor performance when trained by other network architectures\n(i.e., test networks). This paper addresses this issue and proposes a series of\napproaches in both architecture designs and training schemes which can be\nadopted together to boost the generalization performance across different\nnetwork architectures on the distilled training data. We conduct extensive\nexperiments to demonstrate the effectiveness and generality of our methods.\nParticularly, across various scenarios involving different sizes of distilled\ndata, our approaches achieve comparable or superior performance to existing\nmethods when training on the distilled data using networks with larger\ncapacities.\n","authors":["Xuyang Zhong","Chen Liu"],"pdf_url":"https://arxiv.org/pdf/2309.04195v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.03531v2","updated":"2023-09-08T07:43:50Z","published":"2023-09-07T07:26:27Z","title":"A Robust Negative Learning Approach to Partial Domain Adaptation Using\n Source Prototypes","summary":" This work proposes a robust Partial Domain Adaptation (PDA) framework that\nmitigates the negative transfer problem by incorporating a robust\ntarget-supervision strategy. It leverages ensemble learning and includes\ndiverse, complementary label feedback, alleviating the effect of incorrect\nfeedback and promoting pseudo-label refinement. Rather than relying exclusively\non first-order moments for distribution alignment, our approach offers explicit\nobjectives to optimize intra-class compactness and inter-class separation with\nthe inferred source prototypes and highly-confident target samples in a\ndomain-invariant fashion. Notably, we ensure source data privacy by eliminating\nthe need to access the source data during the adaptation phase through a priori\ninference of source prototypes. We conducted a series of comprehensive\nexperiments, including an ablation analysis, covering a range of partial domain\nadaptation tasks. 
Comprehensive evaluations on benchmark datasets corroborate\nour framework's enhanced robustness and generalization, demonstrating its\nsuperiority over existing state-of-the-art PDA approaches.\n","authors":["Sandipan Choudhuri","Suli Adeniye","Arunabha Sen"],"pdf_url":"https://arxiv.org/pdf/2309.03531v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.11269v2","updated":"2023-09-08T07:28:27Z","published":"2023-08-22T08:29:09Z","title":"Quantum-Inspired Machine Learning: a Survey","summary":" Quantum-inspired Machine Learning (QiML) is a burgeoning field, receiving\nglobal attention from researchers for its potential to leverage principles of\nquantum mechanics within classical computational frameworks. However, current\nreview literature often presents a superficial exploration of QiML, focusing\ninstead on the broader Quantum Machine Learning (QML) field. In response to\nthis gap, this survey provides an integrated and comprehensive examination of\nQiML, exploring QiML's diverse research domains including tensor network\nsimulations, dequantized algorithms, and others, showcasing recent\nadvancements, practical applications, and illuminating potential future\nresearch avenues. Further, a concrete definition of QiML is established by\nanalyzing various prior interpretations of the term and their inherent\nambiguities. As QiML continues to evolve, we anticipate a wealth of future\ndevelopments drawing from quantum mechanics, quantum computing, and classical\nmachine learning, enriching the field further. This survey serves as a guide\nfor researchers and practitioners alike, providing a holistic understanding of\nQiML's current landscape and future directions.\n","authors":["Larry Huynh","Jin Hong","Ajmal Mian","Hajime Suzuki","Yanqiu Wu","Seyit Camtepe"],"pdf_url":"https://arxiv.org/pdf/2308.11269v2.pdf","comment":"59 pages, 13 figures, 9 tables. - Edited for spelling, grammar, and\n corrected minor typos in formulas - Adjusted wording in places for better\n clarity - Corrected contact info - Added Table 1 to clarify variables used in\n dequantized algs. - Added subsections in QVAS discussing QCBMs and TN-based\n VQC models - Included additional references as requested by authors to ensure\n a more exhaustive survey"},{"id":"http://arxiv.org/abs/2307.10616v2","updated":"2023-09-08T07:19:22Z","published":"2023-07-20T06:32:14Z","title":"Heterogeneous Federated Learning: State-of-the-art and Research\n Challenges","summary":" Federated learning (FL) has drawn increasing attention owing to its potential\nuse in large-scale industrial applications. Existing federated learning works\nmainly focus on model homogeneous settings. However, practical federated\nlearning typically faces the heterogeneity of data distributions, model\narchitectures, network environments, and hardware devices among participant\nclients. Heterogeneous Federated Learning (HFL) is much more challenging, and\ncorresponding solutions are diverse and complex. Therefore, a systematic survey\non this topic about the research challenges and state-of-the-art is essential.\nIn this survey, we firstly summarize the various research challenges in HFL\nfrom five aspects: statistical heterogeneity, model heterogeneity,\ncommunication heterogeneity, device heterogeneity, and additional challenges.\nIn addition, recent advances in HFL are reviewed and a new taxonomy of existing\nHFL methods is proposed with an in-depth analysis of their pros and cons. 
We\nclassify existing methods from three different levels according to the HFL\nprocedure: data-level, model-level, and server-level. Finally, several critical\nand promising future research directions in HFL are discussed, which may\nfacilitate further developments in this field. A periodically updated\ncollection on HFL is available at https://github.com/marswhu/HFL_Survey.\n","authors":["Mang Ye","Xiuwen Fang","Bo Du","Pong C. Yuen","Dacheng Tao"],"pdf_url":"https://arxiv.org/pdf/2307.10616v2.pdf","comment":"42 pages, 11 figures, and 4 tables"},{"id":"http://arxiv.org/abs/2112.08967v2","updated":"2023-09-08T07:19:08Z","published":"2021-12-16T15:35:15Z","title":"Multi-task UNet architecture for end-to-end autonomous driving","summary":" We propose an end-to-end driving model that integrates a multi-task UNet\n(MTUNet) architecture and control algorithms in a pipeline of data flow from a\nfront camera through this model to driving decisions. It provides quantitative\nmeasures to evaluate the holistic, dynamic, and real-time performance of\nend-to-end driving systems and thus the safety and interpretability of MTUNet.\nThe architecture consists of one segmentation, one regression, and two\nclassification tasks for lane segmentation, path prediction, and vehicle\ncontrols. We present three variants of the architecture having different\ncomplexities, compare them on different tasks in four static measures for both\nsingle and multiple tasks, and then identify the best one by two additional\ndynamic measures in real-time simulation. Our results show that the performance\nof the proposed supervised learning model is comparable to that of a\nreinforcement learning model on curvy roads for the same task, which is not\nend-to-end but multi-module.\n","authors":["Der-Hau Lee","Jinn-Liang Liu"],"pdf_url":"https://arxiv.org/pdf/2112.08967v2.pdf","comment":"6 pages, 5 figures, a condensation of the previous version"},{"id":"http://arxiv.org/abs/2309.04160v1","updated":"2023-09-08T07:01:38Z","published":"2023-09-08T07:01:38Z","title":"Leveraging Prototype Patient Representations with Feature-Missing-Aware\n Calibration to Mitigate EHR Data Sparsity","summary":" Electronic Health Record (EHR) data frequently exhibits sparse\ncharacteristics, posing challenges for predictive modeling. Current direct\nimputation such as matrix imputation approaches hinge on referencing analogous\nrows or columns to complete raw missing data and do not differentiate between\nimputed and actual values. As a result, models may inadvertently incorporate\nirrelevant or deceptive information with respect to the prediction objective,\nthereby compromising the efficacy of downstream performance. While some methods\nstrive to recalibrate or augment EHR embeddings after direct imputation, they\noften mistakenly prioritize imputed features. This misprioritization can\nintroduce biases or inaccuracies into the model. To tackle these issues, our\nwork resorts to indirect imputation, where we leverage prototype\nrepresentations from similar patients to obtain a denser embedding. Recognizing\nthe limitation that missing features are typically treated the same as present\nones when measuring similar patients, our approach designs a feature confidence\nlearner module. This module is sensitive to the missing feature status,\nenabling the model to better judge the reliability of each feature. 
Moreover,\nwe propose a novel patient similarity metric that takes feature confidence into\naccount, ensuring that evaluations are not based merely on potentially\ninaccurate imputed values. Consequently, our work captures dense prototype\npatient representations with feature-missing-aware calibration process.\nComprehensive experiments demonstrate that designed model surpasses established\nEHR-focused models with a statistically significant improvement on MIMIC-III\nand MIMIC-IV datasets in-hospital mortality outcome prediction task. The code\nis publicly available at \\url{https://anonymous.4open.science/r/SparseEHR} to\nassure the reproducibility.\n","authors":["Yinghao Zhu","Zixiang Wang","Long He","Shiyun Xie","Zixi Chen","Jingkun An","Liantao Ma","Chengwei Pan"],"pdf_url":"https://arxiv.org/pdf/2309.04160v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2207.10495v2","updated":"2023-09-08T05:57:49Z","published":"2022-07-21T14:21:34Z","title":"Generating and Detecting True Ambiguity: A Forgotten Danger in DNN\n Supervision Testing","summary":" Deep Neural Networks (DNNs) are becoming a crucial component of modern\nsoftware systems, but they are prone to fail under conditions that are\ndifferent from the ones observed during training (out-of-distribution inputs)\nor on inputs that are truly ambiguous, i.e., inputs that admit multiple classes\nwith nonzero probability in their labels. Recent work proposed DNN supervisors\nto detect high-uncertainty inputs before their possible misclassification leads\nto any harm. To test and compare the capabilities of DNN supervisors,\nresearchers proposed test generation techniques, to focus the testing effort on\nhigh-uncertainty inputs that should be recognized as anomalous by supervisors.\nHowever, existing test generators aim to produce out-of-distribution inputs. No\nexisting model- and supervisor independent technique targets the generation of\ntruly ambiguous test inputs, i.e., inputs that admit multiple classes according\nto expert human judgment.\n In this paper, we propose a novel way to generate ambiguous inputs to test\nDNN supervisors and used it to empirically compare several existing supervisor\ntechniques. In particular, we propose AmbiGuess to generate ambiguous samples\nfor image classification problems. AmbiGuess is based on gradient-guided\nsampling in the latent space of a regularized adversarial autoencoder.\nMoreover, we conducted what is -- to the best of our knowledge -- the most\nextensive comparative study of DNN supervisors, considering their capabilities\nto detect 4 distinct types of high-uncertainty inputs, including truly\nambiguous ones. We find that the tested supervisors' capabilities are\ncomplementary: Those best suited to detect true ambiguity perform worse on\ninvalid, out-of-distribution and adversarial inputs and vice-versa.\n","authors":["Michael Weiss","André García Gómez","Paolo Tonella"],"pdf_url":"https://arxiv.org/pdf/2207.10495v2.pdf","comment":"Accepted for publication at Springers \"Empirical Software\n Engineering\" (EMSE)"},{"id":"http://arxiv.org/abs/2307.12510v3","updated":"2023-09-08T05:21:57Z","published":"2023-07-24T03:52:11Z","title":"An Empirical Evaluation of Temporal Graph Benchmark","summary":" In this paper, we conduct an empirical evaluation of Temporal Graph Benchmark\n(TGB) by extending our Dynamic Graph Library (DyGLib) to TGB. Compared with\nTGB, we include eleven popular dynamic graph learning methods for more\nexhaustive comparisons. 
Through the experiments, we find that (1) different\nmodels depict varying performance across various datasets, which is in line\nwith previous observations; (2) the performance of some baselines can be\nsignificantly improved over the reported results in TGB when using DyGLib. This\nwork aims to ease the researchers' efforts in evaluating various dynamic graph\nlearning methods on TGB and attempts to offer results that can be directly\nreferenced in the follow-up research. All the used resources in this project\nare publicly available at https://github.com/yule-BUAA/DyGLib_TGB. This work is\nin progress, and feedback from the community is welcomed for improvements.\n","authors":["Le Yu"],"pdf_url":"https://arxiv.org/pdf/2307.12510v3.pdf","comment":"preprint, in progress, more results are added"},{"id":"http://arxiv.org/abs/2305.10502v2","updated":"2023-09-08T05:15:57Z","published":"2023-05-17T18:16:33Z","title":"EENED: End-to-End Neural Epilepsy Detection based on Convolutional\n Transformer","summary":" Recently Transformer and Convolution neural network (CNN) based models have\nshown promising results in EEG signal processing. Transformer models can\ncapture the global dependencies in EEG signals through a self-attention\nmechanism, while CNN models can capture local features such as sawtooth waves.\nIn this work, we propose an end-to-end neural epilepsy detection model, EENED,\nthat combines CNN and Transformer. Specifically, by introducing the convolution\nmodule into the Transformer encoder, EENED can learn the time-dependent\nrelationship of the patient's EEG signal features and notice local EEG abnormal\nmutations closely related to epilepsy, such as the appearance of spikes and the\nsprinkling of sharp and slow waves. Our proposed framework combines the ability\nof Transformer and CNN to capture different scale features of EEG signals and\nholds promise for improving the accuracy and reliability of epilepsy detection.\nOur source code will be released soon on GitHub.\n","authors":["Chenyu Liu","Xinliang Zhou","Yang Liu"],"pdf_url":"https://arxiv.org/pdf/2305.10502v2.pdf","comment":"Accepted by IEEE CAI 2023"},{"id":"http://arxiv.org/abs/2304.01333v3","updated":"2023-09-08T04:28:00Z","published":"2023-04-03T19:53:31Z","title":"Classification of integers based on residue classes via modern deep\n learning algorithms","summary":" Judging whether an integer can be divided by prime numbers such as 2 or 3 may\nappear trivial to human beings, but can be less straightforward for computers.\nHere, we tested multiple deep learning architectures and feature engineering\napproaches on classifying integers based on their residues when divided by\nsmall prime numbers. We found that the ability of classification critically\ndepends on the feature space. We also evaluated Automated Machine Learning\n(AutoML) platforms from Amazon, Google and Microsoft, and found that they\nfailed on this task without appropriately engineered features. Furthermore, we\nintroduced a method that utilizes linear regression on Fourier series basis\nvectors, and demonstrated its effectiveness. Finally, we evaluated Large\nLanguage Models (LLMs) such as GPT-4, GPT-J, LLaMA and Falcon, and demonstrated\ntheir failures. 
In conclusion, feature engineering remains an important task to\nimprove performance and increase interpretability of machine-learning models,\neven in the era of AutoML and LLMs.\n","authors":["Da Wu","Jingye Yang","Mian Umair Ahsan","Kai Wang"],"pdf_url":"https://arxiv.org/pdf/2304.01333v3.pdf","comment":"Accepted at Patterns"},{"id":"http://arxiv.org/abs/2309.04100v1","updated":"2023-09-08T03:41:54Z","published":"2023-09-08T03:41:54Z","title":"A Deep Learning Method for Sensitivity Enhancement of Deuterium\n Metabolic Imaging (DMI)","summary":" Purpose: Common to most MRSI techniques, the spatial resolution and the\nminimal scan duration of Deuterium Metabolic Imaging (DMI) are limited by the\nachievable SNR. This work presents a deep learning method for sensitivity\nenhancement of DMI.\n Methods: A convolutional neural network (CNN) was designed to estimate the\n2H-labeled metabolite concentrations from low SNR and distorted DMI FIDs. The\nCNN was trained with synthetic data that represent a range of SNR levels\ntypically encountered in vivo. The estimation precision was further improved by\nfine-tuning the CNN with MRI-based edge-preserving regularization for each DMI\ndataset. The proposed processing method, PReserved Edge ConvolutIonal neural\nnetwork for Sensitivity Enhanced DMI (PRECISE-DMI), was applied to simulation\nstudies and in vivo experiments to evaluate the anticipated improvements in SNR\nand investigate the potential for inaccuracies.\n Results: PRECISE-DMI visually improved the metabolic maps of low SNR\ndatasets, and quantitatively provided higher precision than the standard\nFourier reconstruction. Processing of DMI data acquired in rat brain tumor\nmodels resulted in more precise determination of 2H-labeled lactate and\nglutamate + glutamine levels, at increased spatial resolution (from >8 to 2\n$\\mu$L) or shortened scan time (from 32 to 4 min) compared to standard\nacquisitions. However, rigorous SD-bias analyses showed that overuse of the\nedge-preserving regularization can compromise the accuracy of the results.\n Conclusion: PRECISE-DMI allows a flexible trade-off between enhancing the\nsensitivity of DMI and minimizing the inaccuracies. With typical settings, the\nDMI sensitivity can be improved by 3-fold while retaining the capability to\ndetect local signal variations.\n","authors":["Siyuan Dong","Henk M. De Feyter","Monique A. Thomas","Robin A. de Graaf","James S. Duncan"],"pdf_url":"https://arxiv.org/pdf/2309.04100v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2207.11159v3","updated":"2023-09-08T03:23:37Z","published":"2022-07-22T15:55:49Z","title":"Network Revenue Management with Demand Learning and Fair\n Resource-Consumption Balancing","summary":" In addition to maximizing the total revenue, decision-makers in lots of\nindustries would like to guarantee balanced consumption across different\nresources. For instance, in the retailing industry, ensuring a balanced\nconsumption of resources from different suppliers enhances fairness and helps\nmain a healthy channel relationship; in the cloud computing industry,\nresource-consumption balance helps increase customer satisfaction and reduce\noperational costs. Motivated by these practical needs, this paper studies the\nprice-based network revenue management (NRM) problem with both demand learning\nand fair resource-consumption balancing. 
We introduce the regularized revenue,\ni.e., the total revenue with a balancing regularization, as our objective to\nincorporate fair resource-consumption balancing into the revenue maximization\ngoal. We propose a primal-dual-type online policy with the\nUpper-Confidence-Bound (UCB) demand learning method to maximize the regularized\nrevenue. We adopt several innovative techniques to make our algorithm a unified\nand computationally efficient framework for the continuous price set and a wide\nclass of balancing regularizers. Our algorithm achieves a worst-case regret of\n$\\widetilde O(N^{5/2}\\sqrt{T})$, where $N$ denotes the number of products and\n$T$ denotes the number of time periods. Numerical experiments in a few NRM\nexamples demonstrate the effectiveness of our algorithm in simultaneously\nachieving revenue maximization and fair resource-consumption balancing\n","authors":["Xi Chen","Jiameng Lyu","Yining Wang","Yuan Zhou"],"pdf_url":"https://arxiv.org/pdf/2207.11159v3.pdf","comment":"Forthcoming in Production and Operations Management. The original\n title is Fairness-aware Network Revenue Management With Demand Learning"},{"id":"http://arxiv.org/abs/2309.04085v1","updated":"2023-09-08T02:54:31Z","published":"2023-09-08T02:54:31Z","title":"Sample-Efficient Co-Design of Robotic Agents Using Multi-fidelity\n Training on Universal Policy Network","summary":" Co-design involves simultaneously optimizing the controller and agents\nphysical design. Its inherent bi-level optimization formulation necessitates an\nouter loop design optimization driven by an inner loop control optimization.\nThis can be challenging when the design space is large and each design\nevaluation involves data-intensive reinforcement learning process for control\noptimization. To improve the sample-efficiency we propose a\nmulti-fidelity-based design exploration strategy based on Hyperband where we\ntie the controllers learnt across the design spaces through a universal policy\nlearner for warm-starting the subsequent controller learning problems. Further,\nwe recommend a particular way of traversing the Hyperband generated design\nmatrix that ensures that the stochasticity of the Hyperband is reduced the most\nwith the increasing warm starting effect of the universal policy learner as it\nis strengthened with each new design evaluation. Experiments performed on a\nwide range of agent design problems demonstrate the superiority of our method\ncompared to the baselines. Additionally, analysis of the optimized designs\nshows interesting design alterations including design simplifications and\nnon-intuitive alterations that have emerged in the biological world.\n","authors":["Kishan R. Nagiredla","Buddhika L. Semage","Thommen G. Karimpanal","Arun Kumar A. V","Santu Rana"],"pdf_url":"https://arxiv.org/pdf/2309.04085v1.pdf","comment":"17 pages, 10 figures"},{"id":"http://arxiv.org/abs/2309.04082v1","updated":"2023-09-08T02:44:37Z","published":"2023-09-08T02:44:37Z","title":"Curve Your Attention: Mixed-Curvature Transformers for Graph\n Representation Learning","summary":" Real-world graphs naturally exhibit hierarchical or cyclical structures that\nare unfit for the typical Euclidean space. While there exist graph neural\nnetworks that leverage hyperbolic or spherical spaces to learn representations\nthat embed such structures more accurately, these methods are confined under\nthe message-passing paradigm, making the models vulnerable against side-effects\nsuch as oversmoothing and oversquashing. 
More recent work have proposed global\nattention-based graph Transformers that can easily model long-range\ninteractions, but their extensions towards non-Euclidean geometry are yet\nunexplored. To bridge this gap, we propose Fully Product-Stereographic\nTransformer, a generalization of Transformers towards operating entirely on the\nproduct of constant curvature spaces. When combined with tokenized graph\nTransformers, our model can learn the curvature appropriate for the input graph\nin an end-to-end fashion, without the need of additional tuning on different\ncurvature initializations. We also provide a kernelized approach to\nnon-Euclidean attention, which enables our model to run in time and memory cost\nlinear to the number of nodes and edges while respecting the underlying\ngeometry. Experiments on graph reconstruction and node classification\ndemonstrate the benefits of generalizing Transformers to the non-Euclidean\ndomain.\n","authors":["Sungjun Cho","Seunghyuk Cho","Sungwoo Park","Hankook Lee","Honglak Lee","Moontae Lee"],"pdf_url":"https://arxiv.org/pdf/2309.04082v1.pdf","comment":"19 pages, 7 figures"},{"id":"http://arxiv.org/abs/2309.04081v1","updated":"2023-09-08T02:42:40Z","published":"2023-09-08T02:42:40Z","title":"UER: A Heuristic Bias Addressing Approach for Online Continual Learning","summary":" Online continual learning aims to continuously train neural networks from a\ncontinuous data stream with a single pass-through data. As the most effective\napproach, the rehearsal-based methods replay part of previous data. Commonly\nused predictors in existing methods tend to generate biased dot-product logits\nthat prefer to the classes of current data, which is known as a bias issue and\na phenomenon of forgetting. Many approaches have been proposed to overcome the\nforgetting problem by correcting the bias; however, they still need to be\nimproved in online fashion. In this paper, we try to address the bias issue by\na more straightforward and more efficient method. By decomposing the\ndot-product logits into an angle factor and a norm factor, we empirically find\nthat the bias problem mainly occurs in the angle factor, which can be used to\nlearn novel knowledge as cosine logits. On the contrary, the norm factor\nabandoned by existing methods helps remember historical knowledge. Based on\nthis observation, we intuitively propose to leverage the norm factor to balance\nthe new and old knowledge for addressing the bias. To this end, we develop a\nheuristic approach called unbias experience replay (UER). UER learns current\nsamples only by the angle factor and further replays previous samples by both\nthe norm and angle factors. Extensive experiments on three datasets show that\nUER achieves superior performance over various state-of-the-art methods. The\ncode is in https://github.com/FelixHuiweiLin/UER.\n","authors":["Huiwei Lin","Shanshan Feng","Baoquan Zhang","Hongliang Qiao","Xutao Li","Yunming Ye"],"pdf_url":"https://arxiv.org/pdf/2309.04081v1.pdf","comment":"9 pages, 12 figures, ACM MM2023"},{"id":"http://arxiv.org/abs/2308.15452v2","updated":"2023-09-08T02:31:35Z","published":"2023-08-29T17:22:39Z","title":"When Do Program-of-Thoughts Work for Reasoning?","summary":" The reasoning capabilities of Large Language Models (LLMs) play a pivotal\nrole in the realm of embodied artificial intelligence. 
Although there are\neffective methods like program-of-thought prompting for LLMs which uses\nprogramming language to tackle complex reasoning tasks, the specific impact of\ncode data on the improvement of reasoning capabilities remains under-explored.\nTo address this gap, we propose complexity-impacted reasoning score (CIRS),\nwhich combines structural and logical attributes, to measure the correlation\nbetween code and reasoning abilities. Specifically, we use the abstract syntax\ntree to encode the structural information and calculate logical complexity by\nconsidering the difficulty and the cyclomatic complexity. Through an empirical\nanalysis, we find not all code data of complexity can be learned or understood\nby LLMs. Optimal level of complexity is critical to the improvement of\nreasoning abilities by program-aided prompting. Then we design an\nauto-synthesizing and stratifying algorithm, and apply it to instruction\ngeneration for mathematical reasoning and code data filtering for code\ngeneration tasks. Extensive results demonstrates the effectiveness of our\nproposed approach. Code will be integrated into the EasyInstruct framework at\nhttps://github.com/zjunlp/EasyInstruct.\n","authors":["Zhen Bi","Ningyu Zhang","Yinuo Jiang","Shumin Deng","Guozhou Zheng","Huajun Chen"],"pdf_url":"https://arxiv.org/pdf/2308.15452v2.pdf","comment":"Work in progress"},{"id":"http://arxiv.org/abs/2309.04078v1","updated":"2023-09-08T02:27:28Z","published":"2023-09-08T02:27:28Z","title":"Enabling the Evaluation of Driver Physiology Via Vehicle Dynamics","summary":" Driving is a daily routine for many individuals across the globe. This paper\npresents the configuration and methodologies used to transform a vehicle into a\nconnected ecosystem capable of assessing driver physiology. We integrated an\narray of commercial sensors from the automotive and digital health sectors\nalong with driver inputs from the vehicle itself. This amalgamation of sensors\nallows for meticulous recording of the external conditions and driving\nmaneuvers. These data streams are processed to extract key parameters,\nproviding insights into driver behavior in relation to their external\nenvironment and illuminating vital physiological responses. This innovative\ndriver evaluation system holds the potential to amplify road safety. Moreover,\nwhen paired with data from conventional health settings, it may enhance early\ndetection of health-related complications.\n","authors":["Rodrigo Ordonez-Hurtado","Bo Wen","Nicholas Barra","Ryan Vimba","Sergio Cabrero-Barros","Sergiy Zhuk","Jeffrey L. Rogers"],"pdf_url":"https://arxiv.org/pdf/2309.04078v1.pdf","comment":"7 pages, 11 figures, 2023 IEEE International Conference on Digital\n Health (ICDH)"},{"id":"http://arxiv.org/abs/2309.04072v1","updated":"2023-09-08T02:09:40Z","published":"2023-09-08T02:09:40Z","title":"Riemannian Langevin Monte Carlo schemes for sampling PSD matrices with\n fixed rank","summary":" This paper introduces two explicit schemes to sample matrices from Gibbs\ndistributions on $\\mathcal S^{n,p}_+$, the manifold of real positive\nsemi-definite (PSD) matrices of size $n\\times n$ and rank $p$. Given an energy\nfunction $\\mathcal E:\\mathcal S^{n,p}_+\\to \\mathbb{R}$ and certain Riemannian\nmetrics $g$ on $\\mathcal S^{n,p}_+$, these schemes rely on an Euler-Maruyama\ndiscretization of the Riemannian Langevin equation (RLE) with Brownian motion\non the manifold. 
We present numerical schemes for RLE under two fundamental\nmetrics on $\\mathcal S^{n,p}_+$: (a) the metric obtained from the embedding of\n$\\mathcal S^{n,p}_+ \\subset \\mathbb{R}^{n\\times n} $; and (b) the\nBures-Wasserstein metric corresponding to quotient geometry. We also provide\nexamples of energy functions with explicit Gibbs distributions that allow\nnumerical validation of these schemes.\n","authors":["Tianmin Yu","Shixin Zheng","Jianfeng Lu","Govind Menon","Xiangxiong Zhang"],"pdf_url":"https://arxiv.org/pdf/2309.04072v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2306.08719v3","updated":"2023-09-08T02:06:57Z","published":"2023-06-14T19:48:30Z","title":"Off-policy Evaluation in Doubly Inhomogeneous Environments","summary":" This work aims to study off-policy evaluation (OPE) under scenarios where two\nkey reinforcement learning (RL) assumptions -- temporal stationarity and\nindividual homogeneity are both violated. To handle the ``double\ninhomogeneities\", we propose a class of latent factor models for the reward and\nobservation transition functions, under which we develop a general OPE\nframework that consists of both model-based and model-free approaches. To our\nknowledge, this is the first paper that develops statistically sound OPE\nmethods in offline RL with double inhomogeneities. It contributes to a deeper\nunderstanding of OPE in environments, where standard RL assumptions are not\nmet, and provides several practical approaches in these settings. We establish\nthe theoretical properties of the proposed value estimators and empirically\nshow that our approach outperforms competing methods that ignore either\ntemporal nonstationarity or individual heterogeneity. Finally, we illustrate\nour method on a data set from the Medical Information Mart for Intensive Care.\n","authors":["Zeyu Bian","Chengchun Shi","Zhengling Qi","Lan Wang"],"pdf_url":"https://arxiv.org/pdf/2306.08719v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2303.06980v4","updated":"2023-09-08T01:38:17Z","published":"2023-03-13T10:30:02Z","title":"Self-supervised learning-based general laboratory progress pretrained\n model for cardiovascular event detection","summary":" The inherent nature of patient data poses several challenges. Prevalent cases\namass substantial longitudinal data owing to their patient volume and\nconsistent follow-ups, however, longitudinal laboratory data are renowned for\ntheir irregularity, temporality, absenteeism, and sparsity; In contrast,\nrecruitment for rare or specific cases is often constrained due to their\nlimited patient size and episodic observations. This study employed\nself-supervised learning (SSL) to pretrain a generalized laboratory progress\n(GLP) model that captures the overall progression of six common laboratory\nmarkers in prevalent cardiovascular cases, with the intention of transferring\nthis knowledge to aid in the detection of specific cardiovascular event. GLP\nimplemented a two-stage training approach, leveraging the information embedded\nwithin interpolated data and amplify the performance of SSL. After GLP\npretraining, it is transferred for TVR detection. The proposed two-stage\ntraining improved the performance of pure SSL, and the transferability of GLP\nexhibited distinctiveness. After GLP processing, the classification exhibited a\nnotable enhancement, with averaged accuracy rising from 0.63 to 0.90. All\nevaluated metrics demonstrated substantial superiority (p < 0.01) compared to\nprior GLP processing. 
Our study effectively engages in translational\nengineering by transferring patient progression of cardiovascular laboratory\nparameters from one patient group to another, transcending the limitations of\ndata availability. The transferability of disease progression optimized the\nstrategies of examinations and treatments, and improves patient prognosis while\nusing commonly available laboratory parameters. The potential for expanding\nthis approach to encompass other diseases holds great promise.\n","authors":["Li-Chin Chen","Kuo-Hsuan Hung","Yi-Ju Tseng","Hsin-Yao Wang","Tse-Min Lu","Wei-Chieh Huang","Yu Tsao"],"pdf_url":"https://arxiv.org/pdf/2303.06980v4.pdf","comment":"published in IEEE Journal of Translational Engineering in Health &\n Medicine"},{"id":"http://arxiv.org/abs/2309.04062v1","updated":"2023-09-08T01:36:58Z","published":"2023-09-08T01:36:58Z","title":"3D Denoisers are Good 2D Teachers: Molecular Pretraining via Denoising\n and Cross-Modal Distillation","summary":" Pretraining molecular representations from large unlabeled data is essential\nfor molecular property prediction due to the high cost of obtaining\nground-truth labels. While there exist various 2D graph-based molecular\npretraining approaches, these methods struggle to show statistically\nsignificant gains in predictive performance. Recent work have thus instead\nproposed 3D conformer-based pretraining under the task of denoising, which led\nto promising results. During downstream finetuning, however, models trained\nwith 3D conformers require accurate atom-coordinates of previously unseen\nmolecules, which are computationally expensive to acquire at scale. In light of\nthis limitation, we propose D&D, a self-supervised molecular representation\nlearning framework that pretrains a 2D graph encoder by distilling\nrepresentations from a 3D denoiser. With denoising followed by cross-modal\nknowledge distillation, our approach enjoys use of knowledge obtained from\ndenoising as well as painless application to downstream tasks with no access to\naccurate conformers. Experiments on real-world molecular property prediction\ndatasets show that the graph encoder trained via D&D can infer 3D information\nbased on the 2D graph and shows superior performance and label-efficiency\nagainst other baselines.\n","authors":["Sungjun Cho","Dae-Woong Jeong","Sung Moon Ko","Jinwoo Kim","Sehui Han","Seunghoon Hong","Honglak Lee","Moontae Lee"],"pdf_url":"https://arxiv.org/pdf/2309.04062v1.pdf","comment":"16 pages, 5 figures"},{"id":"http://arxiv.org/abs/2308.16375v2","updated":"2023-09-08T01:19:28Z","published":"2023-08-31T00:31:08Z","title":"A Survey on Privacy in Graph Neural Networks: Attacks, Preservation, and\n Applications","summary":" Graph Neural Networks (GNNs) have gained significant attention owing to their\nability to handle graph-structured data and the improvement in practical\napplications. However, many of these models prioritize high utility\nperformance, such as accuracy, with a lack of privacy consideration, which is a\nmajor concern in modern society where privacy attacks are rampant. To address\nthis issue, researchers have started to develop privacy-preserving GNNs.\nDespite this progress, there is a lack of a comprehensive overview of the\nattacks and the techniques for preserving privacy in the graph domain. 
In this\nsurvey, we aim to address this gap by summarizing the attacks on graph data\naccording to the targeted information, categorizing the privacy preservation\ntechniques in GNNs, and reviewing the datasets and applications that could be\nused for analyzing/solving privacy issues in GNNs. We also outline potential\ndirections for future research in order to build better privacy-preserving\nGNNs.\n","authors":["Yi Zhang","Yuying Zhao","Zhaoqing Li","Xueqi Cheng","Yu Wang","Olivera Kotevska","Philip S. Yu","Tyler Derr"],"pdf_url":"https://arxiv.org/pdf/2308.16375v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2306.10617v2","updated":"2023-09-08T00:56:18Z","published":"2023-06-18T18:46:16Z","title":"GPU-Accelerated Verification of Machine Learning Models for Power\n Systems","summary":" Computational tools for rigorously verifying the performance of large-scale\nmachine learning (ML) models have progressed significantly in recent years. The\nmost successful solvers employ highly specialized, GPU-accelerated branch and\nbound routines. Such tools are crucial for the successful deployment of machine\nlearning applications in safety-critical systems, such as power systems.\nDespite their successes, however, barriers prevent out-of-the-box application\nof these routines to power system problems. This paper addresses this issue in\ntwo key ways. First, for the first time to our knowledge, we enable the\nsimultaneous verification of multiple verification problems (e.g., checking for\nthe violation of all line flow constraints simultaneously and not by solving\nindividual verification problems). For that, we introduce an exact\ntransformation that converts the \"worst-case\" violation across a set of\npotential violations to a series of ReLU-based layers that augment the original\nneural network. This allows verifiers to interpret them directly. Second, power\nsystem ML models often must be verified to satisfy power flow constraints. We\npropose a dualization procedure which encodes linear equality and inequality\nconstraints directly into the verification problem; and in a manner which is\nmathematically consistent with the specialized verification tools. To\ndemonstrate these innovations, we verify problems associated with data-driven\nsecurity constrained DC-OPF solvers. We build and test our first set of\ninnovations using the $\\alpha,\\beta$-CROWN solver, and we benchmark against\nGurobi 10.0. Our contributions achieve a speedup that can exceed 100x and allow\nhigher degrees of verification flexibility.\n","authors":["Samuel Chevalier","Ilgiz Murzakhanov","Spyros Chatzivasileiadis"],"pdf_url":"https://arxiv.org/pdf/2306.10617v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2305.01777v4","updated":"2023-09-08T00:31:53Z","published":"2023-05-02T20:36:34Z","title":"Representation Learning via Manifold Flattening and Reconstruction","summary":" This work proposes an algorithm for explicitly constructing a pair of neural\nnetworks that linearize and reconstruct an embedded submanifold, from finite\nsamples of this manifold. Our such-generated neural networks, called Flattening\nNetworks (FlatNet), are theoretically interpretable, computationally feasible\nat scale, and generalize well to test data, a balance not typically found in\nmanifold-based learning methods. 
We present empirical results and comparisons\nto other models on synthetic high-dimensional manifold data and 2D image data.\nOur code is publicly available.\n","authors":["Michael Psenka","Druv Pai","Vishal Raman","Shankar Sastry","Yi Ma"],"pdf_url":"https://arxiv.org/pdf/2305.01777v4.pdf","comment":"44 pages, 19 figures"},{"id":"http://arxiv.org/abs/2302.03845v2","updated":"2023-09-08T00:23:45Z","published":"2023-02-08T02:38:26Z","title":"Two-step hyperparameter optimization method: Accelerating hyperparameter\n search by using a fraction of a training dataset","summary":" Hyperparameter optimization (HPO) is an important step in machine learning\n(ML) model development, but common practices are archaic -- primarily relying\non manual or grid searches. This is partly because adopting advanced HPO\nalgorithms introduces added complexity to the workflow, leading to longer\ncomputation times. This poses a notable challenge to ML applications, as\nsuboptimal hyperparameter selections curtail the potential of ML model\nperformance, ultimately obstructing the full exploitation of ML techniques. In\nthis article, we present a two-step HPO method as a strategic solution to\ncurbing computational demands and wait times, gleaned from practical\nexperiences in applied ML parameterization work. The initial phase involves a\npreliminary evaluation of hyperparameters on a small subset of the training\ndataset, followed by a re-evaluation of the top-performing candidate models\npost-retraining with the entire training dataset. This two-step HPO method is\nuniversally applicable across HPO search algorithms, and we argue it has\nattractive efficiency gains.\n As a case study, we present our recent application of the two-step HPO method\nto the development of neural network emulators for aerosol activation. Although\nour primary use case is a data-rich limit with many millions of samples, we\nalso find that using up to 0.0025% of the data (a few thousand samples) in the\ninitial step is sufficient to find optimal hyperparameter configurations from\nmuch more extensive sampling, achieving up to 135-times speedup. The benefits\nof this method materialize through an assessment of hyperparameters and model\nperformance, revealing the minimal model complexity required to achieve the\nbest performance. The assortment of top-performing models harvested from the\nHPO process allows us to choose a high-performing model with a low inference\ncost for efficient use in global climate models (GCMs).\n","authors":["Sungduk Yu","Mike Pritchard","Po-Lun Ma","Balwinder Singh","Sam Silva"],"pdf_url":"https://arxiv.org/pdf/2302.03845v2.pdf","comment":null}],"Multimedia":[{"id":"http://arxiv.org/abs/2309.04420v1","updated":"2023-09-08T16:32:47Z","published":"2023-09-08T16:32:47Z","title":"Parallel and Limited Data Voice Conversion Using Stochastic Variational\n Deep Kernel Learning","summary":" Typically, voice conversion is regarded as an engineering problem with\nlimited training data. The reliance on massive amounts of data hinders the\npractical applicability of deep learning approaches, which have been\nextensively researched in recent years. On the other hand, statistical methods\nare effective with limited data but have difficulties in modelling complex\nmapping functions. This paper proposes a voice conversion method that works\nwith limited data and is based on stochastic variational deep kernel learning\n(SVDKL). 
At the same time, SVDKL enables the use of deep neural networks'\nexpressive capability as well as the high flexibility of the Gaussian process\nas a Bayesian and non-parametric method. When the conventional kernel is\ncombined with the deep neural network, it is possible to estimate non-smooth\nand more complex functions. Furthermore, the model's sparse variational\nGaussian process solves the scalability problem and, unlike the exact Gaussian\nprocess, allows for the learning of a global mapping function for the entire\nacoustic space. One of the most important aspects of the proposed scheme is\nthat the model parameters are trained using marginal likelihood optimization,\nwhich considers both data fitting and model complexity. Considering the\ncomplexity of the model reduces the amount of training data by increasing the\nresistance to overfitting. To evaluate the proposed scheme, we examined the\nmodel's performance with approximately 80 seconds of training data. The results\nindicated that our method obtained a higher mean opinion score, smaller\nspectral distortion, and better preference tests than the compared methods.\n","authors":["Mohamadreza Jafaryani","Hamid Sheikhzadeh","Vahid Pourahmadi"],"pdf_url":"https://arxiv.org/pdf/2309.04420v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.04084v1","updated":"2023-09-08T02:50:54Z","published":"2023-09-08T02:50:54Z","title":"Towards Efficient SDRTV-to-HDRTV by Learning from Image Formation","summary":" Modern displays are capable of rendering video content with high dynamic\nrange (HDR) and wide color gamut (WCG). However, the majority of available\nresources are still in standard dynamic range (SDR). As a result, there is\nsignificant value in transforming existing SDR content into the HDRTV standard.\nIn this paper, we define and analyze the SDRTV-to-HDRTV task by modeling the\nformation of SDRTV/HDRTV content. Our analysis and observations indicate that a\nnaive end-to-end supervised training pipeline suffers from severe gamut\ntransition errors. To address this issue, we propose a novel three-step\nsolution pipeline called HDRTVNet++, which includes adaptive global color\nmapping, local enhancement, and highlight refinement. The adaptive global color\nmapping step uses global statistics as guidance to perform image-adaptive color\nmapping. A local enhancement network is then deployed to enhance local details.\nFinally, we combine the two sub-networks above as a generator and achieve\nhighlight consistency through GAN-based joint training. Our method is primarily\ndesigned for ultra-high-definition TV content and is therefore effective and\nlightweight for processing 4K resolution images. We also construct a dataset\nusing HDR videos in the HDR10 standard, named HDRTV1K that contains 1235 and\n117 training images and 117 testing images, all in 4K resolution. Besides, we\nselect five metrics to evaluate the results of SDRTV-to-HDRTV algorithms. Our\nfinal results demonstrate state-of-the-art performance both quantitatively and\nvisually. The code, model and dataset are available at\nhttps://github.com/xiaom233/HDRTVNet-plus.\n","authors":["Xiangyu Chen","Zheyuan Li","Zhengwen Zhang","Jimmy S. 
Ren","Yihao Liu","Jingwen He","Yu Qiao","Jiantao Zhou","Chao Dong"],"pdf_url":"https://arxiv.org/pdf/2309.04084v1.pdf","comment":"Extended version of HDRTVNet"},{"id":"http://arxiv.org/abs/2308.07056v5","updated":"2023-09-08T01:51:10Z","published":"2023-08-14T10:31:29Z","title":"VoxBlink: A Large Scale Speaker Verification Dataset on Camera","summary":" In this paper, we introduce a large-scale and high-quality audio-visual\nspeaker verification dataset, named VoxBlink. We propose an innovative and\nrobust automatic audio-visual data mining pipeline to curate this dataset,\nwhich contains 1.45M utterances from 38K speakers. Due to the inherent nature\nof automated data collection, introducing noisy data is inevitable. Therefore,\nwe also utilize a multi-modal purification to generate a cleaner version of the\nVoxBlink, named VoxBlink-clean, comprising 18K identities and 1.02M utterances.\nIn contrast to the VoxCeleb, the VoxBlink sources from short videos of ordinary\nusers, and the covered scenarios can better align with real-life situations. To\nour best knowledge, the VoxBlink dataset is one of the largest publicly\navailable speaker verification datasets. Leveraging the VoxCeleb and\nVoxBlink-clean datasets together, we employ diverse speaker verification models\nwith multiple architectural backbones to conduct comprehensive experimentation\non the VoxCeleb test sets. Experimental results indicate a substantial\nenhancement in performance-ranging from 12% to 30% relatively-across various\nbackbone architectures upon incorporating the VoxBlink-clean into the training\nprocess. The details of the dataset can be found in http://voxblink.github.io\n","authors":["Yuke Lin","Xiaoyi Qin","Guoqing Zhao","Ming Cheng","Ning Jiang","Haiyang Wu","Ming Li"],"pdf_url":"https://arxiv.org/pdf/2308.07056v5.pdf","comment":"submit to ICASSP2024"},{"id":"http://arxiv.org/abs/2309.04608v1","updated":"2023-09-08T21:51:11Z","published":"2023-09-08T21:51:11Z","title":"Style Generation: Image Synthesis based on Coarsely Matched Texts","summary":" Previous text-to-image synthesis algorithms typically use explicit textual\ninstructions to generate/manipulate images accurately, but they have difficulty\nadapting to guidance in the form of coarsely matched texts. In this work, we\nattempt to stylize an input image using such coarsely matched text as guidance.\nTo tackle this new problem, we introduce a novel task called text-based style\ngeneration and propose a two-stage generative adversarial network: the first\nstage generates the overall image style with a sentence feature, and the second\nstage refines the generated style with a synthetic feature, which is produced\nby a multi-modality style synthesis module. We re-filter one existing dataset\nand collect a new dataset for the task. Extensive experiments and ablation\nstudies are conducted to validate our framework. The practical potential of our\nwork is demonstrated by various applications such as text-image alignment and\nstory visualization. 
Our datasets are published at\nhttps://www.kaggle.com/datasets/mengyaocui/style-generation.\n","authors":["Mengyao Cui","Zhe Zhu","Shao-Ping Lu","Yulu Yang"],"pdf_url":"https://arxiv.org/pdf/2309.04608v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.04549v1","updated":"2023-09-08T18:34:48Z","published":"2023-09-08T18:34:48Z","title":"Poster: Making Edge-assisted LiDAR Perceptions Robust to Lossy Point\n Cloud Compression","summary":" Real-time light detection and ranging (LiDAR) perceptions, e.g., 3D object\ndetection and simultaneous localization and mapping are computationally\nintensive to mobile devices of limited resources and often offloaded on the\nedge. Offloading LiDAR perceptions requires compressing the raw sensor data,\nand lossy compression is used for efficiently reducing the data volume. Lossy\ncompression degrades the quality of LiDAR point clouds, and the perception\nperformance is decreased consequently. In this work, we present an\ninterpolation algorithm improving the quality of a LiDAR point cloud to\nmitigate the perception performance loss due to lossy compression. The\nalgorithm targets the range image (RI) representation of a point cloud and\ninterpolates points at the RI based on depth gradients. Compared to existing\nimage interpolation algorithms, our algorithm shows a better qualitative result\nwhen the point cloud is reconstructed from the interpolated RI. With the\npreliminary results, we also describe the next steps of the current work.\n","authors":["Jin Heo","Gregorie Phillips","Per-Erik Brodin","Ada Gavrilovska"],"pdf_url":"https://arxiv.org/pdf/2309.04549v1.pdf","comment":"extended abstract of 2 pages, 2 figures, 1 table"},{"id":"http://arxiv.org/abs/2309.04548v1","updated":"2023-09-08T18:34:34Z","published":"2023-09-08T18:34:34Z","title":"Poster: Enabling Flexible Edge-assisted XR","summary":" Extended reality (XR) is touted as the next frontier of the digital future.\nXR includes all immersive technologies of augmented reality (AR), virtual\nreality (VR), and mixed reality (MR). XR applications obtain the real-world\ncontext of the user from an underlying system, and provide rich, immersive, and\ninteractive virtual experiences based on the user's context in real-time. XR\nsystems process streams of data from device sensors, and provide\nfunctionalities including perceptions and graphics required by the\napplications. These processing steps are computationally intensive, and the\nchallenge is that they must be performed within the strict latency requirements\nof XR. This poses limitations on the possible XR experiences that can be\nsupported on mobile devices with limited computing resources.\n In this XR context, edge computing is an effective approach to address this\nproblem for mobile users. The edge is located closer to the end users and\nenables processing and storing data near them. In addition, the development of\nhigh bandwidth and low latency network technologies such as 5G facilitates the\napplication of edge computing for latency-critical use cases [4, 11]. 
This work\npresents an XR system for enabling flexible edge-assisted XR.\n","authors":["Jin Heo","Ketan Bhardwaj","Ada Gavrilovska"],"pdf_url":"https://arxiv.org/pdf/2309.04548v1.pdf","comment":"extended abstract of 2 pages, 1 figure, 2 tables"}]},"2023-09-11T00:00:00Z":{"Computation and Language":[{"id":"http://arxiv.org/abs/2309.05660v1","updated":"2023-09-11T17:56:57Z","published":"2023-09-11T17:56:57Z","title":"Hypothesis Search: Inductive Reasoning with Language Models","summary":" Inductive reasoning is a core problem-solving capacity: humans can identify\nunderlying principles from a few examples, which can then be robustly\ngeneralized to novel scenarios. Recent work has evaluated large language models\n(LLMs) on inductive reasoning tasks by directly prompting them yielding \"in\ncontext learning.\" This can work well for straightforward inductive tasks, but\nperforms very poorly on more complex tasks such as the Abstraction and\nReasoning Corpus (ARC). In this work, we propose to improve the inductive\nreasoning ability of LLMs by generating explicit hypotheses at multiple levels\nof abstraction: we prompt the LLM to propose multiple abstract hypotheses about\nthe problem, in natural language, then implement the natural language\nhypotheses as concrete Python programs. These programs can be directly verified\nby running on the observed examples and generalized to novel inputs. Because of\nthe prohibitive cost of generation with state-of-the-art LLMs, we consider a\nmiddle step to filter the set of hypotheses that will be implemented into\nprograms: we either ask the LLM to summarize into a smaller set of hypotheses,\nor ask human annotators to select a subset of the hypotheses. We verify our\npipeline's effectiveness on the ARC visual inductive reasoning benchmark, its\nvariant 1D-ARC, and string transformation dataset SyGuS. On a random 40-problem\nsubset of ARC, our automated pipeline using LLM summaries achieves 27.5%\naccuracy, significantly outperforming the direct prompting baseline (accuracy\nof 12.5%). With the minimal human input of selecting from LLM-generated\ncandidates, the performance is boosted to 37.5%. (And we argue this is a lower\nbound on the performance of our approach without filtering.) Our ablation\nstudies show that abstract hypothesis generation and concrete program\nrepresentations are both beneficial for LLMs to perform inductive reasoning\ntasks.\n","authors":["Ruocheng Wang","Eric Zelikman","Gabriel Poesia","Yewen Pu","Nick Haber","Noah D. Goodman"],"pdf_url":"https://arxiv.org/pdf/2309.05660v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2305.04940v2","updated":"2023-09-11T17:56:10Z","published":"2023-05-08T16:47:28Z","title":"The EarlyBIRD Catches the Bug: On Exploiting Early Layers of Encoder\n Models for More Efficient Code Classification","summary":" The use of modern Natural Language Processing (NLP) techniques has shown to\nbe beneficial for software engineering tasks, such as vulnerability detection\nand type inference. However, training deep NLP models requires significant\ncomputational resources. This paper explores techniques that aim at achieving\nthe best usage of resources and available information in these models.\n We propose a generic approach, EarlyBIRD, to build composite representations\nof code from the early layers of a pre-trained transformer model. 
We\nempirically investigate the viability of this approach on the CodeBERT model by\ncomparing the performance of 12 strategies for creating composite\nrepresentations with the standard practice of only using the last encoder\nlayer.\n Our evaluation on four datasets shows that several early layer combinations\nyield better performance on defect detection, and some combinations improve\nmulti-class classification. More specifically, we obtain a +2 average\nimprovement of detection accuracy on Devign with only 3 out of 12 layers of\nCodeBERT and a 3.3x speed-up of fine-tuning. These findings show that early\nlayers can be used to obtain better results using the same resources, as well\nas to reduce resource usage during fine-tuning and inference.\n","authors":["Anastasiia Grishina","Max Hort","Leon Moonen"],"pdf_url":"https://arxiv.org/pdf/2305.04940v2.pdf","comment":"The content in this pre-print is the same as in the CRC accepted for\n publication in the ACM Joint European Software Engineering Conference and\n Symposium on the Foundations of Software Engineering (ESEC/FSE 2023)"},{"id":"http://arxiv.org/abs/2309.05653v1","updated":"2023-09-11T17:47:22Z","published":"2023-09-11T17:47:22Z","title":"MAmmoTH: Building Math Generalist Models through Hybrid Instruction\n Tuning","summary":" We introduce MAmmoTH, a series of open-source large language models (LLMs)\nspecifically tailored for general math problem-solving. The MAmmoTH models are\ntrained on MathInstruct, our meticulously curated instruction tuning dataset.\nMathInstruct is compiled from 13 math datasets with intermediate rationales,\nsix of which have rationales newly curated by us. It presents a unique hybrid\nof chain-of-thought (CoT) and program-of-thought (PoT) rationales, and also\nensures extensive coverage of diverse fields in math. The hybrid of CoT and PoT\nnot only unleashes the potential of tool use but also allows different thought\nprocesses for different math problems. As a result, the MAmmoTH series\nsubstantially outperform existing open-source models on nine mathematical\nreasoning datasets across all scales with an average accuracy gain between 13%\nand 29%. Remarkably, our MAmmoTH-7B model reaches 35% on MATH (a\ncompetition-level dataset), which exceeds the best open-source 7B model\n(WizardMath) by 25%, and the MAmmoTH-34B model achieves 46% accuracy on MATH,\neven surpassing GPT-4's CoT result. Our work underscores the importance of\ndiverse problem coverage and the use of hybrid rationales in developing\nsuperior math generalist models.\n","authors":["Xiang Yue","Xingwei Qu","Ge Zhang","Yao Fu","Wenhao Huang","Huan Sun","Yu Su","Wenhu Chen"],"pdf_url":"https://arxiv.org/pdf/2309.05653v1.pdf","comment":"Work in progress; Xiang Yue and Wenhu Chen contributed equally to\n this paper"},{"id":"http://arxiv.org/abs/2307.15217v2","updated":"2023-09-11T17:25:24Z","published":"2023-07-27T22:29:25Z","title":"Open Problems and Fundamental Limitations of Reinforcement Learning from\n Human Feedback","summary":" Reinforcement learning from human feedback (RLHF) is a technique for training\nAI systems to align with human goals. RLHF has emerged as the central method\nused to finetune state-of-the-art large language models (LLMs). Despite this\npopularity, there has been relatively little public work systematizing its\nflaws. 
In this paper, we (1) survey open problems and fundamental limitations\nof RLHF and related methods; (2) overview techniques to understand, improve,\nand complement RLHF in practice; and (3) propose auditing and disclosure\nstandards to improve societal oversight of RLHF systems. Our work emphasizes\nthe limitations of RLHF and highlights the importance of a multi-faceted\napproach to the development of safer AI systems.\n","authors":["Stephen Casper","Xander Davies","Claudia Shi","Thomas Krendl Gilbert","Jérémy Scheurer","Javier Rando","Rachel Freedman","Tomasz Korbak","David Lindner","Pedro Freire","Tony Wang","Samuel Marks","Charbel-Raphaël Segerie","Micah Carroll","Andi Peng","Phillip Christoffersen","Mehul Damani","Stewart Slocum","Usman Anwar","Anand Siththaranjan","Max Nadeau","Eric J. Michaud","Jacob Pfau","Dmitrii Krasheninnikov","Xin Chen","Lauro Langosco","Peter Hase","Erdem Bıyık","Anca Dragan","David Krueger","Dorsa Sadigh","Dylan Hadfield-Menell"],"pdf_url":"https://arxiv.org/pdf/2307.15217v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.15772v2","updated":"2023-09-11T17:22:09Z","published":"2023-08-30T05:41:29Z","title":"Task-Based MoE for Multitask Multilingual Machine Translation","summary":" Mixture-of-experts (MoE) architecture has been proven a powerful method for\ndiverse tasks in training deep models in many applications. However, current\nMoE implementations are task agnostic, treating all tokens from different tasks\nin the same manner. In this work, we instead design a novel method that\nincorporates task information into MoE models at different granular levels with\nshared dynamic task-based adapters. Our experiments and analysis show the\nadvantages of our approaches over the dense and canonical MoE models on\nmulti-task multilingual machine translations. With task-specific adapters, our\nmodels can additionally generalize to new tasks efficiently.\n","authors":["Hai Pham","Young Jin Kim","Subhabrata Mukherjee","David P. Woodruff","Barnabas Poczos","Hany Hassan Awadalla"],"pdf_url":"https://arxiv.org/pdf/2308.15772v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.05619v1","updated":"2023-09-11T17:07:01Z","published":"2023-09-11T17:07:01Z","title":"Effective Proxy for Human Labeling: Ensemble Disagreement Scores in\n Large Language Models for Industrial NLP","summary":" Large language models (LLMs) have demonstrated significant capability to\ngeneralize across a large number of NLP tasks. For industry applications, it is\nimperative to assess the performance of the LLM on unlabeled production data\nfrom time to time to validate for a real-world setting. Human labeling to\nassess model error requires considerable expense and time delay. Here we\ndemonstrate that ensemble disagreement scores work well as a proxy for human\nlabeling for language models in zero-shot, few-shot, and fine-tuned settings,\nper our evaluation on keyphrase extraction (KPE) task. We measure fidelity of\nthe results by comparing to true error measured from human labeled ground\ntruth. We contrast with the alternative of using another LLM as a source of\nmachine labels, or silver labels. 
Results across various languages and domains\nshow disagreement scores provide a better estimation of model performance with\nmean average error (MAE) as low as 0.4% and on average 13.8% better than using\nsilver labels.\n","authors":["Wei Du","Laksh Advani","Yashmeet Gambhir","Daniel J Perry","Prashant Shiralkar","Zhengzheng Xing","Aaron Colak"],"pdf_url":"https://arxiv.org/pdf/2309.05619v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2210.17546v3","updated":"2023-09-11T16:58:48Z","published":"2022-10-31T17:57:55Z","title":"Preventing Verbatim Memorization in Language Models Gives a False Sense\n of Privacy","summary":" Studying data memorization in neural language models helps us understand the\nrisks (e.g., to privacy or copyright) associated with models regurgitating\ntraining data and aids in the development of countermeasures. Many prior works\n-- and some recently deployed defenses -- focus on \"verbatim memorization\",\ndefined as a model generation that exactly matches a substring from the\ntraining set. We argue that verbatim memorization definitions are too\nrestrictive and fail to capture more subtle forms of memorization.\nSpecifically, we design and implement an efficient defense that perfectly\nprevents all verbatim memorization. And yet, we demonstrate that this \"perfect\"\nfilter does not prevent the leakage of training data. Indeed, it is easily\ncircumvented by plausible and minimally modified \"style-transfer\" prompts --\nand in some cases even the non-modified original prompts -- to extract\nmemorized information. We conclude by discussing potential alternative\ndefinitions and why defining memorization is a difficult yet crucial open\nquestion for neural language models.\n","authors":["Daphne Ippolito","Florian Tramèr","Milad Nasr","Chiyuan Zhang","Matthew Jagielski","Katherine Lee","Christopher A. Choquette-Choo","Nicholas Carlini"],"pdf_url":"https://arxiv.org/pdf/2210.17546v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.05608v1","updated":"2023-09-11T16:47:01Z","published":"2023-09-11T16:47:01Z","title":"Incorporating Pre-trained Model Prompting in Multimodal Stock Volume\n Movement Prediction","summary":" Multimodal stock trading volume movement prediction with stock-related news\nis one of the fundamental problems in the financial area. Existing multimodal\nworks that train models from scratch face the problem of lacking universal\nknowledge when modeling financial news. In addition, the models ability may be\nlimited by the lack of domain-related knowledge due to insufficient data in the\ndatasets. To handle this issue, we propose the Prompt-based MUltimodal Stock\nvolumE prediction model (ProMUSE) to process text and time series modalities.\nWe use pre-trained language models for better comprehension of financial news\nand adopt prompt learning methods to leverage their capability in universal\nknowledge to model textual information. Besides, simply fusing two modalities\ncan cause harm to the unimodal representations. Thus, we propose a novel\ncross-modality contrastive alignment while reserving the unimodal heads beside\nthe fusion head to mitigate this problem. Extensive experiments demonstrate\nthat our proposed ProMUSE outperforms existing baselines. 
Comprehensive\nanalyses further validate the effectiveness of our architecture compared to\npotential variants and learning mechanisms.\n","authors":["Ruibo Chen","Zhiyuan Zhang","Yi Liu","Ruihan Bao","Keiko Harimoto","Xu Sun"],"pdf_url":"https://arxiv.org/pdf/2309.05608v1.pdf","comment":"9 pages, 3 figures, 7 tables. Accepted by 2023 KDD Workshop on\n Machine Learning in Finance"},{"id":"http://arxiv.org/abs/2309.05605v1","updated":"2023-09-11T16:39:30Z","published":"2023-09-11T16:39:30Z","title":"Memory Injections: Correcting Multi-Hop Reasoning Failures during\n Inference in Transformer-Based Language Models","summary":" Answering multi-hop reasoning questions requires retrieving and synthesizing\ninformation from diverse sources. Large Language Models (LLMs) struggle to\nperform such reasoning consistently. Here we propose an approach to pinpoint\nand rectify multi-hop reasoning failures through targeted memory injections on\nLLM attention heads. First, we analyze the per-layer activations of GPT-2\nmodels in response to single and multi-hop prompts. We then propose a mechanism\nthat allows users to inject pertinent prompt-specific information, which we\nrefer to as \"memories,\" at critical LLM locations during inference. By thus\nenabling the LLM to incorporate additional relevant information during\ninference, we enhance the quality of multi-hop prompt completions. We show\nempirically that a simple, efficient, and targeted memory injection into a key\nattention layer can often increase the probability of the desired next token in\nmulti-hop tasks, by up to 424%.\n","authors":["Mansi Sakarvadia","Aswathy Ajith","Arham Khan","Daniel Grzenda","Nathaniel Hudson","André Bauer","Kyle Chard","Ian Foster"],"pdf_url":"https://arxiv.org/pdf/2309.05605v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.05569v1","updated":"2023-09-11T15:54:30Z","published":"2023-09-11T15:54:30Z","title":"ITI-GEN: Inclusive Text-to-Image Generation","summary":" Text-to-image generative models often reflect the biases of the training\ndata, leading to unequal representations of underrepresented groups. This study\ninvestigates inclusive text-to-image generative models that generate images\nbased on human-written prompts and ensure the resulting images are uniformly\ndistributed across attributes of interest. Unfortunately, directly expressing\nthe desired attributes in the prompt often leads to sub-optimal results due to\nlinguistic ambiguity or model misrepresentation. Hence, this paper proposes a\ndrastically different approach that adheres to the maxim that \"a picture is\nworth a thousand words\". We show that, for some attributes, images can\nrepresent concepts more expressively than text. For instance, categories of\nskin tones are typically hard to specify by text but can be easily represented\nby example images. Building upon these insights, we propose a novel approach,\nITI-GEN, that leverages readily available reference images for Inclusive\nText-to-Image GENeration. The key idea is learning a set of prompt embeddings\nto generate images that can effectively represent all desired attribute\ncategories. More importantly, ITI-GEN requires no model fine-tuning, making it\ncomputationally efficient to augment existing text-to-image models. Extensive\nexperiments demonstrate that ITI-GEN largely improves over state-of-the-art\nmodels to generate inclusive images from a prompt. 
Project page:\nhttps://czhang0528.github.io/iti-gen.\n","authors":["Cheng Zhang","Xuanbai Chen","Siqi Chai","Chen Henry Wu","Dmitry Lagun","Thabo Beeler","Fernando De la Torre"],"pdf_url":"https://arxiv.org/pdf/2309.05569v1.pdf","comment":"Accepted to ICCV 2023 (Oral Presentation)"},{"id":"http://arxiv.org/abs/2309.05557v1","updated":"2023-09-11T15:45:40Z","published":"2023-09-11T15:45:40Z","title":"An Empirical Study of NetOps Capability of Pre-Trained Large Language\n Models","summary":" Large language models (LLMs) can respond to human language queries and have\nshown powerful potential applications in network operations (NetOps). Thanks to\nthe large amount of commonsense knowledge inherent, LLMs achieve much better\ninference accuracy than traditional models and emerge with strong abilities in\ngeneralization, reasoning, and code generation. These abilities may have a\ncrucial boost to automated and intelligent NetOps. However, it remains\nunder-explored how well LLMs perform in various NetOps tasks. In this work, we\nmake a systematic assessment of the capabilities, strengths, and limitations of\nselected LLMs in the field of NetOps. The evaluation is conducted on a\ncollection of 5,732 questions about NetOps, encompassing 26 publicly available\ngeneral-domain LLMs, including ChatGPT, LLaMA, Falcon, etc. We also finetune\nsome of these LLMs with our collected NetOps corpus and evaluate the resulting\nmodels. The evaluation method follows the widely adopted benchmarks for\ngeneral-domain LLMs, combined with Chain-of-Thought Prompts and\nRetrieval-Augmented Generation. The results show that only GPT-4 achieves high\naccuracy equivalent to passing the NetOps certification exam for humans, while\nall the other LLMs have much lower accuracy. However, some open models like\nLLaMA 2 still demonstrate significant potential. Furthermore, we evaluate the\nimpact of factors such as model parameters, prompt engineering, instruction\nfine-tuning etc. This work shall be treated as the initial effort to systematic\nevaluation of LLMs in NetOps, and a more rigorous study is required for\nproduction use. The evaluation code and dataset will be released to benefit\nfuture research.\n","authors":["Yukai Miao","Yu Bai","Li Chen","Dan Li","Haifeng Sun","Xizheng Wang","Ziqiu Luo","Dapeng Sun","Xiuting Xu"],"pdf_url":"https://arxiv.org/pdf/2309.05557v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2107.04553v2","updated":"2023-09-11T15:37:33Z","published":"2021-07-09T17:11:54Z","title":"Can Deep Neural Networks Predict Data Correlations from Column Names?","summary":" Recent publications suggest using natural language analysis on database\nschema elements to guide tuning and profiling efforts. The underlying\nhypothesis is that state-of-the-art language processing methods, so-called\nlanguage models, are able to extract information on data properties from schema\ntext.\n This paper examines that hypothesis in the context of data correlation\nanalysis: is it possible to find column pairs with correlated data by analyzing\ntheir names via language models? First, the paper introduces a novel benchmark\nfor data correlation analysis, created by analyzing thousands of Kaggle data\nsets (and available for download). Second, it uses that data to study the\nability of language models to predict correlation, based on column names. The\nanalysis covers different language models, various correlation metrics, and a\nmultitude of accuracy metrics. 
It pinpoints factors that contribute to\nsuccessful predictions, such as the length of column names as well as the ratio\nof words. Finally, \\rev{the study analyzes the impact of column types on\nprediction performance.} The results show that schema text can be a useful\nsource of information and inform future research efforts, targeted at\nNLP-enhanced database tuning and data profiling.\n","authors":["Immanuel Trummer"],"pdf_url":"https://arxiv.org/pdf/2107.04553v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.05542v1","updated":"2023-09-11T15:27:59Z","published":"2023-09-11T15:27:59Z","title":"Kani: A Lightweight and Highly Hackable Framework for Building Language\n Model Applications","summary":" Language model applications are becoming increasingly popular and complex,\noften including features like tool usage and retrieval augmentation. However,\nexisting frameworks for such applications are often opinionated, deciding for\ndevelopers how their prompts ought to be formatted and imposing limitations on\ncustomizability and reproducibility. To solve this we present Kani: a\nlightweight, flexible, and model-agnostic open-source framework for building\nlanguage model applications. Kani helps developers implement a variety of\ncomplex features by supporting the core building blocks of chat interaction:\nmodel interfacing, chat management, and robust function calling. All Kani core\nfunctions are easily overridable and well documented to empower developers to\ncustomize functionality for their own needs. Kani thus serves as a useful tool\nfor researchers, hobbyists, and industry professionals alike to accelerate\ntheir development while retaining interoperability and fine-grained control.\n","authors":["Andrew Zhu","Liam Dugan","Alyssa Hwang","Chris Callison-Burch"],"pdf_url":"https://arxiv.org/pdf/2309.05542v1.pdf","comment":"In submission to NLP-OSS"},{"id":"http://arxiv.org/abs/2309.05534v1","updated":"2023-09-11T15:18:28Z","published":"2023-09-11T15:18:28Z","title":"PAI-Diffusion: Constructing and Serving a Family of Open Chinese\n Diffusion Models for Text-to-image Synthesis on the Cloud","summary":" Text-to-image synthesis for the Chinese language poses unique challenges due\nto its large vocabulary size, and intricate character relationships. While\nexisting diffusion models have shown promise in generating images from textual\ndescriptions, they often neglect domain-specific contexts and lack robustness\nin handling the Chinese language. This paper introduces PAI-Diffusion, a\ncomprehensive framework that addresses these limitations. PAI-Diffusion\nincorporates both general and domain-specific Chinese diffusion models,\nenabling the generation of contextually relevant images. It explores the\npotential of using LoRA and ControlNet for fine-grained image style transfer\nand image editing, empowering users with enhanced control over image\ngeneration. Moreover, PAI-Diffusion seamlessly integrates with Alibaba Cloud's\nMachine Learning Platform for AI, providing accessible and scalable solutions.\nAll the Chinese diffusion model checkpoints, LoRAs, and ControlNets, including\ndomain-specific ones, are publicly available. 
A user-friendly Chinese WebUI and\nthe diffusers-api elastic inference toolkit, also open-sourced, further\nfacilitate the easy deployment of PAI-Diffusion models in various environments,\nmaking it a valuable resource for Chinese text-to-image synthesis.\n","authors":["Chengyu Wang","Zhongjie Duan","Bingyan Liu","Xinyi Zou","Cen Chen","Kui Jia","Jun Huang"],"pdf_url":"https://arxiv.org/pdf/2309.05534v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.05519v1","updated":"2023-09-11T15:02:25Z","published":"2023-09-11T15:02:25Z","title":"NExT-GPT: Any-to-Any Multimodal LLM","summary":" While recently Multimodal Large Language Models (MM-LLMs) have made exciting\nstrides, they mostly fall prey to the limitation of only input-side multimodal\nunderstanding, without the ability to produce content in multiple modalities.\nAs we humans always perceive the world and communicate with people through\nvarious modalities, developing any-to-any MM-LLMs capable of accepting and\ndelivering content in any modality becomes essential to human-level AI. To fill\nthe gap, we present an end-to-end general-purpose any-to-any MM-LLM system,\nNExT-GPT. We connect an LLM with multimodal adaptors and different diffusion\ndecoders, enabling NExT-GPT to perceive inputs and generate outputs in\narbitrary combinations of text, images, videos, and audio. By leveraging the\nexisting well-trained highly-performing encoders and decoders, NExT-GPT is\ntuned with only a small amount of parameter (1%) of certain projection layers,\nwhich not only benefits low-cost training and also facilitates convenient\nexpansion to more potential modalities. Moreover, we introduce a\nmodality-switching instruction tuning (MosIT) and manually curate a\nhigh-quality dataset for MosIT, based on which NExT-GPT is empowered with\ncomplex cross-modal semantic understanding and content generation. Overall, our\nresearch showcases the promising possibility of building an AI agent capable of\nmodeling universal modalities, paving the way for more human-like AI research\nin the community.\n","authors":["Shengqiong Wu","Hao Fei","Leigang Qu","Wei Ji","Tat-Seng Chua"],"pdf_url":"https://arxiv.org/pdf/2309.05519v1.pdf","comment":"work in progress"},{"id":"http://arxiv.org/abs/2309.05516v1","updated":"2023-09-11T14:58:23Z","published":"2023-09-11T14:58:23Z","title":"Optimize Weight Rounding via Signed Gradient Descent for the\n Quantization of LLMs","summary":" Large Language Models (LLMs) have proven their exceptional capabilities in\nperforming language-related tasks. However, their deployment poses significant\nchallenges due to their considerable memory and storage requirements. In\nresponse to this issue, weight-only quantization, particularly 3 and 4-bit\nweight-only quantization, has emerged as one of the most viable solutions. As\nthe number of bits decreases, the quantization grid broadens, thus emphasizing\nthe importance of up and down rounding. While previous studies have\ndemonstrated that fine-tuning up and down rounding with the addition of\nperturbations can enhance accuracy in some scenarios, our study is driven by\nthe precise and limited boundary of these perturbations, where only the\nthreshold for altering the rounding value is of significance. Consequently, we\npropose a concise and highly effective approach for optimizing the weight\nrounding task. Our method, named SignRound, involves lightweight block-wise\ntuning using signed gradient descent, enabling us to achieve outstanding\nresults within 400 steps. 
SignRound outperforms the established baseline of\nrounding-to-nearest (RTN) and competes impressively against recent methods,\nwithout introducing additional inference overhead. The source code will be\npublicly available at https://github.com/intel/neural-compressor soon.\n","authors":["Wenhua Cheng","Weiwei Zhang","Haihao Shen","Yiyang Cai","Xin He","Kaokao Lv"],"pdf_url":"https://arxiv.org/pdf/2309.05516v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.05503v1","updated":"2023-09-11T14:45:24Z","published":"2023-09-11T14:45:24Z","title":"Long-Range Transformer Architectures for Document Understanding","summary":" Since their release, Transformers have revolutionized many fields from\nNatural Language Understanding to Computer Vision. Document Understanding (DU)\nwas not left behind with first Transformer based models for DU dating from late\n2019. However, the computational complexity of the self-attention operation\nlimits their capabilities to small sequences. In this paper we explore multiple\nstrategies to apply Transformer based models to long multi-page documents. We\nintroduce 2 new multi-modal (text + layout) long-range models for DU. They are\nbased on efficient implementations of Transformers for long sequences.\nLong-range models can process whole documents at once effectively and are less\nimpaired by the document's length. We compare them to LayoutLM, a classical\nTransformer adapted for DU and pre-trained on millions of documents. We further\npropose 2D relative attention bias to guide self-attention towards relevant\ntokens without harming model efficiency. We observe improvements on multi-page\nbusiness documents on Information Retrieval for a small performance cost on\nsmaller sequences. Relative 2D attention revealed to be effective on dense text\nfor both normal and long-range models.\n","authors":["Thibault Douzon","Stefan Duffner","Christophe Garcia","Jérémy Espinas"],"pdf_url":"https://arxiv.org/pdf/2309.05503v1.pdf","comment":"Conference: ICDAR 2023 Workshops on Document Analysis and Recognition"},{"id":"http://arxiv.org/abs/2309.05501v1","updated":"2023-09-11T14:43:54Z","published":"2023-09-11T14:43:54Z","title":"Black-Box Analysis: GPTs Across Time in Legal Textual Entailment Task","summary":" The evolution of Generative Pre-trained Transformer (GPT) models has led to\nsignificant advancements in various natural language processing applications,\nparticularly in legal textual entailment. We present an analysis of GPT-3.5\n(ChatGPT) and GPT-4 performances on COLIEE Task 4 dataset, a prominent\nbenchmark in this domain. The study encompasses data from Heisei 18 (2006) to\nReiwa 3 (2021), exploring the models' abilities to discern entailment\nrelationships within Japanese statute law across different periods. Our\npreliminary experimental results unveil intriguing insights into the models'\nstrengths and weaknesses in handling legal textual entailment tasks, as well as\nthe patterns observed in model performance. In the context of proprietary\nmodels with undisclosed architectures and weights, black-box analysis becomes\ncrucial for evaluating their capabilities. We discuss the influence of training\ndata distribution and the implications on the models' generalizability. 
This\nanalysis serves as a foundation for future research, aiming to optimize\nGPT-based models and enable their successful adoption in legal information\nextraction and entailment applications.\n","authors":["Ha-Thanh Nguyen","Randy Goebel","Francesca Toni","Kostas Stathis","Ken Satoh"],"pdf_url":"https://arxiv.org/pdf/2309.05501v1.pdf","comment":"ISAILD@KSE 2023"},{"id":"http://arxiv.org/abs/2309.05500v1","updated":"2023-09-11T14:43:45Z","published":"2023-09-11T14:43:45Z","title":"NeCo@ALQAC 2023: Legal Domain Knowledge Acquisition for Low-Resource\n Languages through Data Enrichment","summary":" In recent years, natural language processing has gained significant\npopularity in various sectors, including the legal domain. This paper presents\nNeCo Team's solutions to the Vietnamese text processing tasks provided in the\nAutomated Legal Question Answering Competition 2023 (ALQAC 2023), focusing on\nlegal domain knowledge acquisition for low-resource languages through data\nenrichment. Our methods for the legal document retrieval task employ a\ncombination of similarity ranking and deep learning models, while for the\nsecond task, which requires extracting an answer from a relevant legal article\nin response to a question, we propose a range of adaptive techniques to handle\ndifferent question types. Our approaches achieve outstanding results on both\ntasks of the competition, demonstrating the potential benefits and\neffectiveness of question answering systems in the legal field, particularly\nfor low-resource languages.\n","authors":["Hai-Long Nguyen","Dieu-Quynh Nguyen","Hoang-Trung Nguyen","Thu-Trang Pham","Huu-Dong Nguyen","Thach-Anh Nguyen","Thi-Hai-Yen Vuong","Ha-Thanh Nguyen"],"pdf_url":"https://arxiv.org/pdf/2309.05500v1.pdf","comment":"ISAILD@KSE 2023"},{"id":"http://arxiv.org/abs/2309.05497v1","updated":"2023-09-11T14:39:04Z","published":"2023-09-11T14:39:04Z","title":"Personality Detection and Analysis using Twitter Data","summary":" Personality types are important in various fields as they hold relevant\ninformation about the characteristics of a human being in an explainable\nformat. They are often good predictors of a person's behaviors in a particular\nenvironment and have applications ranging from candidate selection to marketing\nand mental health. Recently automatic detection of personality traits from\ntexts has gained significant attention in computational linguistics. Most\npersonality detection and analysis methods have focused on small datasets\nmaking their experimental observations often limited. To bridge this gap, we\nfocus on collecting and releasing the largest automatically curated dataset for\nthe research community which has 152 million tweets and 56 thousand data points\nfor the Myers-Briggs personality type (MBTI) prediction task. We perform a\nseries of extensive qualitative and quantitative studies on our dataset to\nanalyze the data patterns in a better way and infer conclusions. We show how\nour intriguing analysis results often follow natural intuition. 
We also perform\na series of ablation studies to show how the baselines perform for our dataset.\n","authors":["Abhilash Datta","Souvic Chakraborty","Animesh Mukherjee"],"pdf_url":"https://arxiv.org/pdf/2309.05497v1.pdf","comment":"Submitted to ASONAM 2023"},{"id":"http://arxiv.org/abs/2309.05494v1","updated":"2023-09-11T14:36:16Z","published":"2023-09-11T14:36:16Z","title":"CrisisTransformers: Pre-trained language models and sentence encoders\n for crisis-related social media texts","summary":" Social media platforms play an essential role in crisis communication, but\nanalyzing crisis-related social media texts is challenging due to their\ninformal nature. Transformer-based pre-trained models like BERT and RoBERTa\nhave shown success in various NLP tasks, but they are not tailored for\ncrisis-related texts. Furthermore, general-purpose sentence encoders are used\nto generate sentence embeddings, regardless of the textual complexities in\ncrisis-related texts. Advances in applications like text classification,\nsemantic search, and clustering contribute to effective processing of\ncrisis-related texts, which is essential for emergency responders to gain a\ncomprehensive view of a crisis event, whether historical or real-time. To\naddress these gaps in crisis informatics literature, this study introduces\nCrisisTransformers, an ensemble of pre-trained language models and sentence\nencoders trained on an extensive corpus of over 15 billion word tokens from\ntweets associated with more than 30 crisis events, including disease outbreaks,\nnatural disasters, conflicts, and other critical incidents. We evaluate\nexisting models and CrisisTransformers on 18 crisis-specific public datasets.\nOur pre-trained models outperform strong baselines across all datasets in\nclassification tasks, and our best-performing sentence encoder improves the\nstate-of-the-art by 17.43% in sentence encoding tasks. Additionally, we\ninvestigate the impact of model initialization on convergence and evaluate the\nsignificance of domain-specific models in generating semantically meaningful\nsentence embeddings. All models are publicly released\n(https://huggingface.co/crisistransformers), with the anticipation that they\nwill serve as a robust baseline for tasks involving the analysis of\ncrisis-related social media texts.\n","authors":["Rabindra Lamsal","Maria Rodriguez Read","Shanika Karunasekera"],"pdf_url":"https://arxiv.org/pdf/2309.05494v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.05475v1","updated":"2023-09-11T14:16:27Z","published":"2023-09-11T14:16:27Z","title":"Zero-shot Learning with Minimum Instruction to Extract Social\n Determinants and Family History from Clinical Notes using GPT Model","summary":" Demographics, Social determinants of health, and family history documented in\nthe unstructured text within the electronic health records are increasingly\nbeing studied to understand how this information can be utilized with the\nstructured data to improve healthcare outcomes. After the GPT models were\nreleased, many studies have applied GPT models to extract this information from\nthe narrative clinical notes. Different from the existing work, our research\nfocuses on investigating the zero-shot learning on extracting this information\ntogether by providing minimum information to the GPT model. We utilize\nde-identified real-world clinical notes annotated for demographics, various\nsocial determinants, and family history information. 
Given that the GPT model\nmight provide text different from the text in the original data, we explore two\nsets of evaluation metrics, including the traditional NER evaluation metrics\nand semantic similarity evaluation metrics, to completely understand the\nperformance. Our results show that the GPT-3.5 method achieved an average of\n0.975 F1 on demographics extraction, 0.615 F1 on social determinants\nextraction, and 0.722 F1 on family history extraction. We believe these results\ncan be further improved through model fine-tuning or few-shots learning.\nThrough the case studies, we also identified the limitations of the GPT models,\nwhich need to be addressed in future research.\n","authors":["Neel Jitesh Bhate","Ansh Mittal","Zhe He","Xiao Luo"],"pdf_url":"https://arxiv.org/pdf/2309.05475v1.pdf","comment":"5 pages, 4 figures"},{"id":"http://arxiv.org/abs/2309.05472v1","updated":"2023-09-11T14:13:09Z","published":"2023-09-11T14:13:09Z","title":"LeBenchmark 2.0: a Standardized, Replicable and Enhanced Framework for\n Self-supervised Representations of French Speech","summary":" Self-supervised learning (SSL) is at the origin of unprecedented improvements\nin many different domains including computer vision and natural language\nprocessing. Speech processing drastically benefitted from SSL as most of the\ncurrent domain-related tasks are now being approached with pre-trained models.\nThis work introduces LeBenchmark 2.0 an open-source framework for assessing and\nbuilding SSL-equipped French speech technologies. It includes documented,\nlarge-scale and heterogeneous corpora with up to 14,000 hours of heterogeneous\nspeech, ten pre-trained SSL wav2vec 2.0 models containing from 26 million to\none billion learnable parameters shared with the community, and an evaluation\nprotocol made of six downstream tasks to complement existing benchmarks.\nLeBenchmark 2.0 also presents unique perspectives on pre-trained SSL models for\nspeech with the investigation of frozen versus fine-tuned downstream models,\ntask-agnostic versus task-specific pre-trained models as well as a discussion\non the carbon footprint of large-scale model training.\n","authors":["Titouan Parcollet","Ha Nguyen","Solene Evain","Marcely Zanon Boito","Adrien Pupier","Salima Mdhaffar","Hang Le","Sina Alisamir","Natalia Tomashenko","Marco Dinarelli","Shucong Zhang","Alexandre Allauzen","Maximin Coavoux","Yannick Esteve","Mickael Rouvier","Jerome Goulian","Benjamin Lecouteux","Francois Portet","Solange Rossato","Fabien Ringeval","Didier Schwab","Laurent Besacier"],"pdf_url":"https://arxiv.org/pdf/2309.05472v1.pdf","comment":"Under submission at Computer Science and Language. Preprint allowed"},{"id":"http://arxiv.org/abs/2307.15453v2","updated":"2023-09-11T14:06:27Z","published":"2023-07-28T10:11:01Z","title":"From Probabilistic Programming to Complexity-based Programming","summary":" The paper presents the main characteristics and a preliminary implementation\nof a novel computational framework named CompLog. Inspired by probabilistic\nprogramming systems like ProbLog, CompLog builds upon the inferential\nmechanisms proposed by Simplicity Theory, relying on the computation of two\nKolmogorov complexities (here implemented as min-path searches via ASP\nprograms) rather than probabilistic inference. The proposed system enables\nusers to compute ex-post and ex-ante measures of unexpectedness of a certain\nsituation, mapping respectively to posterior and prior subjective\nprobabilities. 
The computation is based on the specification of world and\nmental models by means of causal and descriptive relations between predicates\nweighted by complexity. The paper illustrates a few examples of application:\ngenerating relevant descriptions, and providing alternative approaches to\ndisjunction and to negation.\n","authors":["Giovanni Sileno","Jean-Louis Dessalles"],"pdf_url":"https://arxiv.org/pdf/2307.15453v2.pdf","comment":"paper accepted at HYDRA workshop at ECAI 2023"},{"id":"http://arxiv.org/abs/2309.05463v1","updated":"2023-09-11T14:01:45Z","published":"2023-09-11T14:01:45Z","title":"Textbooks Are All You Need II: phi-1.5 technical report","summary":" We continue the investigation into the power of smaller Transformer-based\nlanguage models as initiated by \\textbf{TinyStories} -- a 10 million parameter\nmodel that can produce coherent English -- and the follow-up work on\n\\textbf{phi-1}, a 1.3 billion parameter model with Python coding performance\nclose to the state-of-the-art. The latter work proposed to use existing Large\nLanguage Models (LLMs) to generate ``textbook quality\" data as a way to enhance\nthe learning process compared to traditional web data. We follow the\n``Textbooks Are All You Need\" approach, focusing this time on common sense\nreasoning in natural language, and create a new 1.3 billion parameter model\nnamed \\textbf{phi-1.5}, with performance on natural language tasks comparable\nto models 5x larger, and surpassing most non-frontier LLMs on more complex\nreasoning tasks such as grade-school mathematics and basic coding. More\ngenerally, \\textbf{phi-1.5} exhibits many of the traits of much larger LLMs,\nboth good -- such as the ability to ``think step by step\" or perform some\nrudimentary in-context learning -- and bad, including hallucinations and the\npotential for toxic and biased generations -- encouragingly though, we are\nseeing improvement on that front thanks to the absence of web data. We\nopen-source \\textbf{phi-1.5} to promote further research on these urgent\ntopics.\n","authors":["Yuanzhi Li","Sébastien Bubeck","Ronen Eldan","Allie Del Giorno","Suriya Gunasekar","Yin Tat Lee"],"pdf_url":"https://arxiv.org/pdf/2309.05463v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.05454v1","updated":"2023-09-11T13:50:38Z","published":"2023-09-11T13:50:38Z","title":"Flesch or Fumble? Evaluating Readability Standard Alignment of\n Instruction-Tuned Language Models","summary":" Readability metrics and standards such as Flesch Kincaid Grade Level (FKGL)\nand the Common European Framework of Reference for Languages (CEFR) exist to\nguide teachers and educators to properly assess the complexity of educational\nmaterials before administering them for classroom use. In this study, we select\na diverse set of open and closed-source instruction-tuned language models and\ninvestigate their performances in writing story completions and simplifying\nnarratives$-$tasks that teachers perform$-$using standard-guided prompts\ncontrolling text readability. 
Our extensive findings provide empirical proof of\nhow globally recognized models like ChatGPT may be considered less effective\nand may require more refined prompts for these generative tasks compared to\nother open-sourced models such as BLOOMZ and FlanT5$-$which have shown\npromising results.\n","authors":["Joseph Marvin Imperial","Harish Tayyar Madabushi"],"pdf_url":"https://arxiv.org/pdf/2309.05454v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.05452v1","updated":"2023-09-11T13:47:07Z","published":"2023-09-11T13:47:07Z","title":"Evaluating the Deductive Competence of Large Language Models","summary":" The development of highly fluent large language models (LLMs) has prompted\nincreased interest in assessing their reasoning and problem-solving\ncapabilities. We investigate whether several LLMs can solve a classic type of\ndeductive reasoning problem from the cognitive science literature. The tested\nLLMs have limited abilities to solve these problems in their conventional form.\nWe performed follow up experiments to investigate if changes to the\npresentation format and content improve model performance. We do find\nperformance differences between conditions; however, they do not improve\noverall performance. Moreover, we find that performance interacts with\npresentation format and content in unexpected ways that differ from human\nperformance. Overall, our results suggest that LLMs have unique reasoning\nbiases that are only partially predicted from human reasoning performance.\n","authors":["S. M. Seals","Valerie L. Shalin"],"pdf_url":"https://arxiv.org/pdf/2309.05452v1.pdf","comment":"7 pages, 7 figures, under review"},{"id":"http://arxiv.org/abs/2309.05448v1","updated":"2023-09-11T13:41:27Z","published":"2023-09-11T13:41:27Z","title":"Panoptic Vision-Language Feature Fields","summary":" Recently, methods have been proposed for 3D open-vocabulary semantic\nsegmentation. Such methods are able to segment scenes into arbitrary classes\ngiven at run-time using their text description. In this paper, we propose to\nour knowledge the first algorithm for open-vocabulary panoptic segmentation,\nsimultaneously performing both semantic and instance segmentation. Our\nalgorithm, Panoptic Vision-Language Feature Fields (PVLFF) learns a feature\nfield of the scene, jointly learning vision-language features and hierarchical\ninstance features through a contrastive loss function from 2D instance segment\nproposals on input frames. Our method achieves comparable performance against\nthe state-of-the-art close-set 3D panoptic systems on the HyperSim, ScanNet and\nReplica dataset and outperforms current 3D open-vocabulary systems in terms of\nsemantic segmentation. We additionally ablate our method to demonstrate the\neffectiveness of our model architecture. Our code will be available at\nhttps://github.com/ethz-asl/autolabel.\n","authors":["Haoran Chen","Kenneth Blomqvist","Francesco Milano","Roland Siegwart"],"pdf_url":"https://arxiv.org/pdf/2309.05448v1.pdf","comment":"This work has been submitted to the IEEE for possible publication.\n Copyright may be transferred without notice, after which this version may no\n longer be accessible"},{"id":"http://arxiv.org/abs/2309.05447v1","updated":"2023-09-11T13:41:18Z","published":"2023-09-11T13:41:18Z","title":"TeGit: Generating High-Quality Instruction-Tuning Data with\n Text-Grounded Task Design","summary":" High-quality instruction-tuning data is critical to improving LLM\ncapabilities. 
Existing data collection methods are limited by unrealistic\nmanual labeling costs or by the hallucination of relying solely on LLM\ngeneration. To address the problems, this paper presents a scalable method to\nautomatically collect high-quality instructional adaptation data by training\nlanguage models to automatically design tasks based on human-written texts.\nIntuitively, human-written text helps to help the model attenuate illusions\nduring the generation of tasks. Unlike instruction back-translation-based\nmethods that directly take the given text as a response, we require the model\nto generate the \\textit{instruction}, \\textit{input}, and \\textit{output}\nsimultaneously to filter the noise. The results of the automated and manual\nevaluation experiments demonstrate the quality of our dataset.\n","authors":["Yongrui Chen","Haiyun Jiang","Xinting Huang","Shuming Shi","Guilin Qi"],"pdf_url":"https://arxiv.org/pdf/2309.05447v1.pdf","comment":"Work in progress"},{"id":"http://arxiv.org/abs/2309.05444v1","updated":"2023-09-11T13:31:00Z","published":"2023-09-11T13:31:00Z","title":"Pushing Mixture of Experts to the Limit: Extremely Parameter Efficient\n MoE for Instruction Tuning","summary":" The Mixture of Experts (MoE) is a widely known neural architecture where an\nensemble of specialized sub-models optimizes overall performance with a\nconstant computational cost. However, conventional MoEs pose challenges at\nscale due to the need to store all experts in memory. In this paper, we push\nMoE to the limit. We propose extremely parameter-efficient MoE by uniquely\ncombining MoE architecture with lightweight experts.Our MoE architecture\noutperforms standard parameter-efficient fine-tuning (PEFT) methods and is on\npar with full fine-tuning by only updating the lightweight experts -- less than\n1% of an 11B parameters model. Furthermore, our method generalizes to unseen\ntasks as it does not depend on any prior task knowledge. Our research\nunderscores the versatility of the mixture of experts architecture, showcasing\nits ability to deliver robust performance even when subjected to rigorous\nparameter constraints. Our code used in all the experiments is publicly\navailable here: https://github.com/for-ai/parameter-efficient-moe.\n","authors":["Ted Zadouri","Ahmet Üstün","Arash Ahmadian","Beyza Ermiş","Acyr Locatelli","Sara Hooker"],"pdf_url":"https://arxiv.org/pdf/2309.05444v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.05429v1","updated":"2023-09-11T13:05:23Z","published":"2023-09-11T13:05:23Z","title":"Improving Information Extraction on Business Documents with Specific\n Pre-Training Tasks","summary":" Transformer-based Language Models are widely used in Natural Language\nProcessing related tasks. Thanks to their pre-training, they have been\nsuccessfully adapted to Information Extraction in business documents. However,\nmost pre-training tasks proposed in the literature for business documents are\ntoo generic and not sufficient to learn more complex structures. In this paper,\nwe use LayoutLM, a language model pre-trained on a collection of business\ndocuments, and introduce two new pre-training tasks that further improve its\ncapacity to extract relevant information. The first is aimed at better\nunderstanding the complex layout of documents, and the second focuses on\nnumeric values and their order of magnitude. These tasks force the model to\nlearn better-contextualized representations of the scanned documents. 
We\nfurther introduce a new post-processing algorithm to decode BIESO tags in\nInformation Extraction that performs better with complex entities. Our method\nsignificantly improves extraction performance on both public (from 93.88 to\n95.50 F1 score) and private (from 84.35 to 84.84 F1 score) datasets composed of\nexpense receipts, invoices, and purchase orders.\n","authors":["Thibault Douzon","Stefan Duffner","Christophe Garcia","Jérémy Espinas"],"pdf_url":"https://arxiv.org/pdf/2309.05429v1.pdf","comment":"Conference: Document Analysis Systems. DAS 2022"},{"id":"http://arxiv.org/abs/2309.05423v1","updated":"2023-09-11T12:50:28Z","published":"2023-09-11T12:50:28Z","title":"Multi-Modal Automatic Prosody Annotation with Contrastive Pretraining of\n SSWP","summary":" In the realm of expressive Text-to-Speech (TTS), explicit prosodic boundaries\nsignificantly advance the naturalness and controllability of synthesized\nspeech. While human prosody annotation contributes a lot to the performance, it\nis a labor-intensive and time-consuming process, often resulting in\ninconsistent outcomes. Despite the availability of extensive supervised data,\nthe current benchmark model still faces performance setbacks. To address this\nissue, a two-stage automatic annotation pipeline is novelly proposed in this\npaper. Specifically, in the first stage, we propose contrastive text-speech\npretraining of Speech-Silence and Word-Punctuation (SSWP) pairs. The\npretraining procedure hammers at enhancing the prosodic space extracted from\njoint text-speech space. In the second stage, we build a multi-modal prosody\nannotator, which consists of pretrained encoders, a straightforward yet\neffective text-speech feature fusion scheme, and a sequence classifier.\nExtensive experiments conclusively demonstrate that our proposed method excels\nat automatically generating prosody annotation and achieves state-of-the-art\n(SOTA) performance. Furthermore, our novel model has exhibited remarkable\nresilience when tested with varying amounts of data.\n","authors":["Jinzuomu Zhong","Yang Li","Hui Huang","Jie Liu","Zhiba Su","Jing Guo","Benlai Tang","Fengjie Zhu"],"pdf_url":"https://arxiv.org/pdf/2309.05423v1.pdf","comment":"Submitted to ICASSP 2024"},{"id":"http://arxiv.org/abs/2304.04321v2","updated":"2023-09-11T11:27:53Z","published":"2023-04-09T21:42:57Z","title":"ARNOLD: A Benchmark for Language-Grounded Task Learning With Continuous\n States in Realistic 3D Scenes","summary":" Understanding the continuous states of objects is essential for task learning\nand planning in the real world. However, most existing task learning benchmarks\nassume discrete (e.g., binary) object goal states, which poses challenges for\nthe learning of complex tasks and transferring learned policy from simulated\nenvironments to the real world. Furthermore, state discretization limits a\nrobot's ability to follow human instructions based on the grounding of actions\nand states. To tackle these challenges, we present ARNOLD, a benchmark that\nevaluates language-grounded task learning with continuous states in realistic\n3D scenes. ARNOLD is comprised of 8 language-conditioned tasks that involve\nunderstanding object states and learning policies for continuous goals. To\npromote language-instructed learning, we provide expert demonstrations with\ntemplate-generated language descriptions. We assess task performance by\nutilizing the latest language-conditioned policy learning models. 
Our results\nindicate that current models for language-conditioned manipulations continue to\nexperience significant challenges in novel goal-state generalizations, scene\ngeneralizations, and object generalizations. These findings highlight the need\nto develop new algorithms that address this gap and underscore the potential\nfor further research in this area. Project website:\nhttps://arnold-benchmark.github.io.\n","authors":["Ran Gong","Jiangyong Huang","Yizhou Zhao","Haoran Geng","Xiaofeng Gao","Qingyang Wu","Wensi Ai","Ziheng Zhou","Demetri Terzopoulos","Song-Chun Zhu","Baoxiong Jia","Siyuan Huang"],"pdf_url":"https://arxiv.org/pdf/2304.04321v2.pdf","comment":"The first two authors contributed equally; 20 pages; 17 figures;\n project availalbe: https://arnold-benchmark.github.io/ ICCV 2023"},{"id":"http://arxiv.org/abs/2305.08455v3","updated":"2023-09-11T10:36:41Z","published":"2023-05-15T08:54:32Z","title":"Document Understanding Dataset and Evaluation (DUDE)","summary":" We call on the Document AI (DocAI) community to reevaluate current\nmethodologies and embrace the challenge of creating more practically-oriented\nbenchmarks. Document Understanding Dataset and Evaluation (DUDE) seeks to\nremediate the halted research progress in understanding visually-rich documents\n(VRDs). We present a new dataset with novelties related to types of questions,\nanswers, and document layouts based on multi-industry, multi-domain, and\nmulti-page VRDs of various origins, and dates. Moreover, we are pushing the\nboundaries of current methods by creating multi-task and multi-domain\nevaluation setups that more accurately simulate real-world situations where\npowerful generalization and adaptation under low-resource settings are desired.\nDUDE aims to set a new standard as a more practical, long-standing benchmark\nfor the community, and we hope that it will lead to future extensions and\ncontributions that address real-world challenges. Finally, our work illustrates\nthe importance of finding more efficient ways to model language, images, and\nlayout in DocAI.\n","authors":["Jordy Van Landeghem","Rubén Tito","Łukasz Borchmann","Michał Pietruszka","Paweł Józiak","Rafał Powalski","Dawid Jurkiewicz","Mickaël Coustaty","Bertrand Ackaert","Ernest Valveny","Matthew Blaschko","Sien Moens","Tomasz Stanisławek"],"pdf_url":"https://arxiv.org/pdf/2305.08455v3.pdf","comment":"Accepted at ICCV 2023"},{"id":"http://arxiv.org/abs/2307.07940v2","updated":"2023-09-11T09:42:37Z","published":"2023-07-16T04:20:26Z","title":"Deduplicating and Ranking Solution Programs for Suggesting Reference\n Solutions","summary":" Referring to solution programs written by other users is helpful for learners\nin programming education. However, current online judge systems just list all\nsolution programs submitted by users for references, and the programs are\nsorted based on the submission date and time, execution time, or user rating,\nignoring to what extent the programs can be helpful to be referenced. In\naddition, users struggle to refer to a variety of solution approaches since\nthere are too many duplicated and near-duplicated programs. To motivate\nlearners to refer to various solutions to learn better solution approaches, in\nthis paper, we propose an approach to deduplicate and rank common solution\nprograms in each programming problem. 
Inspired by the nature that the\nmany-duplicated program adopts a more common approach and can be a general\nreference, we remove the near-duplicated solution programs and rank the unique\nprograms based on the duplicate count. The experiments on the solution programs\nsubmitted to a real-world online judge system demonstrate that the number of\nprograms is reduced by 60.20%, whereas the baseline only reduces by 29.59%\nafter the deduplication, meaning that users only need to refer to 39.80% of\nprograms on average. Furthermore, our analysis shows that top-10 ranked\nprograms cover 29.95% of programs on average, indicating that users can grasp\n29.95% of solution approaches by referring to only 10 programs. The proposed\napproach shows the potential of reducing the learners' burden of referring to\ntoo many solutions and motivating them to learn a variety of solution\napproaches.\n","authors":["Atsushi Shirafuji","Yutaka Watanobe"],"pdf_url":"https://arxiv.org/pdf/2307.07940v2.pdf","comment":"7 pages, 5 figures, accepted to ASSE 2023"},{"id":"http://arxiv.org/abs/2309.02780v2","updated":"2023-09-11T09:35:14Z","published":"2023-09-06T06:44:26Z","title":"GRASS: Unified Generation Model for Speech-to-Semantic Tasks","summary":" This paper explores the instruction fine-tuning technique for\nspeech-to-semantic tasks by introducing a unified end-to-end (E2E) framework\nthat generates target text conditioned on a task-related prompt for audio data.\nWe pre-train the model using large and diverse data, where instruction-speech\npairs are constructed via a text-to-speech (TTS) system. Extensive experiments\ndemonstrate that our proposed model achieves state-of-the-art (SOTA) results on\nmany benchmarks covering speech named entity recognition, speech sentiment\nanalysis, speech question answering, and more, after fine-tuning. Furthermore,\nthe proposed model achieves competitive performance in zero-shot and few-shot\nscenarios. To facilitate future work on instruction fine-tuning for\nspeech-to-semantic tasks, we release our instruction dataset and code.\n","authors":["Aobo Xia","Shuyu Lei","Yushu Yang","Xiang Guo","Hua Chai"],"pdf_url":"https://arxiv.org/pdf/2309.02780v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.05312v1","updated":"2023-09-11T08:57:02Z","published":"2023-09-11T08:57:02Z","title":"Experimenting with UD Adaptation of an Unsupervised Rule-based Approach\n for Sentiment Analysis of Mexican Tourist Texts","summary":" This paper summarizes the results of experimenting with Universal\nDependencies (UD) adaptation of an Unsupervised, Compositional and Recursive\n(UCR) rule-based approach for Sentiment Analysis (SA) submitted to the Shared\nTask at Rest-Mex 2023 (Team Olga/LyS-SALSA) (within the IberLEF 2023\nconference). By using basic syntactic rules such as rules of modification and\nnegation applied on words from sentiment dictionaries, our approach exploits\nsome advantages of an unsupervised method for SA: (1) interpretability and\nexplainability of SA, (2) robustness across datasets, languages and domains and\n(3) usability by non-experts in NLP. We compare our approach with other\nunsupervised approaches of SA that in contrast to our UCR rule-based approach\nuse simple heuristic rules to deal with negation and modification. Our results\nshow a considerable improvement over these approaches. 
We discuss future\nimprovements of our results by using modality features as another shifting rule\nof polarity and word disambiguation techniques to identify the right sentiment\nwords.\n","authors":["Olga Kellert","Mahmud Uz Zaman","Nicholas Hill Matlis","Carlos Gómez-Rodríguez"],"pdf_url":"https://arxiv.org/pdf/2309.05312v1.pdf","comment":"Proceedings of IberLEF 2023, Ja\\'en, Spain, 2023"},{"id":"http://arxiv.org/abs/2309.05311v1","updated":"2023-09-11T08:56:47Z","published":"2023-09-11T08:56:47Z","title":"Analysing Cross-Lingual Transfer in Low-Resourced African Named Entity\n Recognition","summary":" Transfer learning has led to large gains in performance for nearly all NLP\ntasks while making downstream models easier and faster to train. This has also\nbeen extended to low-resourced languages, with some success. We investigate the\nproperties of cross-lingual transfer learning between ten low-resourced\nlanguages, from the perspective of a named entity recognition task. We\nspecifically investigate how much adaptive fine-tuning and the choice of\ntransfer language affect zero-shot transfer performance. We find that models\nthat perform well on a single language often do so at the expense of\ngeneralising to others, while models with the best generalisation to other\nlanguages suffer in individual language performance. Furthermore, the amount of\ndata overlap between the source and target datasets is a better predictor of\ntransfer performance than either the geographical or genetic distance between\nthe languages.\n","authors":["Michael Beukman","Manuel Fokam"],"pdf_url":"https://arxiv.org/pdf/2309.05311v1.pdf","comment":"Accepted to IJCNLP-AACL 2023"},{"id":"http://arxiv.org/abs/2308.13566v2","updated":"2023-09-11T08:28:40Z","published":"2023-08-25T01:41:04Z","title":"MLLM-DataEngine: An Iterative Refinement Approach for MLLM","summary":" Despite the great advance of Multimodal Large Language Models (MLLMs) in both\ninstruction dataset building and benchmarking, the independence of training and\nevaluation makes current MLLMs hard to further improve their capability under\nthe guidance of evaluation results with a relatively low human cost. In this\npaper, we propose MLLM-DataEngine, a novel closed-loop system that bridges data\ngeneration, model training, and evaluation. Within each loop iteration, the\nMLLM-DataEngine first analyze the weakness of the model based on the evaluation\nresults, then generate a proper incremental dataset for the next training\niteration and enhance the model capability iteratively. Compared with previous\ndata collection methods which are separate from the benchmarking, the data\ngenerated by MLLM-DataEngine shows better targeting, quality, and correctness.\nFor targeting, we propose an Adaptive Bad-case Sampling module, which adjusts\nthe ratio of different types of data within each incremental dataset based on\nthe benchmarking results. For quality, we resort to GPT-4 to generate\nhigh-quality data with each given data type. For correctness, prompt design is\ncritical for the data generation results. Rather than previous hand-crafted\nprompt, we propose an Interactive Prompt Optimization strategy, which optimizes\nthe prompt with the multi-round interaction between human and GPT, and improve\nthe correctness of generated data greatly. Through extensive experiments, we\nfind our MLLM-DataEngine could boost the MLLM capability in a targeted and\nautomatic manner, with only a few human participation. 
We hope it could be a\ngeneral solution for the following MLLMs building. The MLLM-DataEngine has been\nopen-sourced and is now available at\nhttps://github.com/opendatalab/MLLM-DataEngine.\n","authors":["Zhiyuan Zhao","Linke Ouyang","Bin Wang","Siyuan Huang","Pan Zhang","Xiaoyi Dong","Jiaqi Wang","Conghui He"],"pdf_url":"https://arxiv.org/pdf/2308.13566v2.pdf","comment":"Code and models are available at\n https://github.com/opendatalab/MLLM-DataEngine"},{"id":"http://arxiv.org/abs/2307.11788v3","updated":"2023-09-11T07:21:10Z","published":"2023-07-20T18:30:35Z","title":"Applying QNLP to sentiment analysis in finance","summary":" As an application domain where the slightest qualitative improvements can\nyield immense value, finance is a promising candidate for early quantum\nadvantage. Focusing on the rapidly advancing field of Quantum Natural Language\nProcessing (QNLP), we explore the practical applicability of the two central\napproaches DisCoCat and Quantum-Enhanced Long Short-Term Memory (QLSTM) to the\nproblem of sentiment analysis in finance. Utilizing a novel ChatGPT-based data\ngeneration approach, we conduct a case study with more than 1000 realistic\nsentences and find that QLSTMs can be trained substantially faster than\nDisCoCat while also achieving close to classical results for their available\nsoftware implementations.\n","authors":["Jonas Stein","Ivo Christ","Nicolas Kraus","Maximilian Balthasar Mansky","Robert Müller","Claudia Linnhoff-Popien"],"pdf_url":"https://arxiv.org/pdf/2307.11788v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.05272v1","updated":"2023-09-11T07:10:47Z","published":"2023-09-11T07:10:47Z","title":"Minuteman: Machine and Human Joining Forces in Meeting Summarization","summary":" Many meetings require creating a meeting summary to keep everyone up to date.\nCreating minutes of sufficient quality is however very cognitively demanding.\nAlthough we currently possess capable models for both audio speech recognition\n(ASR) and summarization, their fully automatic use is still problematic. ASR\nmodels frequently commit errors when transcribing named entities while the\nsummarization models tend to hallucinate and misinterpret the transcript. We\npropose a novel tool -- Minuteman -- to enable efficient semi-automatic meeting\nminuting. The tool provides a live transcript and a live meeting summary to the\nusers, who can edit them in a collaborative manner, enabling correction of ASR\nerrors and imperfect summary points in real time. The resulting application\neases the cognitive load of the notetakers and allows them to easily catch up\nif they missed a part of the meeting due to absence or a lack of focus. We\nconduct several tests of the application in varied settings, exploring the\nworthiness of the concept and the possible user strategies.\n","authors":["František Kmječ","Ondřej Bojar"],"pdf_url":"https://arxiv.org/pdf/2309.05272v1.pdf","comment":"6 pages, 3 figures"},{"id":"http://arxiv.org/abs/2309.05270v1","updated":"2023-09-11T07:02:13Z","published":"2023-09-11T07:02:13Z","title":"CONFLATOR: Incorporating Switching Point based Rotatory Positional\n Encodings for Code-Mixed Language Modeling","summary":" The mixing of two or more languages is called Code-Mixing (CM). CM is a\nsocial norm in multilingual societies. Neural Language Models (NLMs) like\ntransformers have been very effective on many NLP tasks. However, NLM for CM is\nan under-explored area. 
Though transformers are capable and powerful, they\ncannot always encode positional/sequential information since they are\nnon-recurrent. Therefore, to enrich word information and incorporate positional\ninformation, positional encoding is defined. We hypothesize that Switching\nPoints (SPs), i.e., junctions in the text where the language switches (L1 -> L2\nor L2-> L1), pose a challenge for CM Language Models (LMs), and hence give\nspecial emphasis to switching points in the modeling process. We experiment\nwith several positional encoding mechanisms and show that rotatory positional\nencodings along with switching point information yield the best results.\n We introduce CONFLATOR: a neural language modeling approach for code-mixed\nlanguages. CONFLATOR tries to learn to emphasize switching points using smarter\npositional encoding, both at unigram and bigram levels. CONFLATOR outperforms\nthe state-of-the-art on two tasks based on code-mixed Hindi and English\n(Hinglish): (i) sentiment analysis and (ii) machine translation.\n","authors":["Mohsin Ali","Kandukuri Sai Teja","Neeharika Gupta","Parth Patwa","Anubhab Chatterjee","Vinija Jain","Aman Chadha","Amitava Das"],"pdf_url":"https://arxiv.org/pdf/2309.05270v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2306.14192v3","updated":"2023-09-11T06:47:27Z","published":"2023-06-25T10:16:49Z","title":"$α$-$β$-Factorization and the Binary Case of Simon's Congruence","summary":" In 1991 H\\'ebrard introduced a factorization of words that turned out to be a\npowerful tool for the investigation of a word's scattered factors (also known\nas (scattered) subwords or subsequences). Based on this, first Karandikar and\nSchnoebelen introduced the notion of $k$-richness and later on Barker et al.\nthe notion of $k$-universality. In 2022 Fleischmann et al. presented a\ngeneralization of the arch factorization by intersecting the arch factorization\nof a word and its reverse. While the authors merely used this factorization for\nthe investigation of shortest absent scattered factors, in this work we\ninvestigate this new $\\alpha$-$\\beta$-factorization as such. We characterize\nthe famous Simon congruence of $k$-universal words in terms of $1$-universal\nwords. Moreover, we apply these results to binary words. In this special case,\nwe obtain a full characterization of the classes and calculate the index of the\ncongruence. Lastly, we start investigating the ternary case, present a full\nlist of possibilities for $\\alpha\\beta\\alpha$-factors, and characterize their\ncongruence.\n","authors":["Pamela Fleischmann","Jonas Höfer","Annika Huch","Dirk Nowotka"],"pdf_url":"https://arxiv.org/pdf/2306.14192v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2305.04087v5","updated":"2023-09-11T06:27:53Z","published":"2023-05-06T16:12:19Z","title":"Self-Edit: Fault-Aware Code Editor for Code Generation","summary":" Large language models (LLMs) have demonstrated an impressive ability to\ngenerate codes on competitive programming tasks. However, with limited sample\nnumbers, LLMs still suffer from poor accuracy. Inspired by the process of human\nprogramming, we propose a generate-and-edit approach named Self-Edit that\nutilizes execution results of the generated code from LLMs to improve the code\nquality on the competitive programming task. We execute the generated code on\nthe example test case provided in the question and wrap execution results into\na supplementary comment. 
Utilizing this comment as guidance, our fault-aware\ncode editor is employed to correct errors in the generated code. We perform\nextensive evaluations across two competitive programming datasets with nine\ndifferent LLMs. Compared to directly generating from LLMs, our approach can\nimprove the average of pass@1 by 89\\% on APPS-dev, 31\\% on APPS-test, and 48\\%\non HumanEval over nine popular code generation LLMs with parameter sizes\nranging from 110M to 175B. Compared to other post-processing methods, our\nmethod demonstrates superior accuracy and efficiency.\n","authors":["Kechi Zhang","Zhuo Li","Jia Li","Ge Li","Zhi Jin"],"pdf_url":"https://arxiv.org/pdf/2305.04087v5.pdf","comment":"Accepted by ACL2023"},{"id":"http://arxiv.org/abs/2308.13032v2","updated":"2023-09-11T05:39:05Z","published":"2023-08-24T18:58:10Z","title":"Financial News Analytics Using Fine-Tuned Llama 2 GPT Model","summary":" The paper considers the possibility to fine-tune Llama 2 GPT large language\nmodel (LLM) for the multitask analysis of financial news. For fine-tuning, the\nPEFT/LoRA based approach was used. In the study, the model was fine-tuned for\nthe following tasks: analysing a text from financial market perspectives,\nhighlighting main points of a text, summarizing a text and extracting named\nentities with appropriate sentiments. The obtained results show that the\nfine-tuned Llama 2 model can perform a multitask financial news analysis with a\nspecified structure of response, part of response can be a structured text and\nanother part of data can have JSON format for further processing. Extracted\nsentiments for named entities can be considered as predictive features in\nsupervised machine learning models with quantitative target variables.\n","authors":["Bohdan M. Pavlyshenko"],"pdf_url":"https://arxiv.org/pdf/2308.13032v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.04389v2","updated":"2023-09-11T05:19:16Z","published":"2023-09-08T15:40:54Z","title":"CSPRD: A Financial Policy Retrieval Dataset for Chinese Stock Market","summary":" In recent years, great advances in pre-trained language models (PLMs) have\nsparked considerable research focus and achieved promising performance on the\napproach of dense passage retrieval, which aims at retrieving relative passages\nfrom massive corpus with given questions. However, most of existing datasets\nmainly benchmark the models with factoid queries of general commonsense, while\nspecialised fields such as finance and economics remain unexplored due to the\ndeficiency of large-scale and high-quality datasets with expert annotations. In\nthis work, we propose a new task, policy retrieval, by introducing the Chinese\nStock Policy Retrieval Dataset (CSPRD), which provides 700+ prospectus passages\nlabeled by experienced experts with relevant articles from 10k+ entries in our\ncollected Chinese policy corpus. Experiments on lexical, embedding and\nfine-tuned bi-encoder models show the effectiveness of our proposed CSPRD yet\nalso suggests ample potential for improvement. 
Our best performing baseline\nachieves 56.1% MRR@10, 28.5% NDCG@10, 37.5% Recall@10 and 80.6% Precision@10 on\ndev set.\n","authors":["Jinyuan Wang","Hai Zhao","Zhong Wang","Zeyang Zhu","Jinhao Xie","Yong Yu","Yongjian Fei","Yue Huang","Dawei Cheng"],"pdf_url":"https://arxiv.org/pdf/2309.04389v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.05227v1","updated":"2023-09-11T04:20:36Z","published":"2023-09-11T04:20:36Z","title":"Detecting Natural Language Biases with Prompt-based Learning","summary":" In this project, we want to explore the newly emerging field of prompt\nengineering and apply it to the downstream task of detecting LM biases. More\nconcretely, we explore how to design prompts that can indicate 4 different\ntypes of biases: (1) gender, (2) race, (3) sexual orientation, and (4)\nreligion-based. Within our project, we experiment with different manually\ncrafted prompts that can draw out the subtle biases that may be present in the\nlanguage model. We apply these prompts to multiple variations of popular and\nwell-recognized models: BERT, RoBERTa, and T5 to evaluate their biases. We\nprovide a comparative analysis of these models and assess them using a two-fold\nmethod: use human judgment to decide whether model predictions are biased and\nutilize model-level judgment (through further prompts) to understand if a model\ncan self-diagnose the biases of its own prediction.\n","authors":["Md Abdul Aowal","Maliha T Islam","Priyanka Mary Mammen","Sandesh Shetty"],"pdf_url":"https://arxiv.org/pdf/2309.05227v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.05221v1","updated":"2023-09-11T03:54:38Z","published":"2023-09-11T03:54:38Z","title":"Exploring the Law of Numbers: Evidence from China's Real Estate","summary":" The renowned proverb, Numbers do not lie, underscores the reliability and\ninsight that lie beneath numbers, a concept of undisputed importance,\nespecially in economics and finance etc. Despite the prosperity of Benford's\nLaw in the first digit analysis, its scope fails to remain comprehensiveness\nwhen it comes to deciphering the laws of number. This paper delves into number\nlaws by taking the financial statements of China real estate as a\nrepresentative, quantitatively study not only the first digit, but also depict\nthe other two dimensions of numbers: frequency and length. The research\noutcomes transcend mere reservations about data manipulation and open the door\nto discussions surrounding number diversity and the delineation of the usage\ninsights. This study wields both economic significance and the capacity to\nfoster a deeper comprehension of numerical phenomena.\n","authors":["Fuqian Zhang","Zhenhua Wang"],"pdf_url":"https://arxiv.org/pdf/2309.05221v1.pdf","comment":"DSS"},{"id":"http://arxiv.org/abs/2309.05217v1","updated":"2023-09-11T03:35:00Z","published":"2023-09-11T03:35:00Z","title":"Quantifying and Attributing the Hallucination of Large Language Models\n via Association Analysis","summary":" Although demonstrating superb performance on various NLP tasks, large\nlanguage models (LLMs) still suffer from the hallucination problem, which\nthreatens the reliability of LLMs. To measure the level of hallucination of\nLLMs, previous works first categorize the hallucination according to the\nphenomenon similarity, then quantify the proportion that model outputs contain\nhallucinatory contents. However, such hallucination rates could easily be\ndistorted by confounders. 
Moreover, such hallucination rates could not reflect\nthe reasons for the hallucination, as similar hallucinatory phenomena may\noriginate from different sources. To address these issues, we propose to\ncombine the hallucination level quantification and hallucination reason\ninvestigation through an association analysis, which builds the relationship\nbetween the hallucination rate of LLMs with a set of risk factors. In this way,\nwe are able to observe the hallucination level under each value of each risk\nfactor, examining the contribution and statistical significance of each risk\nfactor, meanwhile excluding the confounding effect of other factors.\nAdditionally, by recognizing the risk factors according to a taxonomy of model\ncapability, we reveal a set of potential deficiencies in commonsense\nmemorization, relational reasoning, and instruction following, which may\nfurther provide guidance for the pretraining and supervised fine-tuning process\nof LLMs to mitigate the hallucination.\n","authors":["Li Du","Yequan Wang","Xingrun Xing","Yiqun Ya","Xiang Li","Xin Jiang","Xuezhi Fang"],"pdf_url":"https://arxiv.org/pdf/2309.05217v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.05210v1","updated":"2023-09-11T02:58:32Z","published":"2023-09-11T02:58:32Z","title":"Understanding the Impact of Post-Training Quantization on Large-scale\n Language Models","summary":" Large language models (LLMs) are rapidly increasing in size, with the number\nof parameters becoming a key factor in the success of many commercial models,\nsuch as ChatGPT, Claude, and Bard. Even the recently released publicly\naccessible models for commercial usage, such as Falcon and Llama2, come\nequipped with billions of parameters. This significant increase in the number\nof parameters makes deployment and operation very costly. The remarkable\nprogress in the field of quantization for large neural networks in general and\nLLMs in particular, has made these models more accessible by enabling them to\nbe deployed on consumer-grade GPUs. Quantized models generally demonstrate\ncomparable performance levels to their unquantized base counterparts.\nNonetheless, there exists a notable gap in our comprehensive understanding of\nhow these quantized models respond to hyperparameters, such as temperature, max\nnew tokens, and top\\_k, particularly during the decoding phase. The present\nanalysis reveals that nf4 and fp4 are equally proficient 4-bit quantization\ntechniques, characterized by similar attributes such as inference speed, memory\nconsumption, and the quality of generated content. Nevertheless, these\nquantization methods exhibit distinct behaviors at varying temperature\nsettings, both in the context of smaller and larger models. It is noteworthy\nthat, in general, 4-bit quantized models of varying sizes exhibit heightened\nsensitivity to lower temperature settings, unlike their unquantized\ncounterparts. 
Additionally, int8 quantization is associated with significantly\nslower inference speeds, whereas unquantized fp16 models consistently yield the\nfastest inference speeds across models of all sizes.\n","authors":["Somnath Roy"],"pdf_url":"https://arxiv.org/pdf/2309.05210v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.05203v1","updated":"2023-09-11T02:35:36Z","published":"2023-09-11T02:35:36Z","title":"From Artificially Real to Real: Leveraging Pseudo Data from Large\n Language Models for Low-Resource Molecule Discovery","summary":" Molecule discovery serves as a cornerstone in numerous scientific domains,\nfueling the development of new materials and innovative drug designs. Recent\ndevelopments of in-silico molecule discovery have highlighted the promising\nresults of cross-modal techniques, which bridge molecular structures with their\ndescriptive annotations. However, these cross-modal methods frequently\nencounter the issue of data scarcity, hampering their performance and\napplication. In this paper, we address the low-resource challenge by utilizing\nartificially-real data generated by Large Language Models (LLMs). We first\nintroduce a retrieval-based prompting strategy to construct high-quality pseudo\ndata, then explore the optimal method to effectively leverage this pseudo data.\nExperiments show that using pseudo data for domain adaptation outperforms all\nexisting methods, while also requiring a smaller model scale, reduced data size\nand lower training cost, highlighting its efficiency. Furthermore, our method\nshows a sustained improvement as the volume of pseudo data increases, revealing\nthe great potential of pseudo data in advancing low-resource cross-modal\nmolecule discovery.\n","authors":["Yuhan Chen","Nuwa Xi","Yanrui Du","Haochun Wang","Chen Jianyu","Sendong Zhao","Bing Qin"],"pdf_url":"https://arxiv.org/pdf/2309.05203v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.05201v1","updated":"2023-09-11T02:31:41Z","published":"2023-09-11T02:31:41Z","title":"Two is Better Than One: Answering Complex Questions by Multiple\n Knowledge Sources with Generalized Links","summary":" Incorporating multiple knowledge sources is proven to be beneficial for\nanswering complex factoid questions. To utilize multiple knowledge bases (KB),\nprevious works merge all KBs into a single graph via entity alignment and\nreduce the problem to question-answering (QA) over the fused KB. In reality,\nvarious link relations between KBs might be adopted in QA over multi-KBs. In\naddition to the identity between the alignable entities (i.e. full link),\nunalignable entities expressing the different aspects or types of an abstract\nconcept may also be treated identical in a question (i.e. partial link). Hence,\nthe KB fusion in prior works fails to represent all types of links, restricting\ntheir ability to comprehend multi-KBs for QA. In this work, we formulate the\nnovel Multi-KB-QA task that leverages the full and partial links among multiple\nKBs to derive correct answers, a benchmark with diversified link and query\ntypes is also constructed to efficiently evaluate Multi-KB-QA performance.\nFinally, we propose a method for Multi-KB-QA that encodes all link relations in\nthe KB embedding to score and rank candidate answers. 
Experiments show that our\nmethod markedly surpasses conventional KB-QA systems in Multi-KB-QA, justifying\nthe necessity of devising this task.\n","authors":["Minhao Zhang","Yongliang Ma","Yanzeng Li","Ruoyu Zhang","Lei Zou","Ming Zhou"],"pdf_url":"https://arxiv.org/pdf/2309.05201v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.05196v1","updated":"2023-09-11T02:16:47Z","published":"2023-09-11T02:16:47Z","title":"Does Writing with Language Models Reduce Content Diversity?","summary":" Large language models (LLMs) have led to a surge in collaborative writing\nwith model assistance. As different users incorporate suggestions from the same\nmodel, there is a risk of decreased diversity in the produced content,\npotentially limiting diverse perspectives in public discourse. In this work, we\nmeasure the impact of co-writing on diversity via a controlled experiment,\nwhere users write argumentative essays in three setups -- using a base LLM\n(GPT3), a feedback-tuned LLM (InstructGPT), and writing without model help. We\ndevelop a set of diversity metrics and find that writing with InstructGPT (but\nnot the GPT3) results in a statistically significant reduction in diversity.\nSpecifically, it increases the similarity between the writings of different\nauthors and reduces the overall lexical and content diversity. We additionally\nfind that this effect is mainly attributable to InstructGPT contributing less\ndiverse text to co-written essays. In contrast, the user-contributed text\nremains unaffected by model collaboration. This suggests that the recent\nimprovement in generation quality from adapting models to human feedback might\ncome at the cost of more homogeneous and less diverse content.\n","authors":["Vishakh Padmakumar","He He"],"pdf_url":"https://arxiv.org/pdf/2309.05196v1.pdf","comment":"Preprint"},{"id":"http://arxiv.org/abs/2309.01717v2","updated":"2023-09-11T01:33:47Z","published":"2023-09-04T16:54:49Z","title":"Interdisciplinary Fairness in Imbalanced Research Proposal Topic\n Inference: A Hierarchical Transformer-based Method with Selective\n Interpolation","summary":" The objective of topic inference in research proposals aims to obtain the\nmost suitable disciplinary division from the discipline system defined by a\nfunding agency. The agency will subsequently find appropriate peer review\nexperts from their database based on this division. Automated topic inference\ncan reduce human errors caused by manual topic filling, bridge the knowledge\ngap between funding agencies and project applicants, and improve system\nefficiency. Existing methods focus on modeling this as a hierarchical\nmulti-label classification problem, using generative models to iteratively\ninfer the most appropriate topic information. However, these methods overlook\nthe gap in scale between interdisciplinary research proposals and\nnon-interdisciplinary ones, leading to an unjust phenomenon where the automated\ninference system categorizes interdisciplinary proposals as\nnon-interdisciplinary, causing unfairness during the expert assignment. How can\nwe address this data imbalance issue under a complex discipline system and\nhence resolve this unfairness? 
In this paper, we implement a topic label\ninference system based on a Transformer encoder-decoder architecture.\nFurthermore, we utilize interpolation techniques to create a series of\npseudo-interdisciplinary proposals from non-interdisciplinary ones during\ntraining based on non-parametric indicators such as cross-topic probabilities\nand topic occurrence probabilities. This approach aims to reduce the bias of\nthe system during model training. Finally, we conduct extensive experiments on\na real-world dataset to verify the effectiveness of the proposed method. The\nexperimental results demonstrate that our training strategy can significantly\nmitigate the unfairness generated in the topic inference task.\n","authors":["Meng Xiao","Min Wu","Ziyue Qiao","Yanjie Fu","Zhiyuan Ning","Yi Du","Yuanchun Zhou"],"pdf_url":"https://arxiv.org/pdf/2309.01717v2.pdf","comment":"19 pages, Under review. arXiv admin note: text overlap with\n arXiv:2209.13912"},{"id":"http://arxiv.org/abs/2208.03886v3","updated":"2023-09-11T01:32:44Z","published":"2022-08-08T03:07:02Z","title":"What can we know about that which we cannot even imagine?","summary":" In this essay I will consider a sequence of questions. The first questions\nconcern the biological function of intelligence in general, and cognitive\nprostheses of human intelligence in particular. These will lead into questions\nconcerning human language, perhaps the most important cognitive prosthesis\nhumanity has ever developed. While it is traditional to rhapsodize about the\ncognitive power encapsulated in human language, I will emphasize how horribly\nlimited human language is -- and therefore how limited our cognitive abilities\nare, despite their being augmented with language. This will lead to questions\nof whether human mathematics, being ultimately formulated in terms of human\nlanguage, is also deeply limited. I will then combine these questions to pose a\npartial, sort-of, sideways answer to the guiding concern of this essay: what we\ncan ever discern about that we cannot even conceive?\n","authors":["David H. Wolpert"],"pdf_url":"https://arxiv.org/pdf/2208.03886v3.pdf","comment":"38 pages, 9 pages are references"},{"id":"http://arxiv.org/abs/2309.05173v1","updated":"2023-09-11T00:02:05Z","published":"2023-09-11T00:02:05Z","title":"DePT: Decomposed Prompt Tuning for Parameter-Efficient Fine-tuning","summary":" Prompt tuning (PT), where a small amount of trainable soft (continuous)\nprompt vectors is affixed to the input of language models (LM), has shown\npromising results across various tasks and models for parameter-efficient\nfine-tuning (PEFT). PT stands out from other PEFT approaches because it\nmaintains competitive performance with fewer trainable parameters and does not\ndrastically scale up its parameters as the model size expands. However, PT\nintroduces additional soft prompt tokens, leading to longer input sequences,\nwhich significantly impacts training and inference time and memory usage due to\nthe Transformer's quadratic complexity. Particularly concerning for Large\nLanguage Models (LLMs) that face heavy daily querying. To address this issue,\nwe propose Decomposed Prompt Tuning (DePT), which decomposes the soft prompt\ninto a shorter soft prompt and a pair of low-rank matrices that are then\noptimised with two different learning rates. This allows DePT to achieve better\nperformance while saving over 20% memory and time costs compared to vanilla PT\nand its variants, without changing trainable parameter sizes. 
Through extensive\nexperiments on 23 natural language processing (NLP) and vision-language (VL)\ntasks, we demonstrate that DePT outperforms state-of-the-art PEFT approaches,\nincluding the full fine-tuning baseline in some scenarios. Additionally, we\nempirically show that DEPT grows more efficient as the model size increases.\nOur further study reveals that DePT integrates seamlessly with\nparameter-efficient transfer learning in the few-shot learning setting and\nhighlights its adaptability to various model architectures and sizes.\n","authors":["Zhengxiang Shi","Aldo Lipani"],"pdf_url":"https://arxiv.org/pdf/2309.05173v1.pdf","comment":"Code is available at https://github.com/ZhengxiangShi/DePT"},{"id":"http://arxiv.org/abs/2308.14280v2","updated":"2023-09-11T22:51:25Z","published":"2023-08-28T03:26:21Z","title":"FonMTL: Towards Multitask Learning for the Fon Language","summary":" The Fon language, spoken by an average 2 million of people, is a truly\nlow-resourced African language, with a limited online presence, and existing\ndatasets (just to name but a few). Multitask learning is a learning paradigm\nthat aims to improve the generalization capacity of a model by sharing\nknowledge across different but related tasks: this could be prevalent in very\ndata-scarce scenarios. In this paper, we present the first explorative approach\nto multitask learning, for model capabilities enhancement in Natural Language\nProcessing for the Fon language. Specifically, we explore the tasks of Named\nEntity Recognition (NER) and Part of Speech Tagging (POS) for Fon. We leverage\ntwo language model heads as encoders to build shared representations for the\ninputs, and we use linear layers blocks for classification relative to each\ntask. Our results on the NER and POS tasks for Fon, show competitive (or\nbetter) performances compared to several multilingual pretrained language\nmodels finetuned on single tasks. Additionally, we perform a few ablation\nstudies to leverage the efficiency of two different loss combination strategies\nand find out that the equal loss weighting approach works best in our case. Our\ncode is open-sourced at https://github.com/bonaventuredossou/multitask_fon.\n","authors":["Bonaventure F. P. Dossou","Iffanice Houndayi","Pamely Zantou","Gilles Hacheme"],"pdf_url":"https://arxiv.org/pdf/2308.14280v2.pdf","comment":"Accepted at WiNLP workshop, co-located at EMNLP 2023"},{"id":"http://arxiv.org/abs/2306.08753v2","updated":"2023-09-11T22:21:07Z","published":"2023-06-14T21:24:11Z","title":"Towards training Bilingual and Code-Switched Speech Recognition models\n from Monolingual data sources","summary":" Multilingual Automatic Speech Recognition (ASR) models are capable of\ntranscribing audios across multiple languages, eliminating the need for\nseparate models. In addition, they can perform Language Identification (LID)\nand handle code-switched speech. However, training these models requires\nspecial code-switch and multilingual speech corpora which are sparsely\navailable. In this paper, we evaluate different approaches towards training of\nbilingual as well as code-switched ASR models using purely monolingual data\nsources. We introduce the concept of aggregate tokenizers that differs from the\ncurrent prevalent technique of generating LIDs at the boundaries of monolingual\nsamples and produces LID for each emitted token instead. 
We compare bilingual\nand monolingual model performance, showcase the efficacy of aggregate\ntokenizers, present a synthetic code-switched ASR data generation technique and\ndemonstrate the effectiveness of the proposed code-switched ASR models for the\ntasks of speech recognition and spoken language identification.\n","authors":["Kunal Dhawan","Dima Rekesh","Boris Ginsburg"],"pdf_url":"https://arxiv.org/pdf/2306.08753v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.05833v1","updated":"2023-09-11T21:24:00Z","published":"2023-09-11T21:24:00Z","title":"PACE: Prompting and Augmentation for Calibrated Confidence Estimation\n with GPT-4 in Cloud Incident Root Cause Analysis","summary":" In recent years, the transition to cloud-based platforms in the IT sector has\nemphasized the significance of cloud incident root cause analysis to ensure\nservice reliability and maintain customer trust. Central to this process is the\nefficient determination of root causes, a task made challenging due to the\ncomplex nature of contemporary cloud infrastructures. Despite the proliferation\nof AI-driven tools for root cause identification, their applicability remains\nlimited by the inconsistent quality of their outputs. This paper introduces a\nmethod for enhancing confidence estimation in root cause analysis tools by\nprompting retrieval-augmented large language models (LLMs). This approach\noperates in two phases. Initially, the model evaluates its confidence based on\nhistorical incident data, considering its assessment of the evidence strength.\nSubsequently, the model reviews the root cause generated by the predictor. An\noptimization step then combines these evaluations to determine the final\nconfidence assignment. Experimental results illustrate that our method enables\nthe model to articulate its confidence effectively, providing a more calibrated\nscore. We address research questions evaluating the ability of our method to\nproduce calibrated confidence scores using LLMs, the impact of domain-specific\nretrieved examples on confidence estimates, and its potential generalizability\nacross various root cause analysis models. Through this, we aim to bridge the\nconfidence estimation gap, aiding on-call engineers in decision-making and\nbolstering the efficiency of cloud incident management.\n","authors":["Dylan Zhang","Xuchao Zhang","Chetan Bansal","Pedro Las-Casas","Rodrigo Fonseca","Saravan Rajmohan"],"pdf_url":"https://arxiv.org/pdf/2309.05833v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2110.08501v4","updated":"2023-09-11T21:02:01Z","published":"2021-10-16T07:27:12Z","title":"Think Before You Speak: Explicitly Generating Implicit Commonsense\n Knowledge for Response Generation","summary":" Implicit knowledge, such as common sense, is key to fluid human\nconversations. Current neural response generation (RG) models are trained to\ngenerate responses directly, omitting unstated implicit knowledge. In this\npaper, we present Think-Before-Speaking (TBS), a generative approach to first\nexternalize implicit commonsense knowledge (think) and use this knowledge to\ngenerate responses (speak). We expect that externalizing implicit knowledge\nallows more efficient learning, produces more informative responses, and\nenables more explainable models. We analyze different choices to collect\nknowledge-aligned dialogues, represent implicit knowledge, and transition\nbetween knowledge and dialogues. 
Empirical results show TBS models outperform\nend-to-end and knowledge-augmented RG baselines on most automatic metrics and\ngenerate more informative, specific, and commonsense-following responses, as\nevaluated by human annotators. TBS also generates knowledge that makes sense\nand is relevant to the dialogue around 85\\% of the time.\n","authors":["Pei Zhou","Karthik Gopalakrishnan","Behnam Hedayatnia","Seokhwan Kim","Jay Pujara","Xiang Ren","Yang Liu","Dilek Hakkani-Tur"],"pdf_url":"https://arxiv.org/pdf/2110.08501v4.pdf","comment":"Accepted at ACL 2022 main conference. 16 pages, 9 figures, 9 tables"},{"id":"http://arxiv.org/abs/2309.03905v2","updated":"2023-09-11T20:25:16Z","published":"2023-09-07T17:59:45Z","title":"ImageBind-LLM: Multi-modality Instruction Tuning","summary":" We present ImageBind-LLM, a multi-modality instruction tuning method of large\nlanguage models (LLMs) via ImageBind. Existing works mainly focus on language\nand image instruction tuning, different from which, our ImageBind-LLM can\nrespond to multi-modality conditions, including audio, 3D point clouds, video,\nand their embedding-space arithmetic by only image-text alignment training.\nDuring training, we adopt a learnable bind network to align the embedding space\nbetween LLaMA and ImageBind's image encoder. Then, the image features\ntransformed by the bind network are added to word tokens of all layers in\nLLaMA, which progressively injects visual instructions via an attention-free\nand zero-initialized gating mechanism. Aided by the joint embedding of\nImageBind, the simple image-text training enables our model to exhibit superior\nmulti-modality instruction-following capabilities. During inference, the\nmulti-modality inputs are fed into the corresponding ImageBind encoders, and\nprocessed by a proposed visual cache model for further cross-modal embedding\nenhancement. The training-free cache model retrieves from three million image\nfeatures extracted by ImageBind, which effectively mitigates the\ntraining-inference modality discrepancy. Notably, with our approach,\nImageBind-LLM can respond to instructions of diverse modalities and demonstrate\nsignificant language generation quality. Code is released at\nhttps://github.com/OpenGVLab/LLaMA-Adapter.\n","authors":["Jiaming Han","Renrui Zhang","Wenqi Shao","Peng Gao","Peng Xu","Han Xiao","Kaipeng Zhang","Chris Liu","Song Wen","Ziyu Guo","Xudong Lu","Shuai Ren","Yafei Wen","Xiaoxin Chen","Xiangyu Yue","Hongsheng Li","Yu Qiao"],"pdf_url":"https://arxiv.org/pdf/2309.03905v2.pdf","comment":"Code is available at https://github.com/OpenGVLab/LLaMA-Adapter"},{"id":"http://arxiv.org/abs/2309.05804v1","updated":"2023-09-11T20:16:38Z","published":"2023-09-11T20:16:38Z","title":"Hi Model, generating 'nice' instead of 'good' is not as bad as\n generating 'rice'! Towards Context and Semantic Infused Dialogue Generation\n Loss Function and Evaluation Metric","summary":" Over the past two decades, dialogue modeling has made significant strides,\nmoving from simple rule-based responses to personalized and persuasive response\ngeneration. However, despite these advancements, the objective functions and\nevaluation metrics for dialogue generation have remained stagnant, i.e.,\ncross-entropy and BLEU, respectively. These lexical-based metrics have the\nfollowing key limitations: (a) word-to-word matching without semantic\nconsideration: It assigns the same credit for failure to generate 'nice' and\n'rice' for 'good'. 
(b) missing context attribute for evaluating the generated\nresponse: Even if a generated response is relevant to the ongoing dialogue\ncontext, it may still be penalized for not matching the gold utterance provided\nin the corpus. In this paper, we first investigate these limitations\ncomprehensively and propose a new loss function called Semantic Infused\nContextualized diaLogue (SemTextualLogue) loss function. Furthermore, we\nformulate a new evaluation metric called Dialuation, which incorporates both\ncontext relevance and semantic appropriateness while evaluating a generated\nresponse. We conducted experiments on two benchmark dialogue corpora,\nencompassing both task-oriented and open-domain scenarios. We found that the\ndialogue generation model trained with SemTextualLogue loss attained superior\nperformance (in both quantitative and qualitative evaluation) compared to the\ntraditional cross-entropy loss function across the datasets and evaluation\nmetrics.\n","authors":["Abhisek Tiwari","Muhammed Sinan","Kaushik Roy","Amit Sheth","Sriparna Saha","Pushpak Bhattacharyya"],"pdf_url":"https://arxiv.org/pdf/2309.05804v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2306.04640v2","updated":"2023-09-11T19:31:26Z","published":"2023-06-07T17:59:57Z","title":"ModuleFormer: Modularity Emerges from Mixture-of-Experts","summary":" Large Language Models (LLMs) have achieved remarkable results. However,\nexisting models are expensive to train and deploy, and it is also difficult to\nexpand their knowledge beyond pre-training data without forgetting previous\nknowledge. This paper proposes a new neural network architecture, ModuleFormer,\nthat leverages modularity to improve the efficiency and flexibility of large\nlanguage models. ModuleFormer is based on the Sparse Mixture of Experts (SMoE).\nUnlike the previous SMoE-based modular language model, which requires\ndomain-labeled data to learn domain-specific experts, ModuleFormer can induce\nmodularity from uncurated data with its new load balancing and concentration\nlosses. ModuleFormer is a modular architecture that includes two different\ntypes of modules: new stick-breaking attention heads and feedforward experts.\nDifferent modules are sparsely activated conditions on the input token during\ntraining and inference. In our experiment, we found that the modular\narchitecture enables three important abilities for large pre-trained language\nmodels: 1) Efficiency, since ModuleFormer only activates a subset of its\nmodules for each input token, thus it could achieve the same performance as\ndense LLMs with more than two times throughput; 2) Extendability, ModuleFormer\nis more immune to catastrophic forgetting than dense LLMs and can be easily\nextended with new modules to learn new knowledge that is not included in the\ntraining data; 3) Specialisation, finetuning ModuleFormer could specialize a\nsubset of modules to the finetuning task and the task-unrelated modules could\nbe easily pruned for a lightweight deployment.\n","authors":["Yikang Shen","Zheyu Zhang","Tianyou Cao","Shawn Tan","Zhenfang Chen","Chuang Gan"],"pdf_url":"https://arxiv.org/pdf/2306.04640v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2305.10403v2","updated":"2023-09-11T18:42:20Z","published":"2023-05-17T17:46:53Z","title":"PaLM 2 Technical Report","summary":" We introduce PaLM 2, a new state-of-the-art language model that has better\nmultilingual and reasoning capabilities and is more compute-efficient than its\npredecessor PaLM. 
PaLM 2 is a Transformer-based model trained using a mixture\nof objectives. Through extensive evaluations on English and multilingual\nlanguage, and reasoning tasks, we demonstrate that PaLM 2 has significantly\nimproved quality on downstream tasks across different model sizes, while\nsimultaneously exhibiting faster and more efficient inference compared to PaLM.\nThis improved efficiency enables broader deployment while also allowing the\nmodel to respond faster, for a more natural pace of interaction. PaLM 2\ndemonstrates robust reasoning capabilities exemplified by large improvements\nover PaLM on BIG-Bench and other reasoning tasks. PaLM 2 exhibits stable\nperformance on a suite of responsible AI evaluations, and enables\ninference-time control over toxicity without additional overhead or impact on\nother capabilities. Overall, PaLM 2 achieves state-of-the-art performance\nacross a diverse set of tasks and capabilities.\n When discussing the PaLM 2 family, it is important to distinguish between\npre-trained models (of various sizes), fine-tuned variants of these models, and\nthe user-facing products that use these models. In particular, user-facing\nproducts typically include additional pre- and post-processing steps.\nAdditionally, the underlying models may evolve over time. Therefore, one should\nnot expect the performance of user-facing products to exactly match the results\nreported in this report.\n","authors":[" Google"," :","Rohan Anil","Andrew M. Dai","Orhan Firat","Melvin Johnson","Dmitry Lepikhin","Alexandre Passos","Siamak Shakeri","Emanuel Taropa","Paige Bailey","Zhifeng Chen","Eric Chu","Jonathan H. Clark","Laurent El Shafey","Yanping Huang","Kathy Meier-Hellstern","Gaurav Mishra","Erica Moreira","Mark Omernick","Kevin Robinson","Sebastian Ruder","Yi Tay","Kefan Xiao","Yuanzhong Xu","Yujing Zhang","Gustavo Hernandez Abrego","Junwhan Ahn","Jacob Austin","Paul Barham","Jan Botha","James Bradbury","Siddhartha Brahma","Kevin Brooks","Michele Catasta","Yong Cheng","Colin Cherry","Christopher A. Choquette-Choo","Aakanksha Chowdhery","Clément Crepy","Shachi Dave","Mostafa Dehghani","Sunipa Dev","Jacob Devlin","Mark Díaz","Nan Du","Ethan Dyer","Vlad Feinberg","Fangxiaoyu Feng","Vlad Fienber","Markus Freitag","Xavier Garcia","Sebastian Gehrmann","Lucas Gonzalez","Guy Gur-Ari","Steven Hand","Hadi Hashemi","Le Hou","Joshua Howland","Andrea Hu","Jeffrey Hui","Jeremy Hurwitz","Michael Isard","Abe Ittycheriah","Matthew Jagielski","Wenhao Jia","Kathleen Kenealy","Maxim Krikun","Sneha Kudugunta","Chang Lan","Katherine Lee","Benjamin Lee","Eric Li","Music Li","Wei Li","YaGuang Li","Jian Li","Hyeontaek Lim","Hanzhao Lin","Zhongtao Liu","Frederick Liu","Marcello Maggioni","Aroma Mahendru","Joshua Maynez","Vedant Misra","Maysam Moussalem","Zachary Nado","John Nham","Eric Ni","Andrew Nystrom","Alicia Parrish","Marie Pellat","Martin Polacek","Alex Polozov","Reiner Pope","Siyuan Qiao","Emily Reif","Bryan Richter","Parker Riley","Alex Castro Ros","Aurko Roy","Brennan Saeta","Rajkumar Samuel","Renee Shelby","Ambrose Slone","Daniel Smilkov","David R. 
So","Daniel Sohn","Simon Tokumine","Dasha Valter","Vijay Vasudevan","Kiran Vodrahalli","Xuezhi Wang","Pidong Wang","Zirui Wang","Tao Wang","John Wieting","Yuhuai Wu","Kelvin Xu","Yunhan Xu","Linting Xue","Pengcheng Yin","Jiahui Yu","Qiao Zhang","Steven Zheng","Ce Zheng","Weikang Zhou","Denny Zhou","Slav Petrov","Yonghui Wu"],"pdf_url":"https://arxiv.org/pdf/2305.10403v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.05689v1","updated":"2023-09-11T17:49:27Z","published":"2023-09-11T17:49:27Z","title":"Large Language Model for Science: A Study on P vs. NP","summary":" In this work, we use large language models (LLMs) to augment and accelerate\nresearch on the P versus NP problem, one of the most important open problems in\ntheoretical computer science and mathematics. Specifically, we propose Socratic\nreasoning, a general framework that promotes in-depth thinking with LLMs for\ncomplex problem-solving. Socratic reasoning encourages LLMs to recursively\ndiscover, solve, and integrate problems while facilitating self-evaluation and\nrefinement. Our pilot study on the P vs. NP problem shows that GPT-4\nsuccessfully produces a proof schema and engages in rigorous reasoning\nthroughout 97 dialogue turns, concluding \"P $\\neq$ NP\", which is in alignment\nwith (Xu and Zhou, 2023). The investigation uncovers novel insights within the\nextensive solution space of LLMs, shedding light on LLM for Science.\n","authors":["Qingxiu Dong","Li Dong","Ke Xu","Guangyan Zhou","Yaru Hao","Zhifang Sui","Furu Wei"],"pdf_url":"https://arxiv.org/pdf/2309.05689v1.pdf","comment":"73 pages"},{"id":"http://arxiv.org/abs/2309.07062v1","updated":"2023-09-11T22:11:46Z","published":"2023-09-11T22:11:46Z","title":"Large Language Models for Compiler Optimization","summary":" We explore the novel application of Large Language Models to code\noptimization. We present a 7B-parameter transformer model trained from scratch\nto optimize LLVM assembly for code size. The model takes as input unoptimized\nassembly and outputs a list of compiler options to best optimize the program.\nCrucially, during training, we ask the model to predict the instruction counts\nbefore and after optimization, and the optimized code itself. These auxiliary\nlearning tasks significantly improve the optimization performance of the model\nand improve the model's depth of understanding.\n We evaluate on a large suite of test programs. Our approach achieves a 3.0%\nimprovement in reducing instruction counts over the compiler, outperforming two\nstate-of-the-art baselines that require thousands of compilations. Furthermore,\nthe model shows surprisingly strong code reasoning abilities, generating\ncompilable code 91% of the time and perfectly emulating the output of the\ncompiler 70% of the time.\n","authors":["Chris Cummins","Volker Seeker","Dejan Grubisic","Mostafa Elhoushi","Youwei Liang","Baptiste Roziere","Jonas Gehring","Fabian Gloeckle","Kim Hazelwood","Gabriel Synnaeve","Hugh Leather"],"pdf_url":"https://arxiv.org/pdf/2309.07062v1.pdf","comment":null}],"Computer Vision and Pattern Recognition":[{"id":"http://arxiv.org/abs/2309.05665v1","updated":"2023-09-11T17:59:17Z","published":"2023-09-11T17:59:17Z","title":"Robot Parkour Learning","summary":" Parkour is a grand challenge for legged locomotion that requires robots to\novercome various obstacles rapidly in complex environments. Existing methods\ncan generate either diverse but blind locomotion skills or vision-based but\nspecialized skills by using reference animal data or complex rewards. 
However,\nautonomous parkour requires robots to learn generalizable skills that are both\nvision-based and diverse to perceive and react to various scenarios. In this\nwork, we propose a system for learning a single end-to-end vision-based parkour\npolicy of diverse parkour skills using a simple reward without any reference\nmotion data. We develop a reinforcement learning method inspired by direct\ncollocation to generate parkour skills, including climbing over high obstacles,\nleaping over large gaps, crawling beneath low barriers, squeezing through thin\nslits, and running. We distill these skills into a single vision-based parkour\npolicy and transfer it to a quadrupedal robot using its egocentric depth\ncamera. We demonstrate that our system can empower two different low-cost\nrobots to autonomously select and execute appropriate parkour skills to\ntraverse challenging real-world environments.\n","authors":["Ziwen Zhuang","Zipeng Fu","Jianren Wang","Christopher Atkeson","Soeren Schwertfeger","Chelsea Finn","Hang Zhao"],"pdf_url":"https://arxiv.org/pdf/2309.05665v1.pdf","comment":"CoRL 2023 (Oral). Project website at https://robot-parkour.github.io"},{"id":"http://arxiv.org/abs/2309.05663v1","updated":"2023-09-11T17:58:30Z","published":"2023-09-11T17:58:30Z","title":"Diffusion-Guided Reconstruction of Everyday Hand-Object Interaction\n Clips","summary":" We tackle the task of reconstructing hand-object interactions from short\nvideo clips. Given an input video, our approach casts 3D inference as a\nper-video optimization and recovers a neural 3D representation of the object\nshape, as well as the time-varying motion and hand articulation. While the\ninput video naturally provides some multi-view cues to guide 3D inference,\nthese are insufficient on their own due to occlusions and limited viewpoint\nvariations. To obtain accurate 3D, we augment the multi-view signals with\ngeneric data-driven priors to guide reconstruction. Specifically, we learn a\ndiffusion network to model the conditional distribution of (geometric)\nrenderings of objects conditioned on hand configuration and category label, and\nleverage it as a prior to guide the novel-view renderings of the reconstructed\nscene. We empirically evaluate our approach on egocentric videos across 6\nobject categories, and observe significant improvements over prior single-view\nand multi-view methods. Finally, we demonstrate our system's ability to\nreconstruct arbitrary clips from YouTube, showing both 1st and 3rd person\ninteractions.\n","authors":["Yufei Ye","Poorvi Hebbar","Abhinav Gupta","Shubham Tulsiani"],"pdf_url":"https://arxiv.org/pdf/2309.05663v1.pdf","comment":"Accepted to ICCV23 (Oral). Project Page:\n https://judyye.github.io/diffhoi-www/"},{"id":"http://arxiv.org/abs/2309.05662v1","updated":"2023-09-11T17:58:14Z","published":"2023-09-11T17:58:14Z","title":"ViHOPE: Visuotactile In-Hand Object 6D Pose Estimation with Shape\n Completion","summary":" In this letter, we introduce ViHOPE, a novel framework for estimating the 6D\npose of an in-hand object using visuotactile perception. Our key insight is\nthat the accuracy of the 6D object pose estimate can be improved by explicitly\ncompleting the shape of the object. To this end, we introduce a novel\nvisuotactile shape completion module that uses a conditional Generative\nAdversarial Network to complete the shape of an in-hand object based on\nvolumetric representation. This approach improves over prior works that\ndirectly regress visuotactile observations to a 6D pose. 
By explicitly\ncompleting the shape of the in-hand object and jointly optimizing the shape\ncompletion and pose estimation tasks, we improve the accuracy of the 6D object\npose estimate. We train and test our model on a synthetic dataset and compare\nit with the state-of-the-art. In the visuotactile shape completion task, we\noutperform the state-of-the-art by 265% using the Intersection of Union metric\nand achieve 88% lower Chamfer Distance. In the visuotactile pose estimation\ntask, we present results that suggest our framework reduces position and\nangular errors by 35% and 64%, respectively. Furthermore, we ablate our\nframework to confirm the gain on the 6D object pose estimate from explicitly\ncompleting the shape. Ultimately, we show that our framework produces models\nthat are robust to sim-to-real transfer on a real-world robot platform.\n","authors":["Hongyu Li","Snehal Dikhale","Soshi Iba","Nawid Jamali"],"pdf_url":"https://arxiv.org/pdf/2309.05662v1.pdf","comment":"Accepted by RA-L"},{"id":"http://arxiv.org/abs/2112.09976v2","updated":"2023-09-11T17:57:15Z","published":"2021-12-18T17:44:07Z","title":"Tell me what you see: A zero-shot action recognition method based on\n natural language descriptions","summary":" This paper presents a novel approach to Zero-Shot Action Recognition. Recent\nworks have explored the detection and classification of objects to obtain\nsemantic information from videos with remarkable performance. Inspired by them,\nwe propose using video captioning methods to extract semantic information about\nobjects, scenes, humans, and their relationships. To the best of our knowledge,\nthis is the first work to represent both videos and labels with descriptive\nsentences. More specifically, we represent videos using sentences generated via\nvideo captioning methods and classes using sentences extracted from documents\nacquired through search engines on the Internet. Using these representations,\nwe build a shared semantic space employing BERT-based embedders pre-trained in\nthe paraphrasing task on multiple text datasets. The projection of both visual\nand semantic information onto this space is straightforward, as they are\nsentences, enabling classification using the nearest neighbor rule. We\ndemonstrate that representing videos and labels with sentences alleviates the\ndomain adaptation problem. Additionally, we show that word vectors are\nunsuitable for building the semantic embedding space of our descriptions. Our\nmethod outperforms the state-of-the-art performance on the UCF101 dataset by\n3.3 p.p. in accuracy under the TruZe protocol and achieves competitive results\non both the UCF101 and HMDB51 datasets under the conventional protocol (0/50\\%\n- training/testing split). Our code is available at\nhttps://github.com/valterlej/zsarcap.\n","authors":["Valter Estevam","Rayson Laroca","David Menotti","Helio Pedrini"],"pdf_url":"https://arxiv.org/pdf/2112.09976v2.pdf","comment":"Published at Multimedia Tools and Applications"},{"id":"http://arxiv.org/abs/2308.13369v2","updated":"2023-09-11T17:55:50Z","published":"2023-08-25T13:29:31Z","title":"Distribution-Aligned Diffusion for Human Mesh Recovery","summary":" Recovering a 3D human mesh from a single RGB image is a challenging task due\nto depth ambiguity and self-occlusion, resulting in a high degree of\nuncertainty. 
Meanwhile, diffusion models have recently seen much success in\ngenerating high-quality outputs by progressively denoising noisy inputs.\nInspired by their capability, we explore a diffusion-based approach for human\nmesh recovery, and propose a Human Mesh Diffusion (HMDiff) framework which\nframes mesh recovery as a reverse diffusion process. We also propose a\nDistribution Alignment Technique (DAT) that infuses prior distribution\ninformation into the mesh distribution diffusion process, and provides useful\nprior knowledge to facilitate the mesh recovery task. Our method achieves\nstate-of-the-art performance on three widely used datasets. Project page:\nhttps://gongjia0208.github.io/HMDiff/.\n","authors":["Lin Geng Foo","Jia Gong","Hossein Rahmani","Jun Liu"],"pdf_url":"https://arxiv.org/pdf/2308.13369v2.pdf","comment":"Accepted to ICCV 2023"},{"id":"http://arxiv.org/abs/2309.05652v1","updated":"2023-09-11T17:43:11Z","published":"2023-09-11T17:43:11Z","title":"An Effective Two-stage Training Paradigm Detector for Small Dataset","summary":" Learning from the limited amount of labeled data to the pre-train model has\nalways been viewed as a challenging task. In this report, an effective and\nrobust solution, the two-stage training paradigm YOLOv8 detector (TP-YOLOv8),\nis designed for the object detection track in VIPriors Challenge 2023. First,\nthe backbone of YOLOv8 is pre-trained as the encoder using the masked image\nmodeling technique. Then the detector is fine-tuned with elaborate\naugmentations. During the test stage, test-time augmentation (TTA) is used to\nenhance each model, and weighted box fusion (WBF) is implemented to further\nboost the performance. With the well-designed structure, our approach has\nachieved 30.4% average precision from 0.50 to 0.95 on the DelftBikes test set,\nranking 4th on the leaderboard.\n","authors":["Zheng Wang","Dong Xie","Hanzhi Wang","Jiang Tian"],"pdf_url":"https://arxiv.org/pdf/2309.05652v1.pdf","comment":"4 pages, 2 figures"},{"id":"http://arxiv.org/abs/2309.05645v1","updated":"2023-09-11T17:37:08Z","published":"2023-09-11T17:37:08Z","title":"CitDet: A Benchmark Dataset for Citrus Fruit Detection","summary":" In this letter, we present a new dataset to advance the state of the art in\ndetecting citrus fruit and accurately estimate yield on trees affected by the\nHuanglongbing (HLB) disease in orchard environments via imaging. Despite the\nfact that significant progress has been made in solving the fruit detection\nproblem, the lack of publicly available datasets has complicated direct\ncomparison of results. For instance, citrus detection has long been of interest\nin the agricultural research community, yet there is an absence of work,\nparticularly involving public datasets of citrus affected by HLB. To address\nthis issue, we enhance state-of-the-art object detection methods for use in\ntypical orchard settings. Concretely, we provide high-resolution images of\ncitrus trees located in an area known to be highly affected by HLB, along with\nhigh-quality bounding box annotations of citrus fruit. Fruit on both the trees\nand the ground are labeled to allow for identification of fruit location, which\ncontributes to advancements in yield estimation and potential measure of HLB\nimpact via fruit drop. The dataset consists of over 32,000 bounding box\nannotations for fruit instances contained in 579 high-resolution images. 
In\nsummary, our contributions are the following: (i) we introduce a novel dataset\nalong with baseline performance benchmarks on multiple contemporary object\ndetection algorithms, (ii) we show the ability to accurately capture fruit\nlocation on tree or on ground, and finally (ii) we present a correlation of our\nresults with yield estimations.\n","authors":["Jordan A. James","Heather K. Manching","Matthew R. Mattia","Kim D. Bowman","Amanda M. Hulse-Kemp","William J. Beksi"],"pdf_url":"https://arxiv.org/pdf/2309.05645v1.pdf","comment":"Submitted to IEEE Robotics and Automation Letters (RA-L)"},{"id":"http://arxiv.org/abs/2212.05102v3","updated":"2023-09-11T17:09:03Z","published":"2022-12-09T20:03:59Z","title":"A soft nearest-neighbor framework for continual semi-supervised learning","summary":" Despite significant advances, the performance of state-of-the-art continual\nlearning approaches hinges on the unrealistic scenario of fully labeled data.\nIn this paper, we tackle this challenge and propose an approach for continual\nsemi-supervised learning--a setting where not all the data samples are labeled.\nA primary issue in this scenario is the model forgetting representations of\nunlabeled data and overfitting the labeled samples. We leverage the power of\nnearest-neighbor classifiers to nonlinearly partition the feature space and\nflexibly model the underlying data distribution thanks to its non-parametric\nnature. This enables the model to learn a strong representation for the current\ntask, and distill relevant information from previous tasks. We perform a\nthorough experimental evaluation and show that our method outperforms all the\nexisting approaches by large margins, setting a solid state of the art on the\ncontinual semi-supervised learning paradigm. For example, on CIFAR-100 we\nsurpass several others even when using at least 30 times less supervision (0.8%\nvs. 25% of annotations). Finally, our method works well on both low and high\nresolution images and scales seamlessly to more complex datasets such as\nImageNet-100. The code is publicly available on\nhttps://github.com/kangzhiq/NNCSL\n","authors":["Zhiqi Kang","Enrico Fini","Moin Nabi","Elisa Ricci","Karteek Alahari"],"pdf_url":"https://arxiv.org/pdf/2212.05102v3.pdf","comment":"Accepted at ICCV 2023"},{"id":"http://arxiv.org/abs/2303.17368v2","updated":"2023-09-11T17:06:27Z","published":"2023-03-30T13:30:12Z","title":"SynBody: Synthetic Dataset with Layered Human Models for 3D Human\n Perception and Modeling","summary":" Synthetic data has emerged as a promising source for 3D human research as it\noffers low-cost access to large-scale human datasets. To advance the diversity\nand annotation quality of human models, we introduce a new synthetic dataset,\nSynBody, with three appealing features: 1) a clothed parametric human model\nthat can generate a diverse range of subjects; 2) the layered human\nrepresentation that naturally offers high-quality 3D annotations to support\nmultiple tasks; 3) a scalable system for producing realistic data to facilitate\nreal-world tasks. The dataset comprises 1.2M images with corresponding accurate\n3D annotations, covering 10,000 human body models, 1,187 actions, and various\nviewpoints. The dataset includes two subsets for human pose and shape\nestimation as well as human neural rendering. 
Extensive experiments on SynBody\nindicate that it substantially enhances both SMPL and SMPL-X estimation.\nFurthermore, the incorporation of layered annotations offers a valuable\ntraining resource for investigating the Human Neural Radiance Fields (NeRF).\n","authors":["Zhitao Yang","Zhongang Cai","Haiyi Mei","Shuai Liu","Zhaoxi Chen","Weiye Xiao","Yukun Wei","Zhongfei Qing","Chen Wei","Bo Dai","Wayne Wu","Chen Qian","Dahua Lin","Ziwei Liu","Lei Yang"],"pdf_url":"https://arxiv.org/pdf/2303.17368v2.pdf","comment":"Accepted by ICCV 2023. Project webpage: https://synbody.github.io/"},{"id":"http://arxiv.org/abs/2309.05613v1","updated":"2023-09-11T16:54:34Z","published":"2023-09-11T16:54:34Z","title":"Learning the Geodesic Embedding with Graph Neural Networks","summary":" We present GeGnn, a learning-based method for computing the approximate\ngeodesic distance between two arbitrary points on discrete polyhedra surfaces\nwith constant time complexity after fast precomputation. Previous relevant\nmethods either focus on computing the geodesic distance between a single source\nand all destinations, which has linear complexity at least or require a long\nprecomputation time. Our key idea is to train a graph neural network to embed\nan input mesh into a high-dimensional embedding space and compute the geodesic\ndistance between a pair of points using the corresponding embedding vectors and\na lightweight decoding function. To facilitate the learning of the embedding,\nwe propose novel graph convolution and graph pooling modules that incorporate\nlocal geodesic information and are verified to be much more effective than\nprevious designs. After training, our method requires only one forward pass of\nthe network per mesh as precomputation. Then, we can compute the geodesic\ndistance between a pair of points using our decoding function, which requires\nonly several matrix multiplications and can be massively parallelized on GPUs.\nWe verify the efficiency and effectiveness of our method on ShapeNet and\ndemonstrate that our method is faster than existing methods by orders of\nmagnitude while achieving comparable or better accuracy. Additionally, our\nmethod exhibits robustness on noisy and incomplete meshes and strong\ngeneralization ability on out-of-distribution meshes. The code and pretrained\nmodel can be found on https://github.com/IntelligentGeometry/GeGnn.\n","authors":["Bo Pang","Zhongtian Zheng","Guoping Wang","Peng-Shuai Wang"],"pdf_url":"https://arxiv.org/pdf/2309.05613v1.pdf","comment":"SIGGRAPH Asia 2023, Journal Track"},{"id":"http://arxiv.org/abs/2211.14096v2","updated":"2023-09-11T16:44:52Z","published":"2022-11-25T13:25:18Z","title":"Deep grading for MRI-based differential diagnosis of Alzheimer's disease\n and Frontotemporal dementia","summary":" Alzheimer's disease and Frontotemporal dementia are common forms of\nneurodegenerative dementia. Behavioral alterations and cognitive impairments\nare found in the clinical courses of both diseases and their differential\ndiagnosis is sometimes difficult for physicians. Therefore, an accurate tool\ndedicated to this diagnostic challenge can be valuable in clinical practice.\nHowever, current structural imaging methods mainly focus on the detection of\neach disease but rarely on their differential diagnosis. In this paper, we\npropose a deep learning based approach for both problems of disease detection\nand differential diagnosis. We suggest utilizing two types of biomarkers for\nthis application: structure grading and structure atrophy. 
First, we propose to\ntrain a large ensemble of 3D U-Nets to locally determine the anatomical\npatterns of healthy people, patients with Alzheimer's disease and patients with\nFrontotemporal dementia using structural MRI as input. The output of the\nensemble is a 2-channel disease's coordinate map able to be transformed into a\n3D grading map which is easy to interpret for clinicians. This 2-channel map is\ncoupled with a multi-layer perceptron classifier for different classification\ntasks. Second, we propose to combine our deep learning framework with a\ntraditional machine learning strategy based on volume to improve the model\ndiscriminative capacity and robustness. After both cross-validation and\nexternal validation, our experiments based on 3319 MRI demonstrated competitive\nresults of our method compared to the state-of-the-art methods for both disease\ndetection and differential diagnosis.\n","authors":["Huy-Dung Nguyen","Michaël Clément","Vincent Planche","Boris Mansencal","Pierrick Coupé"],"pdf_url":"https://arxiv.org/pdf/2211.14096v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2110.03605v7","updated":"2023-09-11T16:31:55Z","published":"2021-10-07T16:33:11Z","title":"Robust Feature-Level Adversaries are Interpretability Tools","summary":" The literature on adversarial attacks in computer vision typically focuses on\npixel-level perturbations. These tend to be very difficult to interpret. Recent\nwork that manipulates the latent representations of image generators to create\n\"feature-level\" adversarial perturbations gives us an opportunity to explore\nperceptible, interpretable adversarial attacks. We make three contributions.\nFirst, we observe that feature-level attacks provide useful classes of inputs\nfor studying representations in models. Second, we show that these adversaries\nare uniquely versatile and highly robust. We demonstrate that they can be used\nto produce targeted, universal, disguised, physically-realizable, and black-box\nattacks at the ImageNet scale. Third, we show how these adversarial images can\nbe used as a practical interpretability tool for identifying bugs in networks.\nWe use these adversaries to make predictions about spurious associations\nbetween features and classes which we then test by designing \"copy/paste\"\nattacks in which one natural image is pasted into another to cause a targeted\nmisclassification. Our results suggest that feature-level attacks are a\npromising approach for rigorous interpretability research. They support the\ndesign of tools to better understand what a model has learned and diagnose\nbrittle feature associations. Code is available at\nhttps://github.com/thestephencasper/feature_level_adv\n","authors":["Stephen Casper","Max Nadeau","Dylan Hadfield-Menell","Gabriel Kreiman"],"pdf_url":"https://arxiv.org/pdf/2110.03605v7.pdf","comment":"NeurIPS 2022, code available at\n https://github.com/thestephencasper/feature_level_adv"},{"id":"http://arxiv.org/abs/2309.05590v1","updated":"2023-09-11T16:17:50Z","published":"2023-09-11T16:17:50Z","title":"Temporal Action Localization with Enhanced Instant Discriminability","summary":" Temporal action detection (TAD) aims to detect all action boundaries and\ntheir corresponding categories in an untrimmed video. The unclear boundaries of\nactions in videos often result in imprecise predictions of action boundaries by\nexisting methods. To resolve this issue, we propose a one-stage framework named\nTriDet. 
First, we propose a Trident-head to model the action boundary via an\nestimated relative probability distribution around the boundary. Then, we\nanalyze the rank-loss problem (i.e. instant discriminability deterioration) in\ntransformer-based methods and propose an efficient scalable-granularity\nperception (SGP) layer to mitigate this issue. To further push the limit of\ninstant discriminability in the video backbone, we leverage the strong\nrepresentation capability of pretrained large models and investigate their\nperformance on TAD. Last, considering the adequate spatial-temporal context for\nclassification, we design a decoupled feature pyramid network with separate\nfeature pyramids to incorporate rich spatial context from the large model for\nlocalization. Experimental results demonstrate the robustness of TriDet and its\nstate-of-the-art performance on multiple TAD datasets, including hierarchical\n(multilabel) TAD datasets.\n","authors":["Dingfeng Shi","Qiong Cao","Yujie Zhong","Shan An","Jian Cheng","Haogang Zhu","Dacheng Tao"],"pdf_url":"https://arxiv.org/pdf/2309.05590v1.pdf","comment":"An extended version of the CVPR paper arXiv:2303.07347, submitted to\n IJCV"},{"id":"http://arxiv.org/abs/2309.05573v1","updated":"2023-09-11T16:00:22Z","published":"2023-09-11T16:00:22Z","title":"UniSeg: A Unified Multi-Modal LiDAR Segmentation Network and the\n OpenPCSeg Codebase","summary":" Point-, voxel-, and range-views are three representative forms of point\nclouds. All of them have accurate 3D measurements but lack color and texture\ninformation. RGB images are a natural complement to these point cloud views and\nfully utilizing the comprehensive information of them benefits more robust\nperceptions. In this paper, we present a unified multi-modal LiDAR segmentation\nnetwork, termed UniSeg, which leverages the information of RGB images and three\nviews of the point cloud, and accomplishes semantic segmentation and panoptic\nsegmentation simultaneously. Specifically, we first design the Learnable\ncross-Modal Association (LMA) module to automatically fuse voxel-view and\nrange-view features with image features, which fully utilize the rich semantic\ninformation of images and are robust to calibration errors. Then, the enhanced\nvoxel-view and range-view features are transformed to the point space,where\nthree views of point cloud features are further fused adaptively by the\nLearnable cross-View Association module (LVA). Notably, UniSeg achieves\npromising results in three public benchmarks, i.e., SemanticKITTI, nuScenes,\nand Waymo Open Dataset (WOD); it ranks 1st on two challenges of two benchmarks,\nincluding the LiDAR semantic segmentation challenge of nuScenes and panoptic\nsegmentation challenges of SemanticKITTI. Besides, we construct the OpenPCSeg\ncodebase, which is the largest and most comprehensive outdoor LiDAR\nsegmentation codebase. It contains most of the popular outdoor LiDAR\nsegmentation algorithms and provides reproducible implementations. 
The\nOpenPCSeg codebase will be made publicly available at\nhttps://github.com/PJLab-ADG/PCSeg.\n","authors":["Youquan Liu","Runnan Chen","Xin Li","Lingdong Kong","Yuchen Yang","Zhaoyang Xia","Yeqi Bai","Xinge Zhu","Yuexin Ma","Yikang Li","Yu Qiao","Yuenan Hou"],"pdf_url":"https://arxiv.org/pdf/2309.05573v1.pdf","comment":"ICCV 2023; 21 pages; 9 figures; 18 tables; Code at\n https://github.com/PJLab-ADG/PCSeg"},{"id":"http://arxiv.org/abs/2309.05569v1","updated":"2023-09-11T15:54:30Z","published":"2023-09-11T15:54:30Z","title":"ITI-GEN: Inclusive Text-to-Image Generation","summary":" Text-to-image generative models often reflect the biases of the training\ndata, leading to unequal representations of underrepresented groups. This study\ninvestigates inclusive text-to-image generative models that generate images\nbased on human-written prompts and ensure the resulting images are uniformly\ndistributed across attributes of interest. Unfortunately, directly expressing\nthe desired attributes in the prompt often leads to sub-optimal results due to\nlinguistic ambiguity or model misrepresentation. Hence, this paper proposes a\ndrastically different approach that adheres to the maxim that \"a picture is\nworth a thousand words\". We show that, for some attributes, images can\nrepresent concepts more expressively than text. For instance, categories of\nskin tones are typically hard to specify by text but can be easily represented\nby example images. Building upon these insights, we propose a novel approach,\nITI-GEN, that leverages readily available reference images for Inclusive\nText-to-Image GENeration. The key idea is learning a set of prompt embeddings\nto generate images that can effectively represent all desired attribute\ncategories. More importantly, ITI-GEN requires no model fine-tuning, making it\ncomputationally efficient to augment existing text-to-image models. Extensive\nexperiments demonstrate that ITI-GEN largely improves over state-of-the-art\nmodels to generate inclusive images from a prompt. Project page:\nhttps://czhang0528.github.io/iti-gen.\n","authors":["Cheng Zhang","Xuanbai Chen","Siqi Chai","Chen Henry Wu","Dmitry Lagun","Thabo Beeler","Fernando De la Torre"],"pdf_url":"https://arxiv.org/pdf/2309.05569v1.pdf","comment":"Accepted to ICCV 2023 (Oral Presentation)"},{"id":"http://arxiv.org/abs/2309.05551v1","updated":"2023-09-11T15:36:03Z","published":"2023-09-11T15:36:03Z","title":"OpenFashionCLIP: Vision-and-Language Contrastive Learning with\n Open-Source Fashion Data","summary":" The inexorable growth of online shopping and e-commerce demands scalable and\nrobust machine learning-based solutions to accommodate customer requirements.\nIn the context of automatic tagging classification and multimodal retrieval,\nprior works either defined a low generalizable supervised learning approach or\nmore reusable CLIP-based techniques while, however, training on closed source\ndata. In this work, we propose OpenFashionCLIP, a vision-and-language\ncontrastive learning method that only adopts open-source fashion data stemming\nfrom diverse domains, and characterized by varying degrees of specificity. Our\napproach is extensively validated across several tasks and benchmarks, and\nexperimental results highlight a significant out-of-domain generalization\ncapability and consistent improvements over state-of-the-art methods both in\nterms of accuracy and recall. 
Source code and trained models are publicly\navailable at: https://github.com/aimagelab/open-fashion-clip.\n","authors":["Giuseppe Cartella","Alberto Baldrati","Davide Morelli","Marcella Cornia","Marco Bertini","Rita Cucchiara"],"pdf_url":"https://arxiv.org/pdf/2309.05551v1.pdf","comment":"International Conference on Image Analysis and Processing (ICIAP)\n 2023"},{"id":"http://arxiv.org/abs/2309.05548v1","updated":"2023-09-11T15:33:00Z","published":"2023-09-11T15:33:00Z","title":"Distance-Aware eXplanation Based Learning","summary":" eXplanation Based Learning (XBL) is an interactive learning approach that\nprovides a transparent method of training deep learning models by interacting\nwith their explanations. XBL augments loss functions to penalize a model based\non deviation of its explanations from user annotation of image features. The\nliterature on XBL mostly depends on the intersection of visual model\nexplanations and image feature annotations. We present a method to add a\ndistance-aware explanation loss to categorical losses that trains a learner to\nfocus on important regions of a training dataset. Distance is an appropriate\napproach for calculating explanation loss since visual model explanations such\nas Gradient-weighted Class Activation Mapping (Grad-CAMs) are not strictly\nbounded as annotations and their intersections may not provide complete\ninformation on the deviation of a model's focus from relevant image regions. In\naddition to assessing our model using existing metrics, we propose an\ninterpretability metric for evaluating visual feature-attribution based model\nexplanations that is more informative of the model's performance than existing\nmetrics. We demonstrate performance of our proposed method on three image\nclassification tasks.\n","authors":["Misgina Tsighe Hagos","Niamh Belton","Kathleen M. Curran","Brian Mac Namee"],"pdf_url":"https://arxiv.org/pdf/2309.05548v1.pdf","comment":"Accepted at the 35th IEEE International Conference on Tools with\n Artificial Intelligence, ICTAI 2023"},{"id":"http://arxiv.org/abs/2303.06015v3","updated":"2023-09-11T15:26:36Z","published":"2023-03-10T16:19:34Z","title":"Dynamic Y-KD: A Hybrid Approach to Continual Instance Segmentation","summary":" Despite the success of deep learning models on instance segmentation, current\nmethods still suffer from catastrophic forgetting in continual learning\nscenarios. In this paper, our contributions for continual instance segmentation\nare threefold. First, we propose the Y-knowledge distillation (Y-KD), a\ntechnique that shares a common feature extractor between the teacher and\nstudent networks. As the teacher is also updated with new data in Y-KD, the\nincreased plasticity results in new modules that are specialized on new\nclasses. Second, our Y-KD approach is supported by a dynamic architecture\nmethod that trains task-specific modules with a unique instance segmentation\nhead, thereby significantly reducing forgetting. Third, we complete our\napproach by leveraging checkpoint averaging as a simple method to manually\nbalance the trade-off between performance on the various sets of classes, thus\nincreasing control over the model's behavior without any additional cost. These\ncontributions are united in our model that we name the Dynamic Y-KD network.\n We perform extensive experiments on several single-step and multi-steps\nincremental learning scenarios, and we show that our approach outperforms\nprevious methods both on past and new classes. 
For instance, compared to recent\nwork, our method obtains +2.1% mAP on old classes in 15-1, +7.6% mAP on new\nclasses in 19-1 and reaches 91.5% of the mAP obtained by joint-training on all\nclasses in 15-5.\n","authors":["Mathieu Pagé-Fortin","Brahim Chaib-draa"],"pdf_url":"https://arxiv.org/pdf/2303.06015v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.05534v1","updated":"2023-09-11T15:18:28Z","published":"2023-09-11T15:18:28Z","title":"PAI-Diffusion: Constructing and Serving a Family of Open Chinese\n Diffusion Models for Text-to-image Synthesis on the Cloud","summary":" Text-to-image synthesis for the Chinese language poses unique challenges due\nto its large vocabulary size, and intricate character relationships. While\nexisting diffusion models have shown promise in generating images from textual\ndescriptions, they often neglect domain-specific contexts and lack robustness\nin handling the Chinese language. This paper introduces PAI-Diffusion, a\ncomprehensive framework that addresses these limitations. PAI-Diffusion\nincorporates both general and domain-specific Chinese diffusion models,\nenabling the generation of contextually relevant images. It explores the\npotential of using LoRA and ControlNet for fine-grained image style transfer\nand image editing, empowering users with enhanced control over image\ngeneration. Moreover, PAI-Diffusion seamlessly integrates with Alibaba Cloud's\nMachine Learning Platform for AI, providing accessible and scalable solutions.\nAll the Chinese diffusion model checkpoints, LoRAs, and ControlNets, including\ndomain-specific ones, are publicly available. A user-friendly Chinese WebUI and\nthe diffusers-api elastic inference toolkit, also open-sourced, further\nfacilitate the easy deployment of PAI-Diffusion models in various environments,\nmaking it a valuable resource for Chinese text-to-image synthesis.\n","authors":["Chengyu Wang","Zhongjie Duan","Bingyan Liu","Xinyi Zou","Cen Chen","Kui Jia","Jun Huang"],"pdf_url":"https://arxiv.org/pdf/2309.05534v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.05528v1","updated":"2023-09-11T15:12:05Z","published":"2023-09-11T15:12:05Z","title":"On the detection of Out-Of-Distribution samples in Multiple Instance\n Learning","summary":" The deployment of machine learning solutions in real-world scenarios often\ninvolves addressing the challenge of out-of-distribution (OOD) detection. While\nsignificant efforts have been devoted to OOD detection in classical supervised\nsettings, the context of weakly supervised learning, particularly the Multiple\nInstance Learning (MIL) framework, remains under-explored. In this study, we\ntackle this challenge by adapting post-hoc OOD detection methods to the MIL\nsetting while introducing a novel benchmark specifically designed to assess OOD\ndetection performance in weakly supervised scenarios. Extensive experiments\nbased on diverse public datasets do not reveal a single method with a clear\nadvantage over the others. Although DICE emerges as the best-performing method\noverall, it exhibits significant shortcomings on some datasets, emphasizing the\ncomplexity of this under-explored and challenging topic. Our findings shed\nlight on the complex nature of OOD detection under the MIL framework,\nemphasizing the importance of developing novel, robust, and reliable methods\nthat can generalize effectively in a weakly supervised context. 
The code for\nthe paper is available here: https://github.com/loic-lb/OOD_MIL.\n","authors":["Loïc Le Bescond","Maria Vakalopoulou","Stergios Christodoulidis","Fabrice André","Hugues Talbot"],"pdf_url":"https://arxiv.org/pdf/2309.05528v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.05527v1","updated":"2023-09-11T15:11:11Z","published":"2023-09-11T15:11:11Z","title":"ReSimAD: Zero-Shot 3D Domain Transfer for Autonomous Driving with Source\n Reconstruction and Target Simulation","summary":" Domain shifts such as sensor type changes and geographical situation\nvariations are prevalent in Autonomous Driving (AD), which poses a challenge\nsince AD model relying on the previous-domain knowledge can be hardly directly\ndeployed to a new domain without additional costs. In this paper, we provide a\nnew perspective and approach of alleviating the domain shifts, by proposing a\nReconstruction-Simulation-Perception (ReSimAD) scheme. Specifically, the\nimplicit reconstruction process is based on the knowledge from the previous old\ndomain, aiming to convert the domain-related knowledge into domain-invariant\nrepresentations, \\textit{e.g.}, 3D scene-level meshes. Besides, the point\nclouds simulation process of multiple new domains is conditioned on the above\nreconstructed 3D meshes, where the target-domain-like simulation samples can be\nobtained, thus reducing the cost of collecting and annotating new-domain data\nfor the subsequent perception process. For experiments, we consider different\ncross-domain situations such as Waymo-to-KITTI, Waymo-to-nuScenes,\nWaymo-to-ONCE, \\textit{etc}, to verify the \\textbf{zero-shot} target-domain\nperception using ReSimAD. Results demonstrate that our method is beneficial to\nboost the domain generalization ability, even promising for 3D pre-training.\n","authors":["Bo Zhang","Xinyu Cai","Jiakang Yuan","Donglin Yang","Jianfei Guo","Renqiu Xia","Botian Shi","Min Dou","Tao Chen","Si Liu","Junchi Yan","Yu Qiao"],"pdf_url":"https://arxiv.org/pdf/2309.05527v1.pdf","comment":"Code and simulated points are available at\n https://github.com/PJLab-ADG/3DTrans\\#resimad"},{"id":"http://arxiv.org/abs/2304.06619v2","updated":"2023-09-11T15:02:00Z","published":"2023-04-13T15:40:41Z","title":"Class-Incremental Learning of Plant and Disease Detection: Growing\n Branches with Knowledge Distillation","summary":" This paper investigates the problem of class-incremental object detection for\nagricultural applications where a model needs to learn new plant species and\ndiseases incrementally without forgetting the previously learned ones. We adapt\ntwo public datasets to include new categories over time, simulating a more\nrealistic and dynamic scenario. We then compare three class-incremental\nlearning methods that leverage different forms of knowledge distillation to\nmitigate catastrophic forgetting. Our experiments show that all three methods\nsuffer from catastrophic forgetting, but the Dynamic Y-KD approach, which\nadditionally uses a dynamic architecture that grows new branches to learn new\ntasks, outperforms ILOD and Faster-ILOD in most settings both on new and old\nclasses.\n These results highlight the challenges and opportunities of continual object\ndetection for agricultural applications. In particular, we hypothesize that the\nlarge intra-class and small inter-class variability that is typical of plant\nimages exacerbate the difficulty of learning new categories without interfering\nwith previous knowledge. 
We publicly release our code to encourage future work.\n","authors":["Mathieu Pagé Fortin"],"pdf_url":"https://arxiv.org/pdf/2304.06619v2.pdf","comment":"Accepted at CVPPA'23"},{"id":"http://arxiv.org/abs/2309.05517v1","updated":"2023-09-11T15:00:01Z","published":"2023-09-11T15:00:01Z","title":"Stream-based Active Learning by Exploiting Temporal Properties in\n Perception with Temporal Predicted Loss","summary":" Active learning (AL) reduces the amount of labeled data needed to train a\nmachine learning model by intelligently choosing which instances to label.\nClassic pool-based AL requires all data to be present in a datacenter, which\ncan be challenging with the increasing amounts of data needed in deep learning.\nHowever, AL on mobile devices and robots, like autonomous cars, can filter the\ndata from perception sensor streams before reaching the datacenter. We\nexploited the temporal properties for such image streams in our work and\nproposed the novel temporal predicted loss (TPL) method. To evaluate the\nstream-based setting properly, we introduced the GTA V streets and the A2D2\nstreets dataset and made both publicly available. Our experiments showed that\nour approach significantly improves the diversity of the selection while being\nan uncertainty-based method. As pool-based approaches are more common in\nperception applications, we derived a concept for comparing pool-based and\nstream-based AL, where TPL out-performed state-of-the-art pool- or stream-based\napproaches for different models. TPL demonstrated a gain of 2.5 precept points\n(pp) less required data while being significantly faster than pool-based\nmethods.\n","authors":["Sebastian Schmidt","Stephan Günnemann"],"pdf_url":"https://arxiv.org/pdf/2309.05517v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.05499v1","updated":"2023-09-11T14:42:04Z","published":"2023-09-11T14:42:04Z","title":"Zero-Shot Co-salient Object Detection Framework","summary":" Co-salient Object Detection (CoSOD) endeavors to replicate the human visual\nsystem's capacity to recognize common and salient objects within a collection\nof images. Despite recent advancements in deep learning models, these models\nstill rely on training with well-annotated CoSOD datasets. The exploration of\ntraining-free zero-shot CoSOD frameworks has been limited. In this paper,\ntaking inspiration from the zero-shot transfer capabilities of foundational\ncomputer vision models, we introduce the first zero-shot CoSOD framework that\nharnesses these models without any training process. To achieve this, we\nintroduce two novel components in our proposed framework: the group prompt\ngeneration (GPG) module and the co-saliency map generation (CMP) module. We\nevaluate the framework's performance on widely-used datasets and observe\nimpressive results. 
Our approach surpasses existing unsupervised methods and\neven outperforms fully supervised methods developed before 2020, while\nremaining competitive with some fully supervised methods developed before 2022.\n","authors":["Haoke Xiao","Lv Tang","Bo Li","Zhiming Luo","Shaozi Li"],"pdf_url":"https://arxiv.org/pdf/2309.05499v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.00831v2","updated":"2023-09-11T14:34:19Z","published":"2023-09-02T05:33:31Z","title":"Multi-scale, Data-driven and Anatomically Constrained Deep Learning\n Image Registration for Adult and Fetal Echocardiography","summary":" Temporal echocardiography image registration is a basis for clinical\nquantifications such as cardiac motion estimation, myocardial strain\nassessments, and stroke volume quantifications. In past studies, deep learning\nimage registration (DLIR) has shown promising results and is consistently\naccurate and precise, requiring less computational time. We propose that a\ngreater focus on the warped moving image's anatomic plausibility and image\nquality can support robust DLIR performance. Further, past implementations have\nfocused on adult echocardiography, and there is an absence of DLIR\nimplementations for fetal echocardiography. We propose a framework that\ncombines three strategies for DLIR in both fetal and adult echo: (1) an\nanatomic shape-encoded loss to preserve physiological myocardial and left\nventricular anatomical topologies in warped images; (2) a data-driven loss that\nis trained adversarially to preserve good image texture features in warped\nimages; and (3) a multi-scale training scheme of a data-driven and anatomically\nconstrained algorithm to improve accuracy. Our tests show that good anatomical\ntopology and image textures are strongly linked to shape-encoded and\ndata-driven adversarial losses. They improve different aspects of registration\nperformance in a non-overlapping way, justifying their combination. Despite\nfundamental distinctions between adult and fetal echo images, we show that\nthese strategies can provide excellent registration results in both adult and\nfetal echocardiography using the publicly available CAMUS adult echo dataset\nand our private multi-demographic fetal echo dataset. Our approach outperforms\ntraditional non-DL gold standard registration approaches, including Optical\nFlow and Elastix. Registration improvements could be translated to more\naccurate and precise clinical quantification of cardiac ejection fraction,\ndemonstrating a potential for translation.\n","authors":["Md. Kamrul Hasan","Haobo Zhu","Guang Yang","Choon Hwai Yap"],"pdf_url":"https://arxiv.org/pdf/2309.00831v2.pdf","comment":"Our data-driven and anatomically constrained DLIR method's source\n code will be publicly available at https://github.com/kamruleee51/DdC-AC-DLIR"},{"id":"http://arxiv.org/abs/2309.05490v1","updated":"2023-09-11T14:32:04Z","published":"2023-09-11T14:32:04Z","title":"Learning Semantic Segmentation with Query Points Supervision on Aerial\n Images","summary":" Semantic segmentation is crucial in remote sensing, where high-resolution\nsatellite images are segmented into meaningful regions. Recent advancements in\ndeep learning have significantly improved satellite image segmentation.\nHowever, most of these methods are typically trained in fully supervised\nsettings that require high-quality pixel-level annotations, which are expensive\nand time-consuming to obtain. 
In this work, we present a weakly supervised\nlearning algorithm to train semantic segmentation algorithms that only rely on\nquery point annotations instead of full mask labels. Our proposed approach\nperforms accurate semantic segmentation and improves efficiency by\nsignificantly reducing the cost and time required for manual annotation.\nSpecifically, we generate superpixels and extend the query point labels into\nthose superpixels that group similar meaningful semantics. Then, we train\nsemantic segmentation models, supervised with images partially labeled with the\nsuperpixels pseudo-labels. We benchmark our weakly supervised training approach\non an aerial image dataset and different semantic segmentation architectures,\nshowing that we can reach competitive performance compared to fully supervised\ntraining while reducing the annotation effort.\n","authors":["Santiago Rivier","Carlos Hinojosa","Silvio Giancola","Bernard Ghanem"],"pdf_url":"https://arxiv.org/pdf/2309.05490v1.pdf","comment":"Paper presented at the LXCV workshop at ICCV 2023"},{"id":"http://arxiv.org/abs/2212.09993v6","updated":"2023-09-11T13:58:44Z","published":"2022-12-20T04:33:32Z","title":"Are Deep Neural Networks SMARTer than Second Graders?","summary":" Recent times have witnessed an increasing number of applications of deep\nneural networks towards solving tasks that require superior cognitive\nabilities, e.g., playing Go, generating art, ChatGPT, etc. Such a dramatic\nprogress raises the question: how generalizable are neural networks in solving\nproblems that demand broad skills? To answer this question, we propose SMART: a\nSimple Multimodal Algorithmic Reasoning Task and the associated SMART-101\ndataset, for evaluating the abstraction, deduction, and generalization\nabilities of neural networks in solving visuo-linguistic puzzles designed\nspecifically for children in the 6--8 age group. Our dataset consists of 101\nunique puzzles; each puzzle comprises a picture and a question, and their\nsolution needs a mix of several elementary skills, including arithmetic,\nalgebra, and spatial reasoning, among others. To scale our dataset towards\ntraining deep neural networks, we programmatically generate entirely new\ninstances for each puzzle, while retaining their solution algorithm. To\nbenchmark performances on SMART-101, we propose a vision and language\nmeta-learning model using varied state-of-the-art backbones. Our experiments\nreveal that while powerful deep models offer reasonable performances on puzzles\nin a supervised setting, they are not better than random accuracy when analyzed\nfor generalization. We also evaluate the recent ChatGPT and other large\nlanguage models on a subset of SMART-101 and find that while these models show\nconvincing reasoning abilities, the answers are often incorrect.\n","authors":["Anoop Cherian","Kuan-Chuan Peng","Suhas Lohit","Kevin A. Smith","Joshua B. Tenenbaum"],"pdf_url":"https://arxiv.org/pdf/2212.09993v6.pdf","comment":"Extended version of CVPR 2023 paper. For the SMART-101 dataset, see\n http://smartdataset.github.io/smart101"},{"id":"http://arxiv.org/abs/2309.05451v1","updated":"2023-09-11T13:44:46Z","published":"2023-09-11T13:44:46Z","title":"Dual-view Curricular Optimal Transport for Cross-lingual Cross-modal\n Retrieval","summary":" Current research on cross-modal retrieval is mostly English-oriented, as the\navailability of a large number of English-oriented human-labeled\nvision-language corpora. 
In order to break the limit of non-English labeled\ndata, cross-lingual cross-modal retrieval (CCR) has attracted increasing\nattention. Most CCR methods construct pseudo-parallel vision-language corpora\nvia Machine Translation (MT) to achieve cross-lingual transfer. However, the\ntranslated sentences from MT are generally imperfect in describing the\ncorresponding visual contents. Improperly assuming the pseudo-parallel data are\ncorrectly correlated will make the networks overfit to the noisy\ncorrespondence. Therefore, we propose Dual-view Curricular Optimal Transport\n(DCOT) to learn with noisy correspondence in CCR. In particular, we quantify\nthe confidence of the sample pair correlation with optimal transport theory\nfrom both the cross-lingual and cross-modal views, and design dual-view\ncurriculum learning to dynamically model the transportation costs according to\nthe learning stage of the two views. Extensive experiments are conducted on two\nmultilingual image-text datasets and one video-text dataset, and the results\ndemonstrate the effectiveness and robustness of the proposed method. Besides,\nour proposed method also shows a good expansibility to cross-lingual image-text\nbaselines and a decent generalization on out-of-domain data.\n","authors":["Yabing Wang","Shuhui Wang","Hao Luo","Jianfeng Dong","Fan Wang","Meng Han","Xun Wang","Meng Wang"],"pdf_url":"https://arxiv.org/pdf/2309.05451v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.05448v1","updated":"2023-09-11T13:41:27Z","published":"2023-09-11T13:41:27Z","title":"Panoptic Vision-Language Feature Fields","summary":" Recently, methods have been proposed for 3D open-vocabulary semantic\nsegmentation. Such methods are able to segment scenes into arbitrary classes\ngiven at run-time using their text description. In this paper, we propose to\nour knowledge the first algorithm for open-vocabulary panoptic segmentation,\nsimultaneously performing both semantic and instance segmentation. Our\nalgorithm, Panoptic Vision-Language Feature Fields (PVLFF) learns a feature\nfield of the scene, jointly learning vision-language features and hierarchical\ninstance features through a contrastive loss function from 2D instance segment\nproposals on input frames. Our method achieves comparable performance against\nthe state-of-the-art close-set 3D panoptic systems on the HyperSim, ScanNet and\nReplica dataset and outperforms current 3D open-vocabulary systems in terms of\nsemantic segmentation. We additionally ablate our method to demonstrate the\neffectiveness of our model architecture. Our code will be available at\nhttps://github.com/ethz-asl/autolabel.\n","authors":["Haoran Chen","Kenneth Blomqvist","Francesco Milano","Roland Siegwart"],"pdf_url":"https://arxiv.org/pdf/2309.05448v1.pdf","comment":"This work has been submitted to the IEEE for possible publication.\n Copyright may be transferred without notice, after which this version may no\n longer be accessible"},{"id":"http://arxiv.org/abs/2309.05446v1","updated":"2023-09-11T13:39:15Z","published":"2023-09-11T13:39:15Z","title":"A Localization-to-Segmentation Framework for Automatic Tumor\n Segmentation in Whole-Body PET/CT Images","summary":" Fluorodeoxyglucose (FDG) positron emission tomography(PET) combined with\ncomputed tomography (CT) is considered the primary solution for detecting some\ncancers, such as lung cancer and melanoma. Automatic segmentation of tumors in\nPET/CT images can help reduce doctors' workload, thereby improving diagnostic\nquality. 
However, precise tumor segmentation is challenging due to the small\nsize of many tumors and the similarity of high-uptake normal areas to the tumor\nregions. To address these issues, this paper proposes a\nlocalization-to-segmentation framework (L2SNet) for precise tumor segmentation.\nL2SNet first localizes the possible lesions in the lesion localization phase\nand then uses the location cues to shape the segmentation results in the lesion\nsegmentation phase. To further improve the segmentation performance of L2SNet,\nwe design an adaptive threshold scheme that takes the segmentation results of\nthe two phases into consideration. The experiments with the MICCAI 2023\nAutomated Lesion Segmentation in Whole-Body FDG-PET/CT challenge dataset show\nthat our method achieved a competitive result and was ranked in the top 7\nmethods on the preliminary test set. Our work is available at:\nhttps://github.com/MedCAI/L2SNet.\n","authors":["Linghan Cai","Jianhao Huang","Zihang Zhu","Jinpeng Lu","Yongbing Zhang"],"pdf_url":"https://arxiv.org/pdf/2309.05446v1.pdf","comment":"7 pages,3 figures"},{"id":"http://arxiv.org/abs/2309.05438v1","updated":"2023-09-11T13:21:26Z","published":"2023-09-11T13:21:26Z","title":"Towards Content-based Pixel Retrieval in Revisited Oxford and Paris","summary":" This paper introduces the first two pixel retrieval benchmarks. Pixel\nretrieval is segmented instance retrieval. Like semantic segmentation extends\nclassification to the pixel level, pixel retrieval is an extension of image\nretrieval and offers information about which pixels are related to the query\nobject. In addition to retrieving images for the given query, it helps users\nquickly identify the query object in true positive images and exclude false\npositive images by denoting the correlated pixels. Our user study results show\npixel-level annotation can significantly improve the user experience.\n Compared with semantic and instance segmentation, pixel retrieval requires a\nfine-grained recognition capability for variable-granularity targets. To this\nend, we propose pixel retrieval benchmarks named PROxford and PRParis, which\nare based on the widely used image retrieval datasets, ROxford and RParis.\nThree professional annotators label 5,942 images with two rounds of\ndouble-checking and refinement. Furthermore, we conduct extensive experiments\nand analysis on the SOTA methods in image search, image matching, detection,\nsegmentation, and dense matching using our pixel retrieval benchmarks. Results\nshow that the pixel retrieval task is challenging to these approaches and\ndistinctive from existing problems, suggesting that further research can\nadvance the content-based pixel-retrieval and thus user search experience. 
The\ndatasets can be downloaded from\n\\href{https://github.com/anguoyuan/Pixel_retrieval-Segmented_instance_retrieval}{this\nlink}.\n","authors":["Guoyuan An","Woo Jae Kim","Saelyne Yang","Rong Li","Yuchi Huo","Sung-Eui Yoon"],"pdf_url":"https://arxiv.org/pdf/2309.05438v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.11974v2","updated":"2023-09-11T13:18:55Z","published":"2023-08-23T07:46:44Z","title":"Blending-NeRF: Text-Driven Localized Editing in Neural Radiance Fields","summary":" Text-driven localized editing of 3D objects is particularly difficult as\nlocally mixing the original 3D object with the intended new object and style\neffects without distorting the object's form is not a straightforward process.\nTo address this issue, we propose a novel NeRF-based model, Blending-NeRF,\nwhich consists of two NeRF networks: pretrained NeRF and editable NeRF.\nAdditionally, we introduce new blending operations that allow Blending-NeRF to\nproperly edit target regions which are localized by text. By using a pretrained\nvision-language aligned model, CLIP, we guide Blending-NeRF to add new objects\nwith varying colors and densities, modify textures, and remove parts of the\noriginal object. Our extensive experiments demonstrate that Blending-NeRF\nproduces naturally and locally edited 3D objects from various text prompts. Our\nproject page is available at https://seokhunchoi.github.io/Blending-NeRF/\n","authors":["Hyeonseop Song","Seokhun Choi","Hoseok Do","Chul Lee","Taehyeong Kim"],"pdf_url":"https://arxiv.org/pdf/2308.11974v2.pdf","comment":"Accepted to ICCV 2023. The first two authors contributed equally to\n this work"},{"id":"http://arxiv.org/abs/2212.00330v3","updated":"2023-09-11T12:51:11Z","published":"2022-12-01T07:32:56Z","title":"Reliable Joint Segmentation of Retinal Edema Lesions in OCT Images","summary":" Focusing on the complicated pathological features, such as blurred\nboundaries, severe scale differences between symptoms, background noise\ninterference, etc., in the task of retinal edema lesions joint segmentation\nfrom OCT images and enabling the segmentation results more reliable. In this\npaper, we propose a novel reliable multi-scale wavelet-enhanced transformer\nnetwork, which can provide accurate segmentation results with reliability\nassessment. Specifically, aiming at improving the model's ability to learn the\ncomplex pathological features of retinal edema lesions in OCT images, we\ndevelop a novel segmentation backbone that integrates a wavelet-enhanced\nfeature extractor network and a multi-scale transformer module of our newly\ndesigned. Meanwhile, to make the segmentation results more reliable, a novel\nuncertainty segmentation head based on the subjective logical evidential theory\nis introduced to generate the final segmentation results with a corresponding\noverall uncertainty evaluation score map. We conduct comprehensive experiments\non the public database of AI-Challenge 2018 for retinal edema lesions\nsegmentation, and the results show that our proposed method achieves better\nsegmentation accuracy with a high degree of reliability as compared to other\nstate-of-the-art segmentation approaches. 
The code will be released on:\nhttps://github.com/LooKing9218/ReliableRESeg.\n","authors":["Meng Wang","Kai Yu","Chun-Mei Feng","Ke Zou","Yanyu Xu","Qingquan Meng","Rick Siow Mong Goh","Yong Liu","Xinxing Xu","Huazhu Fu"],"pdf_url":"https://arxiv.org/pdf/2212.00330v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.05418v1","updated":"2023-09-11T12:35:17Z","published":"2023-09-11T12:35:17Z","title":"FlowIBR: Leveraging Pre-Training for Efficient Neural Image-Based\n Rendering of Dynamic Scenes","summary":" We introduce a novel approach for monocular novel view synthesis of dynamic\nscenes. Existing techniques already show impressive rendering quality but tend\nto focus on optimization within a single scene without leveraging prior\nknowledge. This limitation has been primarily attributed to the lack of\ndatasets of dynamic scenes available for training and the diversity of scene\ndynamics. Our method FlowIBR circumvents these issues by integrating a neural\nimage-based rendering method, pre-trained on a large corpus of widely available\nstatic scenes, with a per-scene optimized scene flow field. Utilizing this flow\nfield, we bend the camera rays to counteract the scene dynamics, thereby\npresenting the dynamic scene as if it were static to the rendering network. The\nproposed method reduces per-scene optimization time by an order of magnitude,\nachieving comparable results to existing methods - all on a single\nconsumer-grade GPU.\n","authors":["Marcel Büsching","Josef Bengtson","David Nilsson","Mårten Björkman"],"pdf_url":"https://arxiv.org/pdf/2309.05418v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.05406v1","updated":"2023-09-11T12:12:52Z","published":"2023-09-11T12:12:52Z","title":"Treatment-aware Diffusion Probabilistic Model for Longitudinal MRI\n Generation and Diffuse Glioma Growth Prediction","summary":" Diffuse gliomas are malignant brain tumors that grow widespread through the\nbrain. The complex interactions between neoplastic cells and normal tissue, as\nwell as the treatment-induced changes often encountered, make glioma tumor\ngrowth modeling challenging. In this paper, we present a novel end-to-end\nnetwork capable of generating future tumor masks and realistic MRIs of how the\ntumor will look at any future time points for different treatment plans. Our\nmodel is built upon cutting-edge diffusion probabilistic models and\ndeep-segmentation neural networks. We extended a diffusion model to include\nsequential multi-parametric MRI and treatment information as conditioning input\nto guide the generative diffusion process. This allows us to estimate tumor\ngrowth at any given time point. We trained the model using real-world\npostoperative longitudinal MRI data with glioma tumor growth trajectories\nrepresented as tumor segmentation maps over time. The model has demonstrated\npromising performance across a range of tasks, including the generation of\nhigh-quality synthetic MRIs with tumor masks, time-series tumor segmentations,\nand uncertainty estimation. 
Combined with the treatment-aware generated MRIs,\nthe tumor growth predictions with uncertainty estimates can provide useful\ninformation for clinical decision-making.\n","authors":["Qinghui Liu","Elies Fuster-Garcia","Ivar Thokle Hovden","Donatas Sederevicius","Karoline Skogen","Bradley J MacIntosh","Edvard Grødem","Till Schellhorn","Petter Brandal","Atle Bjørnerud","Kyrre Eeg Emblem"],"pdf_url":"https://arxiv.org/pdf/2309.05406v1.pdf","comment":"13 pages, 10 figures, 2 tables, 2 agls, pre-print-v1"},{"id":"http://arxiv.org/abs/2309.05405v1","updated":"2023-09-11T12:12:25Z","published":"2023-09-11T12:12:25Z","title":"Two-Stage Hybrid Supervision Framework for Fast, Low-resource, and\n Accurate Organ and Pan-cancer Segmentation in Abdomen CT","summary":" Abdominal organ and tumour segmentation has many important clinical\napplications, such as organ quantification, surgical planning, and disease\ndiagnosis. However, manual assessment is inherently subjective with\nconsiderable inter- and intra-expert variability. In the paper, we propose a\nhybrid supervised framework, StMt, that integrates self-training and mean\nteacher for the segmentation of abdominal organs and tumors using partially\nlabeled and unlabeled data. We introduce a two-stage segmentation pipeline and\nwhole-volume-based input strategy to maximize segmentation accuracy while\nmeeting the requirements of inference time and GPU memory usage. Experiments on\nthe validation set of FLARE2023 demonstrate that our method achieves excellent\nsegmentation performance as well as fast and low-resource model inference. Our\nmethod achieved an average DSC score of 89.79\\% and 45.55 \\% for the organs and\nlesions on the validation set and the average running time and area under GPU\nmemory-time cure are 11.25s and 9627.82MB, respectively.\n","authors":["Wentao Liu","Tong Tian","Weijin Xu","Lemeng Wang","Haoyuan Li","Huihua Yang"],"pdf_url":"https://arxiv.org/pdf/2309.05405v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2211.14049v2","updated":"2023-09-11T11:54:14Z","published":"2022-11-25T12:09:12Z","title":"Task-Oriented Communication for Edge Video Analytics","summary":" With the development of artificial intelligence (AI) techniques and the\nincreasing popularity of camera-equipped devices, many edge video analytics\napplications are emerging, calling for the deployment of computation-intensive\nAI models at the network edge. Edge inference is a promising solution to move\nthe computation-intensive workloads from low-end devices to a powerful edge\nserver for video analytics, but the device-server communications will remain a\nbottleneck due to the limited bandwidth. This paper proposes a task-oriented\ncommunication framework for edge video analytics, where multiple devices\ncollect the visual sensory data and transmit the informative features to an\nedge server for processing. To enable low-latency inference, this framework\nremoves video redundancy in spatial and temporal domains and transmits minimal\ninformation that is essential for the downstream task, rather than\nreconstructing the videos at the edge server. Specifically, it extracts compact\ntask-relevant features based on the deterministic information bottleneck (IB)\nprinciple, which characterizes a tradeoff between the informativeness of the\nfeatures and the communication cost. 
As the features of consecutive frames are\ntemporally correlated, we propose a temporal entropy model (TEM) to reduce the\nbitrate by taking the previous features as side information in feature\nencoding. To further improve the inference performance, we build a\nspatial-temporal fusion module at the server to integrate features of the\ncurrent and previous frames for joint inference. Extensive experiments on video\nanalytics tasks evidence that the proposed framework effectively encodes\ntask-relevant information of video data and achieves a better rate-performance\ntradeoff than existing methods.\n","authors":["Jiawei Shao","Xinjie Zhang","Jun Zhang"],"pdf_url":"https://arxiv.org/pdf/2211.14049v2.pdf","comment":"This paper was accepted to IEEE Transactions on Wireless\n Communications (TWC)"},{"id":"http://arxiv.org/abs/2309.05388v1","updated":"2023-09-11T11:35:17Z","published":"2023-09-11T11:35:17Z","title":"Robust Single Rotation Averaging Revisited","summary":" In this work, we propose a novel method for robust single rotation averaging\nthat can efficiently handle an extremely large fraction of outliers. Our\napproach is to minimize the total truncated least unsquared deviations (TLUD)\ncost of geodesic distances. The proposed algorithm consists of three steps:\nFirst, we consider each input rotation as a potential initial solution and\nchoose the one that yields the least sum of truncated chordal deviations. Next,\nwe obtain the inlier set using the initial solution and compute its chordal\n$L_2$-mean. Finally, starting from this estimate, we iteratively compute the\ngeodesic $L_1$-mean of the inliers using the Weiszfeld algorithm on $SO(3)$. An\nextensive evaluation shows that our method is robust against up to 99% outliers\ngiven a sufficient number of accurate inliers, outperforming the current state\nof the art.\n","authors":["Seong Hun Lee","Javier Civera"],"pdf_url":"https://arxiv.org/pdf/2309.05388v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2207.01545v2","updated":"2023-09-11T11:33:58Z","published":"2022-07-04T16:13:27Z","title":"Masked Autoencoders in 3D Point Cloud Representation Learning","summary":" Transformer-based Self-supervised Representation Learning methods learn\ngeneric features from unlabeled datasets for providing useful network\ninitialization parameters for downstream tasks. Recently, self-supervised\nlearning based upon masking local surface patches for 3D point cloud data has\nbeen under-explored. In this paper, we propose masked Autoencoders in 3D point\ncloud representation learning (abbreviated as MAE3D), a novel autoencoding\nparadigm for self-supervised learning. We first split the input point cloud\ninto patches and mask a portion of them, then use our Patch Embedding Module to\nextract the features of unmasked patches. Secondly, we employ patch-wise MAE3D\nTransformers to learn both local features of point cloud patches and high-level\ncontextual relationships between patches and complete the latent\nrepresentations of masked patches. We use our Point Cloud Reconstruction Module\nwith multi-task loss to complete the incomplete point cloud as a result. We\nconduct self-supervised pre-training on ShapeNet55 with the point cloud\ncompletion pre-text task and fine-tune the pre-trained model on ModelNet40 and\nScanObjectNN (PB\\_T50\\_RS, the hardest variant). 
Comprehensive experiments\ndemonstrate that the local features extracted by our MAE3D from point cloud\npatches are beneficial for downstream classification tasks, soundly\noutperforming state-of-the-art methods ($93.4\\%$ and $86.2\\%$ classification\naccuracy, respectively).\n","authors":["Jincen Jiang","Xuequan Lu","Lizhi Zhao","Richard Dazeley","Meili Wang"],"pdf_url":"https://arxiv.org/pdf/2207.01545v2.pdf","comment":"Accepted to IEEE Transactions on Multimedia"},{"id":"http://arxiv.org/abs/2304.04321v2","updated":"2023-09-11T11:27:53Z","published":"2023-04-09T21:42:57Z","title":"ARNOLD: A Benchmark for Language-Grounded Task Learning With Continuous\n States in Realistic 3D Scenes","summary":" Understanding the continuous states of objects is essential for task learning\nand planning in the real world. However, most existing task learning benchmarks\nassume discrete (e.g., binary) object goal states, which poses challenges for\nthe learning of complex tasks and transferring learned policy from simulated\nenvironments to the real world. Furthermore, state discretization limits a\nrobot's ability to follow human instructions based on the grounding of actions\nand states. To tackle these challenges, we present ARNOLD, a benchmark that\nevaluates language-grounded task learning with continuous states in realistic\n3D scenes. ARNOLD is comprised of 8 language-conditioned tasks that involve\nunderstanding object states and learning policies for continuous goals. To\npromote language-instructed learning, we provide expert demonstrations with\ntemplate-generated language descriptions. We assess task performance by\nutilizing the latest language-conditioned policy learning models. Our results\nindicate that current models for language-conditioned manipulations continue to\nexperience significant challenges in novel goal-state generalizations, scene\ngeneralizations, and object generalizations. These findings highlight the need\nto develop new algorithms that address this gap and underscore the potential\nfor further research in this area. Project website:\nhttps://arnold-benchmark.github.io.\n","authors":["Ran Gong","Jiangyong Huang","Yizhou Zhao","Haoran Geng","Xiaofeng Gao","Qingyang Wu","Wensi Ai","Ziheng Zhou","Demetri Terzopoulos","Song-Chun Zhu","Baoxiong Jia","Siyuan Huang"],"pdf_url":"https://arxiv.org/pdf/2304.04321v2.pdf","comment":"The first two authors contributed equally; 20 pages; 17 figures;\n project availalbe: https://arnold-benchmark.github.io/ ICCV 2023"},{"id":"http://arxiv.org/abs/2309.05380v1","updated":"2023-09-11T11:04:31Z","published":"2023-09-11T11:04:31Z","title":"Collective PV-RCNN: A Novel Fusion Technique using Collective Detections\n for Enhanced Local LiDAR-Based Perception","summary":" Comprehensive perception of the environment is crucial for the safe operation\nof autonomous vehicles. However, the perception capabilities of autonomous\nvehicles are limited due to occlusions, limited sensor ranges, or environmental\ninfluences. Collective Perception (CP) aims to mitigate these problems by\nenabling the exchange of information between vehicles. A major challenge in CP\nis the fusion of the exchanged information. Due to the enormous bandwidth\nrequirement of early fusion approaches and the interchangeability issues of\nintermediate fusion approaches, only the late fusion of shared detections is\npractical. 
Current late fusion approaches neglect valuable information for\nlocal detection, this is why we propose a novel fusion method to fuse the\ndetections of cooperative vehicles within the local LiDAR-based detection\npipeline. Therefore, we present Collective PV-RCNN (CPV-RCNN), which extends\nthe PV-RCNN++ framework to fuse collective detections. Code is available at\nhttps://github.com/ekut-es\n","authors":["Sven Teufel","Jörg Gamerdinger","Georg Volk","Oliver Bringmann"],"pdf_url":"https://arxiv.org/pdf/2309.05380v1.pdf","comment":"accepted at IEEE ITSC 2023"},{"id":"http://arxiv.org/abs/2309.05375v1","updated":"2023-09-11T10:54:22Z","published":"2023-09-11T10:54:22Z","title":"CNN or ViT? Revisiting Vision Transformers Through the Lens of\n Convolution","summary":" The success of Vision Transformer (ViT) has been widely reported on a wide\nrange of image recognition tasks. The merit of ViT over CNN has been largely\nattributed to large training datasets or auxiliary pre-training. Without\npre-training, the performance of ViT on small datasets is limited because the\nglobal self-attention has limited capacity in local modeling. Towards boosting\nViT on small datasets without pre-training, this work improves its local\nmodeling by applying a weight mask on the original self-attention matrix. A\nstraightforward way to locally adapt the self-attention matrix can be realized\nby an element-wise learnable weight mask (ELM), for which our preliminary\nresults show promising results. However, the element-wise simple learnable\nweight mask not only induces a non-trivial additional parameter overhead but\nalso increases the optimization complexity. To this end, this work proposes a\nnovel Gaussian mixture mask (GMM) in which one mask only has two learnable\nparameters and it can be conveniently used in any ViT variants whose attention\nmechanism allows the use of masks. Experimental results on multiple small\ndatasets demonstrate that the effectiveness of our proposed Gaussian mask for\nboosting ViTs for free (almost zero additional parameter or computation cost).\nOur code will be publicly available at\n\\href{https://github.com/CatworldLee/Gaussian-Mixture-Mask-Attention}{https://github.com/CatworldLee/Gaussian-Mixture-Mask-Attention}.\n","authors":["Chenghao Li","Chaoning Zhang"],"pdf_url":"https://arxiv.org/pdf/2309.05375v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2305.08455v3","updated":"2023-09-11T10:36:41Z","published":"2023-05-15T08:54:32Z","title":"Document Understanding Dataset and Evaluation (DUDE)","summary":" We call on the Document AI (DocAI) community to reevaluate current\nmethodologies and embrace the challenge of creating more practically-oriented\nbenchmarks. Document Understanding Dataset and Evaluation (DUDE) seeks to\nremediate the halted research progress in understanding visually-rich documents\n(VRDs). We present a new dataset with novelties related to types of questions,\nanswers, and document layouts based on multi-industry, multi-domain, and\nmulti-page VRDs of various origins, and dates. Moreover, we are pushing the\nboundaries of current methods by creating multi-task and multi-domain\nevaluation setups that more accurately simulate real-world situations where\npowerful generalization and adaptation under low-resource settings are desired.\nDUDE aims to set a new standard as a more practical, long-standing benchmark\nfor the community, and we hope that it will lead to future extensions and\ncontributions that address real-world challenges. 
Finally, our work illustrates\nthe importance of finding more efficient ways to model language, images, and\nlayout in DocAI.\n","authors":["Jordy Van Landeghem","Rubén Tito","Łukasz Borchmann","Michał Pietruszka","Paweł Józiak","Rafał Powalski","Dawid Jurkiewicz","Mickaël Coustaty","Bertrand Ackaert","Ernest Valveny","Matthew Blaschko","Sien Moens","Tomasz Stanisławek"],"pdf_url":"https://arxiv.org/pdf/2305.08455v3.pdf","comment":"Accepted at ICCV 2023"},{"id":"http://arxiv.org/abs/2112.00234v3","updated":"2023-09-11T10:13:21Z","published":"2021-12-01T02:10:42Z","title":"MC-Blur: A Comprehensive Benchmark for Image Deblurring","summary":" Blur artifacts can seriously degrade the visual quality of images, and\nnumerous deblurring methods have been proposed for specific scenarios. However,\nin most real-world images, blur is caused by different factors, e.g., motion\nand defocus. In this paper, we address how different deblurring methods perform\nin the case of multiple types of blur. For in-depth performance evaluation, we\nconstruct a new large-scale multi-cause image deblurring dataset (called\nMC-Blur), including real-world and synthesized blurry images with mixed factors\nof blurs. The images in the proposed MC-Blur dataset are collected using\ndifferent techniques: averaging sharp images captured by a 1000-fps high-speed\ncamera, convolving Ultra-High-Definition (UHD) sharp images with large-size\nkernels, adding defocus to images, and real-world blurry images captured by\nvarious camera models. Based on the MC-Blur dataset, we conduct extensive\nbenchmarking studies to compare SOTA methods in different scenarios, analyze\ntheir efficiency, and investigate the built dataset's capacity. These\nbenchmarking results provide a comprehensive overview of the advantages and\nlimitations of current deblurring methods, and reveal the advances of our\ndataset.\n","authors":["Kaihao Zhang","Tao Wang","Wenhan Luo","Boheng Chen","Wenqi Ren","Bjorn Stenger","Wei Liu","Hongdong Li","Ming-Hsuan Yang"],"pdf_url":"https://arxiv.org/pdf/2112.00234v3.pdf","comment":"To appear in IEEE TCSVT"},{"id":"http://arxiv.org/abs/2205.01694v3","updated":"2023-09-11T10:06:19Z","published":"2022-05-03T18:00:01Z","title":"End2End Multi-View Feature Matching with Differentiable Pose\n Optimization","summary":" Erroneous feature matches have severe impact on subsequent camera pose\nestimation and often require additional, time-costly measures, like RANSAC, for\noutlier rejection. Our method tackles this challenge by addressing feature\nmatching and pose optimization jointly. To this end, we propose a graph\nattention network to predict image correspondences along with confidence\nweights. The resulting matches serve as weighted constraints in a\ndifferentiable pose estimation. Training feature matching with gradients from\npose optimization naturally learns to down-weight outliers and boosts pose\nestimation on image pairs compared to SuperGlue by 6.7% on ScanNet. At the same\ntime, it reduces the pose estimation time by over 50% and renders RANSAC\niterations unnecessary. Moreover, we integrate information from multiple views\nby spanning the graph across multiple frames to predict the matches all at\nonce. 
Multi-view matching combined with end-to-end training improves the pose\nestimation metrics on Matterport3D by 18.5% compared to SuperGlue.\n","authors":["Barbara Roessle","Matthias Nießner"],"pdf_url":"https://arxiv.org/pdf/2205.01694v3.pdf","comment":"ICCV 2023, project page:\n https://barbararoessle.github.io/e2e_multi_view_matching , video:\n https://youtu.be/uuLb6GfM9Cg"},{"id":"http://arxiv.org/abs/2212.01173v2","updated":"2023-09-11T09:54:18Z","published":"2022-12-02T13:55:41Z","title":"DWRSeg: Rethinking Efficient Acquisition of Multi-scale Contextual\n Information for Real-time Semantic Segmentation","summary":" Many current works directly adopt multi-rate depth-wise dilated convolutions\nto capture multi-scale contextual information simultaneously from one input\nfeature map, thus improving the feature extraction efficiency for real-time\nsemantic segmentation. However, this design may lead to difficult access to\nmulti-scale contextual information because of the unreasonable structure and\nhyperparameters. To lower the difficulty of drawing multi-scale contextual\ninformation, we propose a highly efficient multi-scale feature extraction\nmethod, which decomposes the original single-step method into two steps, Region\nResidualization-Semantic Residualization.In this method, the multi-rate\ndepth-wise dilated convolutions take a simpler role in feature extraction:\nperforming simple semantic-based morphological filtering with one desired\nreceptive field in the second step based on each concise feature map of region\nform provided by the first step, to improve their efficiency. Moreover, the\ndilation rates and the capacity of dilated convolutions for each network stage\nare elaborated to fully utilize all the feature maps of region form that can be\nachieved.Accordingly, we design a novel Dilation-wise Residual (DWR) module and\na Simple Inverted Residual (SIR) module for the high and low level network,\nrespectively, and form a powerful DWR Segmentation (DWRSeg) network. Extensive\nexperiments on the Cityscapes and CamVid datasets demonstrate the effectiveness\nof our method by achieving a state-of-the-art trade-off between accuracy and\ninference speed, in addition to being lighter weight. Without pretraining or\nresorting to any training trick, we achieve an mIoU of 72.7% on the Cityscapes\ntest set at a speed of 319.5 FPS on one NVIDIA GeForce GTX 1080 Ti card, which\nexceeds the latest methods of a speed of 69.5 FPS and 0.8% mIoU. The code and\ntrained models are publicly available.\n","authors":["Haoran Wei","Xu Liu","Shouchun Xu","Zhongjian Dai","Yaping Dai","Xiangyang Xu"],"pdf_url":"https://arxiv.org/pdf/2212.01173v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.05346v1","updated":"2023-09-11T09:45:22Z","published":"2023-09-11T09:45:22Z","title":"Learning Geometric Representations of Objects via Interaction","summary":" We address the problem of learning representations from observations of a\nscene involving an agent and an external object the agent interacts with. To\nthis end, we propose a representation learning framework extracting the\nlocation in physical space of both the agent and the object from unstructured\nobservations of arbitrary nature. Our framework relies on the actions performed\nby the agent as the only source of supervision, while assuming that the object\nis displaced by the agent via unknown dynamics. 
We provide a theoretical\nfoundation and formally prove that an ideal learner is guaranteed to infer an\nisometric representation, disentangling the agent from the object and correctly\nextracting their locations. We evaluate empirically our framework on a variety\nof scenarios, showing that it outperforms vision-based approaches such as a\nstate-of-the-art keypoint extractor. We moreover demonstrate how the extracted\nrepresentations enable the agent to solve downstream tasks via reinforcement\nlearning in an efficient manner.\n","authors":["Alfredo Reichlin","Giovanni Luca Marchetti","Hang Yin","Anastasiia Varava","Danica Kragic"],"pdf_url":"https://arxiv.org/pdf/2309.05346v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.00586v3","updated":"2023-09-11T09:43:35Z","published":"2023-07-02T15:05:15Z","title":"ClipSitu: Effectively Leveraging CLIP for Conditional Predictions in\n Situation Recognition","summary":" Situation Recognition is the task of generating a structured summary of what\nis happening in an image using an activity verb and the semantic roles played\nby actors and objects. In this task, the same activity verb can describe a\ndiverse set of situations as well as the same actor or object category can play\na diverse set of semantic roles depending on the situation depicted in the\nimage. Hence a situation recognition model needs to understand the context of\nthe image and the visual-linguistic meaning of semantic roles. Therefore, we\nleverage the CLIP foundational model that has learned the context of images via\nlanguage descriptions. We show that deeper-and-wider multi-layer perceptron\n(MLP) blocks obtain noteworthy results for the situation recognition task by\nusing CLIP image and text embedding features and it even outperforms the\nstate-of-the-art CoFormer, a Transformer-based model, thanks to the external\nimplicit visual-linguistic knowledge encapsulated by CLIP and the expressive\npower of modern MLP block designs. Motivated by this, we design a\ncross-attention-based Transformer using CLIP visual tokens that model the\nrelation between textual roles and visual entities. Our cross-attention-based\nTransformer known as ClipSitu XTF outperforms existing state-of-the-art by a\nlarge margin of 14.1\\% on semantic role labelling (value) for top-1 accuracy\nusing imSitu dataset. {Similarly, our ClipSitu XTF obtains state-of-the-art\nsituation localization performance.} We will make the code publicly available.\n","authors":["Debaditya Roy","Dhruv Verma","Basura Fernando"],"pdf_url":"https://arxiv.org/pdf/2307.00586v3.pdf","comment":"State-of-the-art results on Grounded Situation Recognition"},{"id":"http://arxiv.org/abs/2309.05339v1","updated":"2023-09-11T09:35:51Z","published":"2023-09-11T09:35:51Z","title":"PAg-NeRF: Towards fast and efficient end-to-end panoptic 3D\n representations for agricultural robotics","summary":" Precise scene understanding is key for most robot monitoring and intervention\ntasks in agriculture. In this work we present PAg-NeRF which is a novel\nNeRF-based system that enables 3D panoptic scene understanding. Our\nrepresentation is trained using an image sequence with noisy robot odometry\nposes and automatic panoptic predictions with inconsistent IDs between frames.\nDespite this noisy input, our system is able to output scene geometry,\nphoto-realistic renders and 3D consistent panoptic representations with\nconsistent instance IDs. 
We evaluate this novel system in a very challenging\nhorticultural scenario and in doing so demonstrate an end-to-end trainable\nsystem that can make use of noisy robot poses rather than precise poses that\nhave to be pre-calculated. Compared to a baseline approach the peak signal to\nnoise ratio is improved from 21.34dB to 23.37dB while the panoptic quality\nimproves from 56.65% to 70.08%. Furthermore, our approach is faster and can be\ntuned to improve inference time by more than a factor of 2 while being memory\nefficient with approximately 12 times fewer parameters.\n","authors":["Claus Smitt","Michael Halstead","Patrick Zimmer","Thomas Läbe","Esra Guclu","Cyrill Stachniss","Chris McCool"],"pdf_url":"https://arxiv.org/pdf/2309.05339v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.05334v1","updated":"2023-09-11T09:32:45Z","published":"2023-09-11T09:32:45Z","title":"MultIOD: Rehearsal-free Multihead Incremental Object Detector","summary":" Class-Incremental learning (CIL) is the ability of artificial agents to\naccommodate new classes as they appear in a stream. It is particularly\ninteresting in evolving environments where agents have limited access to memory\nand computational resources. The main challenge of class-incremental learning\nis catastrophic forgetting, the inability of neural networks to retain past\nknowledge when learning a new one. Unfortunately, most existing\nclass-incremental object detectors are applied to two-stage algorithms such as\nFaster-RCNN and rely on rehearsal memory to retain past knowledge. We believe\nthat the current benchmarks are not realistic, and more effort should be\ndedicated to anchor-free and rehearsal-free object detection. In this context,\nwe propose MultIOD, a class-incremental object detector based on CenterNet. Our\nmain contributions are: (1) we propose a multihead feature pyramid and\nmultihead detection architecture to efficiently separate class representations,\n(2) we employ transfer learning between classes learned initially and those\nlearned incrementally to tackle catastrophic forgetting, and (3) we use a\nclass-wise non-max-suppression as a post-processing technique to remove\nredundant boxes. Without bells and whistles, our method outperforms a range of\nstate-of-the-art methods on two Pascal VOC datasets.\n","authors":["Eden Belouadah","Arnaud Dapogny","Kevin Bailly"],"pdf_url":"https://arxiv.org/pdf/2309.05334v1.pdf","comment":"Under review at the WACV 2024 conference"},{"id":"http://arxiv.org/abs/2309.05330v1","updated":"2023-09-11T09:26:07Z","published":"2023-09-11T09:26:07Z","title":"Diff-Privacy: Diffusion-based Face Privacy Protection","summary":" Privacy protection has become a top priority as the proliferation of AI\ntechniques has led to widespread collection and misuse of personal data.\nAnonymization and visual identity information hiding are two important facial\nprivacy protection tasks that aim to remove identification characteristics from\nfacial images at the human perception level. However, they have a significant\ndifference in that the former aims to prevent the machine from recognizing\ncorrectly, while the latter needs to ensure the accuracy of machine\nrecognition. Therefore, it is difficult to train a model to complete these two\ntasks simultaneously. In this paper, we unify the task of anonymization and\nvisual identity information hiding and propose a novel face privacy protection\nmethod based on diffusion models, dubbed Diff-Privacy. 
Specifically, we train\nour proposed multi-scale image inversion module (MSI) to obtain a set of SDM\nformat conditional embeddings of the original image. Based on the conditional\nembeddings, we design corresponding embedding scheduling strategies and\nconstruct different energy functions during the denoising process to achieve\nanonymization and visual identity information hiding. Extensive experiments\nhave been conducted to validate the effectiveness of our proposed framework in\nprotecting facial privacy.\n","authors":["Xiao He","Mingrui Zhu","Dongxin Chen","Nannan Wang","Xinbo Gao"],"pdf_url":"https://arxiv.org/pdf/2309.05330v1.pdf","comment":"17pages"},{"id":"http://arxiv.org/abs/2309.04190v2","updated":"2023-09-11T09:12:39Z","published":"2023-09-08T08:03:42Z","title":"SegmentAnything helps microscopy images based automatic and quantitative\n organoid detection and analysis","summary":" Organoids are self-organized 3D cell clusters that closely mimic the\narchitecture and function of in vivo tissues and organs. Quantification of\norganoid morphology helps in studying organ development, drug discovery, and\ntoxicity assessment. Recent microscopy techniques provide a potent tool to\nacquire organoid morphology features, but manual image analysis remains a labor\nand time-intensive process. Thus, this paper proposes a comprehensive pipeline\nfor microscopy analysis that leverages the SegmentAnything to precisely\ndemarcate individual organoids. Additionally, we introduce a set of\nmorphological properties, including perimeter, area, radius, non-smoothness,\nand non-circularity, allowing researchers to analyze the organoid structures\nquantitatively and automatically. To validate the effectiveness of our\napproach, we conducted tests on bright-field images of human induced\npluripotent stem cells (iPSCs) derived neural-epithelial (NE) organoids. The\nresults obtained from our automatic pipeline closely align with manual organoid\ndetection and measurement, showcasing the capability of our proposed method in\naccelerating organoids morphology analysis.\n","authors":["Xiaodan Xing","Chunling Tang","Yunzhe Guo","Nicholas Kurniawan","Guang Yang"],"pdf_url":"https://arxiv.org/pdf/2309.04190v2.pdf","comment":"submitted to SPIE: Medical Imaging 2024"},{"id":"http://arxiv.org/abs/2309.05314v1","updated":"2023-09-11T08:59:15Z","published":"2023-09-11T08:59:15Z","title":"Semantic Latent Decomposition with Normalizing Flows for Face Editing","summary":" Navigating in the latent space of StyleGAN has shown effectiveness for face\nediting. However, the resulting methods usually encounter challenges in\ncomplicated navigation due to the entanglement among different attributes in\nthe latent space. To address this issue, this paper proposes a novel framework,\ntermed SDFlow, with a semantic decomposition in original latent space using\ncontinuous conditional normalizing flows. Specifically, SDFlow decomposes the\noriginal latent code into different irrelevant variables by jointly optimizing\ntwo components: (i) a semantic encoder to estimate semantic variables from\ninput faces and (ii) a flow-based transformation module to map the latent code\ninto a semantic-irrelevant variable in Gaussian distribution, conditioned on\nthe learned semantic variables. 
To eliminate the entanglement between\nvariables, we employ a disentangled learning strategy under a mutual\ninformation framework, thereby providing precise manipulation controls.\nExperimental results demonstrate that SDFlow outperforms existing\nstate-of-the-art face editing methods both qualitatively and quantitatively.\nThe source code is made available at https://github.com/phil329/SDFlow.\n","authors":["Binglei Li","Zhizhong Huang","Hongming Shan","Junping Zhang"],"pdf_url":"https://arxiv.org/pdf/2309.05314v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.14448v2","updated":"2023-09-11T08:56:32Z","published":"2023-08-28T09:35:13Z","title":"ExpCLIP: Bridging Text and Facial Expressions via Semantic Alignment","summary":" The objective of stylized speech-driven facial animation is to create\nanimations that encapsulate specific emotional expressions. Existing methods\noften depend on pre-established emotional labels or facial expression\ntemplates, which may limit the necessary flexibility for accurately conveying\nuser intent. In this research, we introduce a technique that enables the\ncontrol of arbitrary styles by leveraging natural language as emotion prompts.\nThis technique presents benefits in terms of both flexibility and\nuser-friendliness. To realize this objective, we initially construct a\nText-Expression Alignment Dataset (TEAD), wherein each facial expression is\npaired with several prompt-like descriptions.We propose an innovative automatic\nannotation method, supported by Large Language Models (LLMs), to expedite the\ndataset construction, thereby eliminating the substantial expense of manual\nannotation. Following this, we utilize TEAD to train a CLIP-based model, termed\nExpCLIP, which encodes text and facial expressions into semantically aligned\nstyle embeddings. The embeddings are subsequently integrated into the facial\nanimation generator to yield expressive and controllable facial animations.\nGiven the limited diversity of facial emotions in existing speech-driven facial\nanimation training data, we further introduce an effective Expression Prompt\nAugmentation (EPA) mechanism to enable the animation generator to support\nunprecedented richness in style control. Comprehensive experiments illustrate\nthat our method accomplishes expressive facial animation generation and offers\nenhanced flexibility in effectively conveying the desired style.\n","authors":["Yicheng Zhong","Huawei Wei","Peiji Yang","Zhisheng Wang"],"pdf_url":"https://arxiv.org/pdf/2308.14448v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.05300v1","updated":"2023-09-11T08:35:23Z","published":"2023-09-11T08:35:23Z","title":"DeCUR: decoupling common & unique representations for multimodal\n self-supervision","summary":" The increasing availability of multi-sensor data sparks interest in\nmultimodal self-supervised learning. However, most existing approaches learn\nonly common representations across modalities while ignoring intra-modal\ntraining and modality-unique representations. We propose Decoupling Common and\nUnique Representations (DeCUR), a simple yet effective method for multimodal\nself-supervised learning. By distinguishing inter- and intra-modal embeddings,\nDeCUR is trained to integrate complementary information across different\nmodalities. 
We evaluate DeCUR in three common multimodal scenarios\n(radar-optical, RGB-elevation, and RGB-depth), and demonstrate its consistent\nbenefits on scene classification and semantic segmentation downstream tasks.\nNotably, we get straightforward improvements by transferring our pretrained\nbackbones to state-of-the-art supervised multimodal methods without any\nhyperparameter tuning. Furthermore, we conduct a comprehensive explainability\nanalysis to shed light on the interpretation of common and unique features in\nour multimodal approach. Codes are available at\n\\url{https://github.com/zhu-xlab/DeCUR}.\n","authors":["Yi Wang","Conrad M Albrecht","Nassim Ait Ali Braham","Chenying Liu","Zhitong Xiong","Xiao Xiang Zhu"],"pdf_url":"https://arxiv.org/pdf/2309.05300v1.pdf","comment":"19 pages, 10 figures"},{"id":"http://arxiv.org/abs/2306.08330v2","updated":"2023-09-11T08:34:20Z","published":"2023-06-14T08:01:24Z","title":"Multimodal Optimal Transport-based Co-Attention Transformer with Global\n Structure Consistency for Survival Prediction","summary":" Survival prediction is a complicated ordinal regression task that aims to\npredict the ranking risk of death, which generally benefits from the\nintegration of histology and genomic data. Despite the progress in joint\nlearning from pathology and genomics, existing methods still suffer from\nchallenging issues: 1) Due to the large size of pathological images, it is\ndifficult to effectively represent the gigapixel whole slide images (WSIs). 2)\nInteractions within tumor microenvironment (TME) in histology are essential for\nsurvival analysis. Although current approaches attempt to model these\ninteractions via co-attention between histology and genomic data, they focus on\nonly dense local similarity across modalities, which fails to capture global\nconsistency between potential structures, i.e. TME-related interactions of\nhistology and co-expression of genomic data. To address these challenges, we\npropose a Multimodal Optimal Transport-based Co-Attention Transformer framework\nwith global structure consistency, in which optimal transport (OT) is applied\nto match patches of a WSI and genes embeddings for selecting informative\npatches to represent the gigapixel WSI. More importantly, OT-based co-attention\nprovides a global awareness to effectively capture structural interactions\nwithin TME for survival prediction. To overcome high computational complexity\nof OT, we propose a robust and efficient implementation over micro-batch of WSI\npatches by approximating the original OT with unbalanced mini-batch OT.\nExtensive experiments show the superiority of our method on five benchmark\ndatasets compared to the state-of-the-art methods. The code is released.\n","authors":["Yingxue Xu","Hao Chen"],"pdf_url":"https://arxiv.org/pdf/2306.08330v2.pdf","comment":"11 pages, 4 figures, accepted by ICCV 2023"},{"id":"http://arxiv.org/abs/2308.12714v2","updated":"2023-09-11T08:29:48Z","published":"2023-08-24T11:21:05Z","title":"VIGC: Visual Instruction Generation and Correction","summary":" The integration of visual encoders and large language models (LLMs) has\ndriven recent progress in multimodal large language models (MLLMs). However,\nthe scarcity of high-quality instruction-tuning data for vision-language tasks\nremains a challenge. The current leading paradigm, such as LLaVA, relies on\nlanguage-only GPT-4 to generate data, which requires pre-annotated image\ncaptions and detection bounding boxes, suffering from understanding image\ndetails. 
A practical solution to this problem would be to utilize the available\nmultimodal large language models (MLLMs) to generate instruction data for\nvision-language tasks. However, it's worth noting that the currently accessible\nMLLMs are not as powerful as their LLM counterparts, as they tend to produce\ninadequate responses and generate false information. As a solution for\naddressing the current issue, this paper proposes the Visual Instruction\nGeneration and Correction (VIGC) framework that enables multimodal large\nlanguage models to generate instruction-tuning data and progressively enhance\nits quality on-the-fly. Specifically, Visual Instruction Generation (VIG)\nguides the vision-language model to generate diverse instruction-tuning data.\nTo ensure generation quality, Visual Instruction Correction (VIC) adopts an\niterative update mechanism to correct any inaccuracies in data produced by VIG,\neffectively reducing the risk of hallucination. Leveraging the diverse,\nhigh-quality data generated by VIGC, we finetune mainstream models and validate\ndata quality based on various evaluations. Experimental results demonstrate\nthat VIGC not only compensates for the shortcomings of language-only data\ngeneration methods, but also effectively enhances the benchmark performance.\nThe models, datasets, and code are available at\nhttps://opendatalab.github.io/VIGC.\n","authors":["Bin Wang","Fan Wu","Xiao Han","Jiahui Peng","Huaping Zhong","Pan Zhang","Xiaoyi Dong","Weijia Li","Wei Li","Jiaqi Wang","Conghui He"],"pdf_url":"https://arxiv.org/pdf/2308.12714v2.pdf","comment":"Project Website: https://opendatalab.github.io/VIGC, Code and\n Pretrained Model: https://github.com/opendatalab/VIGC, Dataset:\n https://opendatalab.com/OpenDataLab/VIGC-InstData"},{"id":"http://arxiv.org/abs/2308.13566v2","updated":"2023-09-11T08:28:40Z","published":"2023-08-25T01:41:04Z","title":"MLLM-DataEngine: An Iterative Refinement Approach for MLLM","summary":" Despite the great advance of Multimodal Large Language Models (MLLMs) in both\ninstruction dataset building and benchmarking, the independence of training and\nevaluation makes current MLLMs hard to further improve their capability under\nthe guidance of evaluation results with a relatively low human cost. In this\npaper, we propose MLLM-DataEngine, a novel closed-loop system that bridges data\ngeneration, model training, and evaluation. Within each loop iteration, the\nMLLM-DataEngine first analyze the weakness of the model based on the evaluation\nresults, then generate a proper incremental dataset for the next training\niteration and enhance the model capability iteratively. Compared with previous\ndata collection methods which are separate from the benchmarking, the data\ngenerated by MLLM-DataEngine shows better targeting, quality, and correctness.\nFor targeting, we propose an Adaptive Bad-case Sampling module, which adjusts\nthe ratio of different types of data within each incremental dataset based on\nthe benchmarking results. For quality, we resort to GPT-4 to generate\nhigh-quality data with each given data type. For correctness, prompt design is\ncritical for the data generation results. Rather than previous hand-crafted\nprompt, we propose an Interactive Prompt Optimization strategy, which optimizes\nthe prompt with the multi-round interaction between human and GPT, and improve\nthe correctness of generated data greatly. 
Through extensive experiments, we\nfind our MLLM-DataEngine could boost the MLLM capability in a targeted and\nautomatic manner, with only a few human participation. We hope it could be a\ngeneral solution for the following MLLMs building. The MLLM-DataEngine has been\nopen-sourced and is now available at\nhttps://github.com/opendatalab/MLLM-DataEngine.\n","authors":["Zhiyuan Zhao","Linke Ouyang","Bin Wang","Siyuan Huang","Pan Zhang","Xiaoyi Dong","Jiaqi Wang","Conghui He"],"pdf_url":"https://arxiv.org/pdf/2308.13566v2.pdf","comment":"Code and models are available at\n https://github.com/opendatalab/MLLM-DataEngine"},{"id":"http://arxiv.org/abs/2303.08888v5","updated":"2023-09-11T08:27:13Z","published":"2023-03-15T19:16:47Z","title":"Stochastic Segmentation with Conditional Categorical Diffusion Models","summary":" Semantic segmentation has made significant progress in recent years thanks to\ndeep neural networks, but the common objective of generating a single\nsegmentation output that accurately matches the image's content may not be\nsuitable for safety-critical domains such as medical diagnostics and autonomous\ndriving. Instead, multiple possible correct segmentation maps may be required\nto reflect the true distribution of annotation maps. In this context,\nstochastic semantic segmentation methods must learn to predict conditional\ndistributions of labels given the image, but this is challenging due to the\ntypically multimodal distributions, high-dimensional output spaces, and limited\nannotation data. To address these challenges, we propose a conditional\ncategorical diffusion model (CCDM) for semantic segmentation based on Denoising\nDiffusion Probabilistic Models. Our model is conditioned to the input image,\nenabling it to generate multiple segmentation label maps that account for the\naleatoric uncertainty arising from divergent ground truth annotations. Our\nexperimental results show that CCDM achieves state-of-the-art performance on\nLIDC, a stochastic semantic segmentation dataset, and outperforms established\nbaselines on the classical segmentation dataset Cityscapes.\n","authors":["Lukas Zbinden","Lars Doorenbos","Theodoros Pissas","Adrian Thomas Huber","Raphael Sznitman","Pablo Márquez-Neila"],"pdf_url":"https://arxiv.org/pdf/2303.08888v5.pdf","comment":"Accepted at ICCV 2023. Code available at\n https://github.com/LarsDoorenbos/ccdm-stochastic-segmentation"},{"id":"http://arxiv.org/abs/2309.05289v1","updated":"2023-09-11T08:16:04Z","published":"2023-09-11T08:16:04Z","title":"Task-driven Compression for Collision Encoding based on Depth Images","summary":" This paper contributes a novel learning-based method for aggressive\ntask-driven compression of depth images and their encoding as images tailored\nto collision prediction for robotic systems. A novel 3D image processing\nmethodology is proposed that accounts for the robot's size in order to\nappropriately \"inflate\" the obstacles represented in the depth image and thus\nobtain the distance that can be traversed by the robot in a collision-free\nmanner along any given ray within the camera frustum. Such depth-and-collision\nimage pairs are used to train a neural network that follows the architecture of\nVariational Autoencoders to compress-and-transform the information in the\noriginal depth image to derive a latent representation that encodes the\ncollision information for the given depth image. 
We compare our proposed\ntask-driven encoding method with classical task-agnostic methods and\ndemonstrate superior performance for the task of collision image prediction\nfrom extremely low-dimensional latent spaces. A set of comparative studies show\nthat the proposed approach is capable of encoding depth image-and-collision\nimage tuples from complex scenes with thin obstacles at long distances better\nthan the classical methods at compression ratios as high as 4050:1.\n","authors":["Mihir Kulkarni","Kostas Alexis"],"pdf_url":"https://arxiv.org/pdf/2309.05289v1.pdf","comment":"14 pages, 5, figures. Accepted to the International Symposium on\n Visual Computing 2023"},{"id":"http://arxiv.org/abs/2308.16582v2","updated":"2023-09-11T07:44:49Z","published":"2023-08-31T09:27:56Z","title":"Any-Size-Diffusion: Toward Efficient Text-Driven Synthesis for Any-Size\n HD Images","summary":" Stable diffusion, a generative model used in text-to-image synthesis,\nfrequently encounters resolution-induced composition problems when generating\nimages of varying sizes. This issue primarily stems from the model being\ntrained on pairs of single-scale images and their corresponding text\ndescriptions. Moreover, direct training on images of unlimited sizes is\nunfeasible, as it would require an immense number of text-image pairs and\nentail substantial computational expenses. To overcome these challenges, we\npropose a two-stage pipeline named Any-Size-Diffusion (ASD), designed to\nefficiently generate well-composed images of any size, while minimizing the\nneed for high-memory GPU resources. Specifically, the initial stage, dubbed Any\nRatio Adaptability Diffusion (ARAD), leverages a selected set of images with a\nrestricted range of ratios to optimize the text-conditional diffusion model,\nthereby improving its ability to adjust composition to accommodate diverse\nimage sizes. To support the creation of images at any desired size, we further\nintroduce a technique called Fast Seamless Tiled Diffusion (FSTD) at the\nsubsequent stage. This method allows for the rapid enlargement of the ASD\noutput to any high-resolution size, avoiding seaming artifacts or memory\noverloads. Experimental results on the LAION-COCO and MM-CelebA-HQ benchmarks\ndemonstrate that ASD can produce well-structured images of arbitrary sizes,\ncutting down the inference time by 2x compared to the traditional tiled\nalgorithm.\n","authors":["Qingping Zheng","Yuanfan Guo","Jiankang Deng","Jianhua Han","Ying Li","Songcen Xu","Hang Xu"],"pdf_url":"https://arxiv.org/pdf/2308.16582v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.05282v1","updated":"2023-09-11T07:37:10Z","published":"2023-09-11T07:37:10Z","title":"Can you text what is happening? Integrating pre-trained language\n encoders into trajectory prediction models for autonomous driving","summary":" In autonomous driving tasks, scene understanding is the first step towards\npredicting the future behavior of the surrounding traffic participants. Yet,\nhow to represent a given scene and extract its features are still open research\nquestions. In this study, we propose a novel text-based representation of\ntraffic scenes and process it with a pre-trained language encoder.\n First, we show that text-based representations, combined with classical\nrasterized image representations, lead to descriptive scene embeddings. Second,\nwe benchmark our predictions on the nuScenes dataset and show significant\nimprovements compared to baselines. 
Third, we show in an ablation study that a\njoint encoder of text and rasterized images outperforms the individual encoders\nconfirming that both representations have their complementary strengths.\n","authors":["Ali Keysan","Andreas Look","Eitan Kosman","Gonca Gürsun","Jörg Wagner","Yao Yu","Barbara Rakitsch"],"pdf_url":"https://arxiv.org/pdf/2309.05282v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.05281v1","updated":"2023-09-11T07:36:16Z","published":"2023-09-11T07:36:16Z","title":"Class-Incremental Grouping Network for Continual Audio-Visual Learning","summary":" Continual learning is a challenging problem in which models need to be\ntrained on non-stationary data across sequential tasks for class-incremental\nlearning. While previous methods have focused on using either regularization or\nrehearsal-based frameworks to alleviate catastrophic forgetting in image\nclassification, they are limited to a single modality and cannot learn compact\nclass-aware cross-modal representations for continual audio-visual learning. To\naddress this gap, we propose a novel class-incremental grouping network (CIGN)\nthat can learn category-wise semantic features to achieve continual\naudio-visual learning. Our CIGN leverages learnable audio-visual class tokens\nand audio-visual grouping to continually aggregate class-aware features.\nAdditionally, it utilizes class tokens distillation and continual grouping to\nprevent forgetting parameters learned from previous tasks, thereby improving\nthe model's ability to capture discriminative audio-visual categories. We\nconduct extensive experiments on VGGSound-Instruments, VGGSound-100, and\nVGG-Sound Sources benchmarks. Our experimental results demonstrate that the\nCIGN achieves state-of-the-art audio-visual class-incremental learning\nperformance. Code is available at https://github.com/stoneMo/CIGN.\n","authors":["Shentong Mo","Weiguo Pian","Yapeng Tian"],"pdf_url":"https://arxiv.org/pdf/2309.05281v1.pdf","comment":"ICCV 2023. arXiv admin note: text overlap with arXiv:2303.17056"},{"id":"http://arxiv.org/abs/2309.05277v1","updated":"2023-09-11T07:27:32Z","published":"2023-09-11T07:27:32Z","title":"Interactive Class-Agnostic Object Counting","summary":" We propose a novel framework for interactive class-agnostic object counting,\nwhere a human user can interactively provide feedback to improve the accuracy\nof a counter. Our framework consists of two main components: a user-friendly\nvisualizer to gather feedback and an efficient mechanism to incorporate it. In\neach iteration, we produce a density map to show the current prediction result,\nand we segment it into non-overlapping regions with an easily verifiable number\nof objects. The user can provide feedback by selecting a region with obvious\ncounting errors and specifying the range for the estimated number of objects\nwithin it. To improve the counting result, we develop a novel adaptation loss\nto force the visual counter to output the predicted count within the\nuser-specified range. For effective and efficient adaptation, we propose a\nrefinement module that can be used with any density-based visual counter, and\nonly the parameters in the refinement module will be updated during adaptation.\nOur experiments on two challenging class-agnostic object counting benchmarks,\nFSCD-LVIS and FSC-147, show that our method can reduce the mean absolute error\nof multiple state-of-the-art visual counters by roughly 30% to 40% with minimal\nuser input. 
Our project can be found at\nhttps://yifehuang97.github.io/ICACountProjectPage/.\n","authors":["Yifeng Huang","Viresh Ranjan","Minh Hoai"],"pdf_url":"https://arxiv.org/pdf/2309.05277v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.10404v2","updated":"2023-09-11T07:22:19Z","published":"2023-07-19T18:19:18Z","title":"Interpreting and Correcting Medical Image Classification with PIP-Net","summary":" Part-prototype models are explainable-by-design image classifiers, and a\npromising alternative to black box AI. This paper explores the applicability\nand potential of interpretable machine learning, in particular PIP-Net, for\nautomated diagnosis support on real-world medical imaging data. PIP-Net learns\nhuman-understandable prototypical image parts and we evaluate its accuracy and\ninterpretability for fracture detection and skin cancer diagnosis. We find that\nPIP-Net's decision making process is in line with medical classification\nstandards, while only provided with image-level class labels. Because of\nPIP-Net's unsupervised pretraining of prototypes, data quality problems such as\nundesired text in an X-ray or labelling errors can be easily identified.\nAdditionally, we are the first to show that humans can manually correct the\nreasoning of PIP-Net by directly disabling undesired prototypes. We conclude\nthat part-prototype models are promising for medical applications due to their\ninterpretability and potential for advanced model debugging.\n","authors":["Meike Nauta","Johannes H. Hegeman","Jeroen Geerdink","Jörg Schlötterer","Maurice van Keulen","Christin Seifert"],"pdf_url":"https://arxiv.org/pdf/2307.10404v2.pdf","comment":"Accepted to the International Workshop on Explainable and\n Interpretable Machine Learning (XI-ML), co-located with ECAI 2023"},{"id":"http://arxiv.org/abs/2306.06622v2","updated":"2023-09-11T07:11:14Z","published":"2023-06-11T08:46:42Z","title":"Weakly Supervised Visual Question Answer Generation","summary":" Growing interest in conversational agents promote twoway human-computer\ncommunications involving asking and answering visual questions have become an\nactive area of research in AI. Thus, generation of visual questionanswer\npair(s) becomes an important and challenging task. To address this issue, we\npropose a weakly-supervised visual question answer generation method that\ngenerates a relevant question-answer pairs for a given input image and\nassociated caption. Most of the prior works are supervised and depend on the\nannotated question-answer datasets. In our work, we present a weakly supervised\nmethod that synthetically generates question-answer pairs procedurally from\nvisual information and captions. The proposed method initially extracts list of\nanswer words, then does nearest question generation that uses the caption and\nanswer word to generate synthetic question. Next, the relevant question\ngenerator converts the nearest question to relevant language question by\ndependency parsing and in-order tree traversal, finally, fine-tune a ViLBERT\nmodel with the question-answer pair(s) generated at end. We perform an\nexhaustive experimental analysis on VQA dataset and see that our model\nsignificantly outperform SOTA methods on BLEU scores. 
We also show the results\nwrt baseline models and ablation study.\n","authors":["Charani Alampalle","Shamanthak Hegde","Soumya Jahagirdar","Shankar Gangisetty"],"pdf_url":"https://arxiv.org/pdf/2306.06622v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.05271v1","updated":"2023-09-11T07:05:02Z","published":"2023-09-11T07:05:02Z","title":"AutoFuse: Automatic Fusion Networks for Deformable Medical Image\n Registration","summary":" Deformable image registration aims to find a dense non-linear spatial\ncorrespondence between a pair of images, which is a crucial step for many\nmedical tasks such as tumor growth monitoring and population analysis.\nRecently, Deep Neural Networks (DNNs) have been widely recognized for their\nability to perform fast end-to-end registration. However, DNN-based\nregistration needs to explore the spatial information of each image and fuse\nthis information to characterize spatial correspondence. This raises an\nessential question: what is the optimal fusion strategy to characterize spatial\ncorrespondence? Existing fusion strategies (e.g., early fusion, late fusion)\nwere empirically designed to fuse information by manually defined prior\nknowledge, which inevitably constrains the registration performance within the\nlimits of empirical designs. In this study, we depart from existing\nempirically-designed fusion strategies and develop a data-driven fusion\nstrategy for deformable image registration. To achieve this, we propose an\nAutomatic Fusion network (AutoFuse) that provides flexibility to fuse\ninformation at many potential locations within the network. A Fusion Gate (FG)\nmodule is also proposed to control how to fuse information at each potential\nnetwork location based on training data. Our AutoFuse can automatically\noptimize its fusion strategy during training and can be generalizable to both\nunsupervised registration (without any labels) and semi-supervised registration\n(with weak labels provided for partial training data). Extensive experiments on\ntwo well-benchmarked medical registration tasks (inter- and intra-patient\nregistration) with eight public datasets show that our AutoFuse outperforms\nstate-of-the-art unsupervised and semi-supervised registration methods.\n","authors":["Mingyuan Meng","Michael Fulham","Dagan Feng","Lei Bi","Jinman Kim"],"pdf_url":"https://arxiv.org/pdf/2309.05271v1.pdf","comment":"Under Review"},{"id":"http://arxiv.org/abs/2303.08416v5","updated":"2023-09-11T07:04:50Z","published":"2023-03-15T07:31:55Z","title":"Lung Nodule Segmentation and Uncertain Region Prediction with an\n Uncertainty-Aware Attention Mechanism","summary":" Radiologists possess diverse training and clinical experiences, leading to\nvariations in the segmentation annotations of lung nodules and resulting in\nsegmentation uncertainty.Conventional methods typically select a single\nannotation as the learning target or attempt to learn a latent space comprising\nmultiple annotations. However, these approaches fail to leverage the valuable\ninformation inherent in the consensus and disagreements among the multiple\nannotations. In this paper, we propose an Uncertainty-Aware Attention Mechanism\n(UAAM) that utilizes consensus and disagreements among multiple annotations to\nfacilitate better segmentation. 
To this end, we introduce the Multi-Confidence\nMask (MCM), which combines a Low-Confidence (LC) Mask and a High-Confidence\n(HC) Mask.The LC mask indicates regions with low segmentation confidence, where\nradiologists may have different segmentation choices. Following UAAM, we\nfurther design an Uncertainty-Guide Multi-Confidence Segmentation Network\n(UGMCS-Net), which contains three modules: a Feature Extracting Module that\ncaptures a general feature of a lung nodule, an Uncertainty-Aware Module that\nproduces three features for the the annotations' union, intersection, and\nannotation set, and an Intersection-Union Constraining Module that uses\ndistances between the three features to balance the predictions of final\nsegmentation and MCM. To comprehensively demonstrate the performance of our\nmethod, we propose a Complex Nodule Validation on LIDC-IDRI, which tests\nUGMCS-Net's segmentation performance on lung nodules that are difficult to\nsegment using common methods. Experimental results demonstrate that our method\ncan significantly improve the segmentation performance on nodules that are\ndifficult to segment using conventional methods.\n","authors":["Han Yang","Qiuli Wang","Yue Zhang","Zhulin An","Chen Liu","Xiaohong Zhang","S. Kevin Zhou"],"pdf_url":"https://arxiv.org/pdf/2303.08416v5.pdf","comment":"10 pages, 10 figures. We have reported a preliminary version of this\n work in MICCAI 2022"},{"id":"http://arxiv.org/abs/2309.01380v2","updated":"2023-09-11T07:01:24Z","published":"2023-09-04T06:11:00Z","title":"Understanding Video Scenes through Text: Insights from Text-based Video\n Question Answering","summary":" Researchers have extensively studied the field of vision and language,\ndiscovering that both visual and textual content is crucial for understanding\nscenes effectively. Particularly, comprehending text in videos holds great\nsignificance, requiring both scene text understanding and temporal reasoning.\nThis paper focuses on exploring two recently introduced datasets, NewsVideoQA\nand M4-ViteVQA, which aim to address video question answering based on textual\ncontent. The NewsVideoQA dataset contains question-answer pairs related to the\ntext in news videos, while M4-ViteVQA comprises question-answer pairs from\ndiverse categories like vlogging, traveling, and shopping. We provide an\nanalysis of the formulation of these datasets on various levels, exploring the\ndegree of visual understanding and multi-frame comprehension required for\nanswering the questions. Additionally, the study includes experimentation with\nBERT-QA, a text-only model, which demonstrates comparable performance to the\noriginal methods on both datasets, indicating the shortcomings in the\nformulation of these datasets. Furthermore, we also look into the domain\nadaptation aspect by examining the effectiveness of training on M4-ViteVQA and\nevaluating on NewsVideoQA and vice-versa, thereby shedding light on the\nchallenges and potential benefits of out-of-domain training.\n","authors":["Soumya Jahagirdar","Minesh Mathew","Dimosthenis Karatzas","C. V. 
Jawahar"],"pdf_url":"https://arxiv.org/pdf/2309.01380v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.05267v1","updated":"2023-09-11T06:55:32Z","published":"2023-09-11T06:55:32Z","title":"Diving into Darkness: A Dual-Modulated Framework for High-Fidelity\n Super-Resolution in Ultra-Dark Environments","summary":" Super-resolution tasks oriented to images captured in ultra-dark environments\nis a practical yet challenging problem that has received little attention. Due\nto uneven illumination and low signal-to-noise ratio in dark environments, a\nmultitude of problems such as lack of detail and color distortion may be\nmagnified in the super-resolution process compared to normal-lighting\nenvironments. Consequently, conventional low-light enhancement or\nsuper-resolution methods, whether applied individually or in a cascaded manner\nfor such problem, often encounter limitations in recovering luminance, color\nfidelity, and intricate details. To conquer these issues, this paper proposes a\nspecialized dual-modulated learning framework that, for the first time,\nattempts to deeply dissect the nature of the low-light super-resolution task.\nLeveraging natural image color characteristics, we introduce a self-regularized\nluminance constraint as a prior for addressing uneven lighting. Expanding on\nthis, we develop Illuminance-Semantic Dual Modulation (ISDM) components to\nenhance feature-level preservation of illumination and color details. Besides,\ninstead of deploying naive up-sampling strategies, we design the\nResolution-Sensitive Merging Up-sampler (RSMU) module that brings together\ndifferent sampling modalities as substrates, effectively mitigating the\npresence of artifacts and halos. Comprehensive experiments showcases the\napplicability and generalizability of our approach to diverse and challenging\nultra-low-light conditions, outperforming state-of-the-art methods with a\nnotable improvement (i.e., $\\uparrow$5\\% in PSNR, and $\\uparrow$43\\% in LPIPS).\nEspecially noteworthy is the 19-fold increase in the RMSE score, underscoring\nour method's exceptional generalization across different darkness levels. The\ncode will be available online upon publication of the paper.\n","authors":["Jiaxin Gao","Ziyu Yue","Yaohua Liu","Sihan Xie","Xin Fan","Risheng Liu"],"pdf_url":"https://arxiv.org/pdf/2309.05267v1.pdf","comment":"9 pages"},{"id":"http://arxiv.org/abs/2309.05262v1","updated":"2023-09-11T06:38:27Z","published":"2023-09-11T06:38:27Z","title":"A horizon line annotation tool for streamlining autonomous sea\n navigation experiments","summary":" Horizon line (or sea line) detection (HLD) is a critical component in\nmultiple marine autonomous navigation tasks, such as identifying the navigation\narea (i.e., the sea), obstacle detection and geo-localization, and digital\nvideo stabilization. A recent survey highlighted several weaknesses of such\ndetectors, particularly on sea conditions lacking from the most extensive\ndataset currently used by HLD researchers. Experimental validation of more\nrobust HLDs involves collecting an extensive set of these lacking sea\nconditions and annotating each collected image with the correct position and\norientation of the horizon line. The annotation task is daunting without a\nproper tool. Therefore, we present the first public annotation software with\ntailored features to make the sea line annotation process fast and easy. 
The\nsoftware is available at:\nhttps://drive.google.com/drive/folders/1c0ZmvYDckuQCPIWfh_70P7E1A_DWlIvF?usp=sharing\n","authors":["Yassir Zardoua","Abdelhamid El Wahabi","Mohammed Boulaala","Abdelali Astito"],"pdf_url":"https://arxiv.org/pdf/2309.05262v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.05261v1","updated":"2023-09-11T06:37:12Z","published":"2023-09-11T06:37:12Z","title":"Gall Bladder Cancer Detection from US Images with Only Image Level\n Labels","summary":" Automated detection of Gallbladder Cancer (GBC) from Ultrasound (US) images\nis an important problem, which has drawn increased interest from researchers.\nHowever, most of these works use difficult-to-acquire information such as\nbounding box annotations or additional US videos. In this paper, we focus on\nGBC detection using only image-level labels. Such annotation is usually\navailable based on the diagnostic report of a patient, and do not require\nadditional annotation effort from the physicians. However, our analysis reveals\nthat it is difficult to train a standard image classification model for GBC\ndetection. This is due to the low inter-class variance (a malignant region\nusually occupies only a small portion of a US image), high intra-class variance\n(due to the US sensor capturing a 2D slice of a 3D object leading to large\nviewpoint variations), and low training data availability. We posit that even\nwhen we have only the image level label, still formulating the problem as\nobject detection (with bounding box output) helps a deep neural network (DNN)\nmodel focus on the relevant region of interest. Since no bounding box\nannotations is available for training, we pose the problem as weakly supervised\nobject detection (WSOD). Motivated by the recent success of transformer models\nin object detection, we train one such model, DETR, using\nmulti-instance-learning (MIL) with self-supervised instance selection to suit\nthe WSOD task. Our proposed method demonstrates an improvement of AP and\ndetection sensitivity over the SOTA transformer-based and CNN-based WSOD\nmethods. Project page is at https://gbc-iitd.github.io/wsod-gbc\n","authors":["Soumen Basu","Ashish Papanai","Mayank Gupta","Pankaj Gupta","Chetan Arora"],"pdf_url":"https://arxiv.org/pdf/2309.05261v1.pdf","comment":"Accepted at MICCAI 2023"},{"id":"http://arxiv.org/abs/2309.05257v1","updated":"2023-09-11T06:27:25Z","published":"2023-09-11T06:27:25Z","title":"FusionFormer: A Multi-sensory Fusion in Bird's-Eye-View and Temporal\n Consistent Transformer for 3D Objection","summary":" Multi-sensor modal fusion has demonstrated strong advantages in 3D object\ndetection tasks. However, existing methods that fuse multi-modal features\nthrough a simple channel concatenation require transformation features into\nbird's eye view space and may lose the information on Z-axis thus leads to\ninferior performance. To this end, we propose FusionFormer, an end-to-end\nmulti-modal fusion framework that leverages transformers to fuse multi-modal\nfeatures and obtain fused BEV features. And based on the flexible adaptability\nof FusionFormer to the input modality representation, we propose a depth\nprediction branch that can be added to the framework to improve detection\nperformance in camera-based detection tasks. 
In addition, we propose a\nplug-and-play temporal fusion module based on transformers that can fuse\nhistorical frame BEV features for more stable and reliable detection results.\nWe evaluate our method on the nuScenes dataset and achieve 72.6% mAP and 75.1%\nNDS for 3D object detection tasks, outperforming state-of-the-art methods.\n","authors":["Chunyong Hu","Hang Zheng","Kun Li","Jianyun Xu","Weibo Mao","Maochun Luo","Lingxuan Wang","Mingxia Chen","Kaixuan Liu","Yiru Zhao","Peihan Hao","Minzhe Liu","Kaicheng Yu"],"pdf_url":"https://arxiv.org/pdf/2309.05257v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.05254v1","updated":"2023-09-11T06:18:05Z","published":"2023-09-11T06:18:05Z","title":"Towards Better Data Exploitation In Self-Supervised Monocular Depth\n Estimation","summary":" Depth estimation plays an important role in the robotic perception system.\nSelf-supervised monocular paradigm has gained significant attention since it\ncan free training from the reliance on depth annotations. Despite recent\nadvancements, existing self-supervised methods still underutilize the available\ntraining data, limiting their generalization ability. In this paper, we take\ntwo data augmentation techniques, namely Resizing-Cropping and\nSplitting-Permuting, to fully exploit the potential of training datasets.\nSpecifically, the original image and the generated two augmented images are fed\ninto the training pipeline simultaneously and we leverage them to conduct\nself-distillation. Additionally, we introduce the detail-enhanced DepthNet with\nan extra full-scale branch in the encoder and a grid decoder to enhance the\nrestoration of fine details in depth maps. Experimental results demonstrate our\nmethod can achieve state-of-the-art performance on the KITTI benchmark, with\nboth raw ground truth and improved ground truth. Moreover, our models also show\nsuperior generalization performance when transferring to Make3D and NYUv2\ndatasets. Our codes are available at https://github.com/Sauf4896/BDEdepth.\n","authors":["Jinfeng Liu","Lingtong Kong","Jie Yang","Wei Liu"],"pdf_url":"https://arxiv.org/pdf/2309.05254v1.pdf","comment":"8 pages, 6 figures"},{"id":"http://arxiv.org/abs/2309.03048v2","updated":"2023-09-11T06:12:53Z","published":"2023-09-06T14:43:22Z","title":"Exploring Semantic Consistency in Unpaired Image Translation to Generate\n Data for Surgical Applications","summary":" In surgical computer vision applications, obtaining labeled training data is\nchallenging due to data-privacy concerns and the need for expert annotation.\nUnpaired image-to-image translation techniques have been explored to\nautomatically generate large annotated datasets by translating synthetic images\nto the realistic domain. However, preserving the structure and semantic\nconsistency between the input and translated images presents significant\nchallenges, mainly when there is a distributional mismatch in the semantic\ncharacteristics of the domains. This study empirically investigates unpaired\nimage translation methods for generating suitable data in surgical\napplications, explicitly focusing on semantic consistency. We extensively\nevaluate various state-of-the-art image translation models on two challenging\nsurgical datasets and downstream semantic segmentation tasks. We find that a\nsimple combination of structural-similarity loss and contrastive learning\nyields the most promising results. 
Quantitatively, we show that the data\ngenerated with this approach yields higher semantic consistency and can be used\nmore effectively as training data.\n","authors":["Danush Kumar Venkatesh","Dominik Rivoir","Micha Pfeiffer","Fiona Kolbinger","Marius Distler","Jürgen Weitz","Stefanie Speidel"],"pdf_url":"https://arxiv.org/pdf/2309.03048v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2303.11114v2","updated":"2023-09-11T06:04:24Z","published":"2023-03-20T13:55:35Z","title":"SeiT: Storage-Efficient Vision Training with Tokens Using 1% of Pixel\n Storage","summary":" We need billion-scale images to achieve more generalizable and\nground-breaking vision models, as well as massive dataset storage to ship the\nimages (e.g., the LAION-4B dataset needs 240TB storage space). However, it has\nbecome challenging to deal with unlimited dataset storage with limited storage\ninfrastructure. A number of storage-efficient training methods have been\nproposed to tackle the problem, but they are rarely scalable or suffer from\nsevere damage to performance. In this paper, we propose a storage-efficient\ntraining strategy for vision classifiers for large-scale datasets (e.g.,\nImageNet) that only uses 1024 tokens per instance without using the raw level\npixels; our token storage only needs <1% of the original JPEG-compressed raw\npixels. We also propose token augmentations and a Stem-adaptor module to make\nour approach able to use the same architecture as pixel-based approaches with\nonly minimal modifications on the stem layer and the carefully tuned\noptimization settings. Our experimental results on ImageNet-1k show that our\nmethod significantly outperforms other storage-efficient training methods with\na large gap. We further show the effectiveness of our method in other practical\nscenarios, storage-efficient pre-training, and continual learning. Code is\navailable at https://github.com/naver-ai/seit\n","authors":["Song Park","Sanghyuk Chun","Byeongho Heo","Wonjae Kim","Sangdoo Yun"],"pdf_url":"https://arxiv.org/pdf/2303.11114v2.pdf","comment":"ICCV 2023; First two authors contributed equally; code url:\n https://github.com/naver-ai/seit; 17 pages, 1.2MB"},{"id":"http://arxiv.org/abs/2309.05251v1","updated":"2023-09-11T06:03:39Z","published":"2023-09-11T06:03:39Z","title":"Multi3DRefer: Grounding Text Description to Multiple 3D Objects","summary":" We introduce the task of localizing a flexible number of objects in\nreal-world 3D scenes using natural language descriptions. Existing 3D visual\ngrounding tasks focus on localizing a unique object given a text description.\nHowever, such a strict setting is unnatural as localizing potentially multiple\nobjects is a common need in real-world scenarios and robotic tasks (e.g.,\nvisual navigation and object rearrangement). To address this setting we propose\nMulti3DRefer, generalizing the ScanRefer dataset and task. Our dataset contains\n61926 descriptions of 11609 objects, where zero, single or multiple target\nobjects are referenced by each description. We also introduce a new evaluation\nmetric and benchmark methods from prior work to enable further investigation of\nmulti-modal 3D scene understanding. Furthermore, we develop a better baseline\nleveraging 2D features from CLIP by rendering object proposals online with\ncontrastive learning, which outperforms the state of the art on the ScanRefer\nbenchmark.\n","authors":["Yiming Zhang","ZeMing Gong","Angel X. 
Chang"],"pdf_url":"https://arxiv.org/pdf/2309.05251v1.pdf","comment":"ICCV 2023"},{"id":"http://arxiv.org/abs/2306.15931v2","updated":"2023-09-11T05:53:14Z","published":"2023-06-28T05:32:22Z","title":"Boosting Adversarial Transferability with Learnable Patch-wise Masks","summary":" Adversarial examples have attracted widespread attention in security-critical\napplications because of their transferability across different models. Although\nmany methods have been proposed to boost adversarial transferability, a gap\nstill exists between capabilities and practical demand. In this paper, we argue\nthat the model-specific discriminative regions are a key factor causing\noverfitting to the source model, and thus reducing the transferability to the\ntarget model. For that, a patch-wise mask is utilized to prune the\nmodel-specific regions when calculating adversarial perturbations. To\naccurately localize these regions, we present a learnable approach to\nautomatically optimize the mask. Specifically, we simulate the target models in\nour framework, and adjust the patch-wise mask according to the feedback of the\nsimulated models. To improve the efficiency, the differential evolutionary (DE)\nalgorithm is utilized to search for patch-wise masks for a specific image.\nDuring iterative attacks, the learned masks are applied to the image to drop\nout the patches related to model-specific regions, thus making the gradients\nmore generic and improving the adversarial transferability. The proposed\napproach is a preprocessing method and can be integrated with existing methods\nto further boost the transferability. Extensive experiments on the ImageNet\ndataset demonstrate the effectiveness of our method. We incorporate the\nproposed approach with existing methods to perform ensemble attacks and achieve\nan average success rate of 93.01% against seven advanced defense methods, which\ncan effectively enhance the state-of-the-art transfer-based attack performance.\n","authors":["Xingxing Wei","Shiji Zhao"],"pdf_url":"https://arxiv.org/pdf/2306.15931v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2304.06977v2","updated":"2023-09-11T05:21:24Z","published":"2023-04-14T07:55:38Z","title":"DeePoint: Visual Pointing Recognition and Direction Estimation","summary":" In this paper, we realize automatic visual recognition and direction\nestimation of pointing. We introduce the first neural pointing understanding\nmethod based on two key contributions. The first is the introduction of a\nfirst-of-its-kind large-scale dataset for pointing recognition and direction\nestimation, which we refer to as the DP Dataset. DP Dataset consists of more\nthan 2 million frames of 33 people pointing in various styles annotated for\neach frame with pointing timings and 3D directions. The second is DeePoint, a\nnovel deep network model for joint recognition and 3D direction estimation of\npointing. DeePoint is a Transformer-based network which fully leverages the\nspatio-temporal coordination of the body parts, not just the hands. 
Through\nextensive experiments, we demonstrate the accuracy and efficiency of DeePoint.\nWe believe DP Dataset and DeePoint will serve as a sound foundation for visual\nhuman intention understanding.\n","authors":["Shu Nakamura","Yasutomo Kawanishi","Shohei Nobuhara","Ko Nishino"],"pdf_url":"https://arxiv.org/pdf/2304.06977v2.pdf","comment":"to be published in ICCV 2023"},{"id":"http://arxiv.org/abs/2309.05239v1","updated":"2023-09-11T05:17:55Z","published":"2023-09-11T05:17:55Z","title":"HAT: Hybrid Attention Transformer for Image Restoration","summary":" Transformer-based methods have shown impressive performance in image\nrestoration tasks, such as image super-resolution and denoising. However, we\nfind that these networks can only utilize a limited spatial range of input\ninformation through attribution analysis. This implies that the potential of\nTransformer is still not fully exploited in existing networks. In order to\nactivate more input pixels for better restoration, we propose a new Hybrid\nAttention Transformer (HAT). It combines both channel attention and\nwindow-based self-attention schemes, thus making use of their complementary\nadvantages. Moreover, to better aggregate the cross-window information, we\nintroduce an overlapping cross-attention module to enhance the interaction\nbetween neighboring window features. In the training stage, we additionally\nadopt a same-task pre-training strategy to further exploit the potential of the\nmodel for further improvement. Extensive experiments have demonstrated the\neffectiveness of the proposed modules. We further scale up the model to show\nthat the performance of the SR task can be greatly improved. Besides, we extend\nHAT to more image restoration applications, including real-world image\nsuper-resolution, Gaussian image denoising and image compression artifacts\nreduction. Experiments on benchmark and real-world datasets demonstrate that\nour HAT achieves state-of-the-art performance both quantitatively and\nqualitatively. Codes and models are publicly available at\nhttps://github.com/XPixelGroup/HAT.\n","authors":["Xiangyu Chen","Xintao Wang","Wenlong Zhang","Xiangtao Kong","Yu Qiao","Jiantao Zhou","Chao Dong"],"pdf_url":"https://arxiv.org/pdf/2309.05239v1.pdf","comment":"Extended version of HAT"},{"id":"http://arxiv.org/abs/2307.14611v3","updated":"2023-09-11T05:15:31Z","published":"2023-07-27T03:56:39Z","title":"TextManiA: Enriching Visual Feature by Text-driven Manifold Augmentation","summary":" We propose TextManiA, a text-driven manifold augmentation method that\nsemantically enriches visual feature spaces, regardless of class distribution.\nTextManiA augments visual data with intra-class semantic perturbation by\nexploiting easy-to-understand visually mimetic words, i.e., attributes. This\nwork is built on an interesting hypothesis that general language models, e.g.,\nBERT and GPT, encompass visual information to some extent, even without\ntraining on visual training data. Given the hypothesis, TextManiA transfers\npre-trained text representation obtained from a well-established large language\nencoder to a target visual feature space being learned. Our extensive analysis\nhints that the language encoder indeed encompasses visual information at least\nuseful to augment visual representation. Our experiments demonstrate that\nTextManiA is particularly powerful in scarce samples with class imbalance as\nwell as even distribution. 
We also show compatibility with the label mix-based\napproaches in evenly distributed scarce data.\n","authors":["Moon Ye-Bin","Jisoo Kim","Hongyeob Kim","Kilho Son","Tae-Hyun Oh"],"pdf_url":"https://arxiv.org/pdf/2307.14611v3.pdf","comment":"Accepted at ICCV 2023. [Project Pages] https://textmania.github.io/"},{"id":"http://arxiv.org/abs/2308.00994v2","updated":"2023-09-11T05:06:38Z","published":"2023-08-02T07:59:25Z","title":"SYNAuG: Exploiting Synthetic Data for Data Imbalance Problems","summary":" We live in an era of data floods, and deep neural networks play a pivotal\nrole in this moment. Natural data inherently exhibits several challenges such\nas long-tailed distribution and model fairness, where data imbalance is at the\ncenter of fundamental issues. This imbalance poses a risk of deep neural\nnetworks producing biased predictions, leading to potentially severe ethical\nand social problems. To address these problems, we leverage the recent\ngenerative models advanced in generating high-quality images. In this work, we\npropose SYNAuG, which utilizes synthetic data to uniformize the given imbalance\ndistribution followed by a simple post-calibration step considering the domain\ngap between real and synthetic data. This straightforward approach yields\nimpressive performance on datasets for distinctive data imbalance problems such\nas CIFAR100-LT, ImageNet100-LT, UTKFace, and Waterbirds, surpassing the\nperformance of existing task-specific methods. While we do not claim that our\napproach serves as a complete solution to the problem of data imbalance, we\nargue that supplementing the existing data with synthetic data proves to be an\neffective and crucial step in addressing data imbalance concerns.\n","authors":["Moon Ye-Bin","Nam Hyeon-Woo","Wonseok Choi","Nayeong Kim","Suha Kwak","Tae-Hyun Oh"],"pdf_url":"https://arxiv.org/pdf/2308.00994v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2305.19862v2","updated":"2023-09-11T04:59:19Z","published":"2023-05-31T13:55:00Z","title":"Self-supervised Learning to Bring Dual Reversed Rolling Shutter Images\n Alive","summary":" Modern consumer cameras usually employ the rolling shutter (RS) mechanism,\nwhere images are captured by scanning scenes row-by-row, yielding RS\ndistortions for dynamic scenes. To correct RS distortions, existing methods\nadopt a fully supervised learning manner, where high framerate global shutter\n(GS) images should be collected as ground-truth supervision. In this paper, we\npropose a Self-supervised learning framework for Dual reversed RS distortions\nCorrection (SelfDRSC), where a DRSC network can be learned to generate a high\nframerate GS video only based on dual RS images with reversed distortions. In\nparticular, a bidirectional distortion warping module is proposed for\nreconstructing dual reversed RS images, and then a self-supervised loss can be\ndeployed to train DRSC network by enhancing the cycle consistency between input\nand reconstructed dual reversed RS images. Besides start and end RS scanning\ntime, GS images at arbitrary intermediate scanning time can also be supervised\nin SelfDRSC, thus enabling the learned DRSC network to generate a high\nframerate GS video. Moreover, a simple yet effective self-distillation strategy\nis introduced in self-supervised loss for mitigating boundary artifacts in\ngenerated GS images. On synthetic dataset, SelfDRSC achieves better or\ncomparable quantitative metrics in comparison to state-of-the-art methods\ntrained in the full supervision manner. 
On real-world RS cases, our SelfDRSC\ncan produce high framerate GS videos with finer correction textures and better\ntemporal consistency. The source code and trained models are made publicly\navailable at https://github.com/shangwei5/SelfDRSC. We also provide an\nimplementation in HUAWEI Mindspore at\nhttps://github.com/Hunter-Will/SelfDRSC-mindspore.\n","authors":["Wei Shang","Dongwei Ren","Chaoyu Feng","Xiaotao Wang","Lei Lei","Wangmeng Zuo"],"pdf_url":"https://arxiv.org/pdf/2305.19862v2.pdf","comment":"16 pages, 16 figures, available at\n https://github.com/shangwei5/SelfDRSC"},{"id":"http://arxiv.org/abs/2205.07611v3","updated":"2023-09-11T04:23:25Z","published":"2022-05-16T12:14:03Z","title":"Noise-Tolerant Learning for Audio-Visual Action Recognition","summary":" Recently, video recognition has been emerging with the help of multi-modal\nlearning, which focuses on integrating distinct modalities to improve the\nperformance or robustness of the model. Although various multi-modal learning\nmethods have been proposed and offer remarkable recognition results, almost all\nof these methods rely on high-quality manual annotations and assume that\nmodalities among multi-modal data provide semantically relevant information.\nUnfortunately, the widely used video datasets are usually coarse-annotated or\ncollected from the Internet. Thus, they inevitably contain a portion of noisy\nlabels and noisy correspondence. To address this challenge, we use the\naudio-visual action recognition task as a proxy and propose a noise-tolerant\nlearning framework to find anti-interference model parameters against both\nnoisy labels and noisy correspondence. Specifically, our method consists of two\nphases that aim to rectify noise by the inherent correlation between\nmodalities. First, a noise-tolerant contrastive training phase is performed to\nmake the model immune to the possible noisy-labeled data. To alleviate the\ninfluence of noisy correspondence, we propose a cross-modal noise estimation\ncomponent to adjust the consistency between different modalities. As the noisy\ncorrespondence exists at the instance level, we further propose a\ncategory-level contrastive loss to reduce its interference. Second, in the\nhybrid-supervised training phase, we calculate the distance metric among\nfeatures to obtain corrected labels, which are used as complementary\nsupervision to guide the training. Extensive experiments on a wide range of\nnoise levels demonstrate that our method significantly improves the robustness\nof the action recognition model and surpasses the baselines by a clear margin.\n","authors":["Haochen Han","Qinghua Zheng","Minnan Luo","Kaiyao Miao","Feng Tian","Yan Chen"],"pdf_url":"https://arxiv.org/pdf/2205.07611v3.pdf","comment":"This work has been submitted to the IEEE for possible publication.\n Copyright may be transferred without notice, after which this version may no\n longer be accessible"},{"id":"http://arxiv.org/abs/2309.05224v1","updated":"2023-09-11T04:03:43Z","published":"2023-09-11T04:03:43Z","title":"SparseSwin: Swin Transformer with Sparse Transformer Block","summary":" Advancements in computer vision research have established the transformer architecture as\nthe state of the art in computer vision tasks. One of the known drawbacks of\nthe transformer architecture is the high number of parameters, which can lead to\na more complex and inefficient algorithm. This paper aims to reduce the number\nof parameters and, in turn, make the transformer more efficient.
We present\nSparse Transformer (SparTa) Block, a modified transformer block with an\naddition of a sparse token converter that reduces the number of tokens used. We\nuse the SparTa Block inside the Swin T architecture (SparseSwin) to leverage\nSwin capability to downsample its input and reduce the number of initial tokens\nto be calculated. The proposed SparseSwin model outperforms other state of the\nart models in image classification with an accuracy of 86.96%, 97.43%, and\n85.35% on the ImageNet100, CIFAR10, and CIFAR100 datasets respectively. Despite\nits fewer parameters, the result highlights the potential of a transformer\narchitecture using a sparse token converter with a limited number of tokens to\noptimize the use of the transformer and improve its performance.\n","authors":["Krisna Pinasthika","Blessius Sheldo Putra Laksono","Riyandi Banovbi Putera Irsal","Syifa Hukma Shabiyya","Novanto Yudistira"],"pdf_url":"https://arxiv.org/pdf/2309.05224v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.10417v2","updated":"2023-09-11T04:03:27Z","published":"2023-08-21T01:59:45Z","title":"The Change You Want to See (Now in 3D)","summary":" The goal of this paper is to detect what has changed, if anything, between\ntwo \"in the wild\" images of the same 3D scene acquired from different camera\npositions and at different temporal instances. The open-set nature of this\nproblem, occlusions/dis-occlusions due to the shift in viewpoint, and the lack\nof suitable training datasets, presents substantial challenges in devising a\nsolution.\n To address this problem, we contribute a change detection model that is\ntrained entirely on synthetic data and is class-agnostic, yet it is performant\nout-of-the-box on real world images without requiring fine-tuning. Our solution\nentails a \"register and difference\" approach that leverages self-supervised\nfrozen embeddings and feature differences, which allows the model to generalise\nto a wide variety of scenes and domains. The model is able to operate directly\non two RGB images, without requiring access to ground truth camera intrinsics,\nextrinsics, depth maps, point clouds, or additional before-after images.\nFinally, we collect and release a new evaluation dataset consisting of\nreal-world image pairs with human-annotated differences and demonstrate the\nefficacy of our method. The code, datasets and pre-trained model can be found\nat: https://github.com/ragavsachdeva/CYWS-3D\n","authors":["Ragav Sachdeva","Andrew Zisserman"],"pdf_url":"https://arxiv.org/pdf/2308.10417v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.01361v3","updated":"2023-09-11T03:50:57Z","published":"2023-09-04T05:05:15Z","title":"High Frequency, High Accuracy Pointing onboard Nanosats using\n Neuromorphic Event Sensing and Piezoelectric Actuation","summary":" As satellites become smaller, the ability to maintain stable pointing\ndecreases as external forces acting on the satellite come into play. At the\nsame time, reaction wheels used in the attitude determination and control\nsystem (ADCS) introduce high frequency jitter which can disrupt pointing\nstability. For space domain awareness (SDA) tasks that track objects tens of\nthousands of kilometres away, the pointing accuracy offered by current\nnanosats, typically in the range of 10 to 100 arcseconds, is not sufficient. 
In\nthis work, we develop a novel payload that utilises a neuromorphic event sensor\n(for high frequency and highly accurate relative attitude estimation) paired in\na closed loop with a piezoelectric stage (for active attitude corrections) to\nprovide highly stable sensor-specific pointing. Event sensors are especially\nsuited for space applications due to their desirable characteristics of low\npower consumption, asynchronous operation, and high dynamic range. We use the\nevent sensor to first estimate a reference background star field from which\ninstantaneous relative attitude is estimated at high frequency. The\npiezoelectric stage works in a closed control loop with the event sensor to\nperform attitude corrections based on the discrepancy between the current and\ndesired attitude. Results in a controlled setting show that we can achieve a\npointing accuracy in the range of 1-5 arcseconds using our novel payload at an\noperating frequency of up to 50Hz using a prototype built from\ncommercial-off-the-shelf components. Further details can be found at\nhttps://ylatif.github.io/ultrafinestabilisation\n","authors":["Yasir Latif","Peter Anastasiou","Yonhon Ng","Zebb Prime","Tien-Fu Lu","Matthew Tetlow","Robert Mahony","Tat-Jun Chin"],"pdf_url":"https://arxiv.org/pdf/2309.01361v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.05214v1","updated":"2023-09-11T03:20:41Z","published":"2023-09-11T03:20:41Z","title":"Angle Range and Identity Similarity Enhanced Gaze and Head Redirection\n based on Synthetic data","summary":" In this paper, we propose a method for improving the angular accuracy and\nphoto-reality of gaze and head redirection in full-face images. The problem\nwith current models is that they cannot handle redirection at large angles, and\nthis limitation mainly comes from the lack of training data. To resolve this\nproblem, we create data augmentation by monocular 3D face reconstruction to\nextend the head pose and gaze range of the real data, which allows the model to\nhandle a wider redirection range. In addition to the main focus on data\naugmentation, we also propose a framework with better image quality and\nidentity preservation of unseen subjects even training with synthetic data.\nExperiments show that our method significantly improves redirection performance\nin terms of redirection angular accuracy while maintaining high image quality,\nespecially when redirecting to large angles.\n","authors":["Jiawei Qin","Xueting Wang"],"pdf_url":"https://arxiv.org/pdf/2309.05214v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.12006v2","updated":"2023-09-11T03:01:50Z","published":"2023-08-23T08:49:43Z","title":"Multi-stage Factorized Spatio-Temporal Representation for RGB-D Action\n and Gesture Recognition","summary":" RGB-D action and gesture recognition remain an interesting topic in\nhuman-centered scene understanding, primarily due to the multiple granularities\nand large variation in human motion. Although many RGB-D based action and\ngesture recognition approaches have demonstrated remarkable results by\nutilizing highly integrated spatio-temporal representations across multiple\nmodalities (i.e., RGB and depth data), they still encounter several challenges.\nFirstly, vanilla 3D convolution makes it hard to capture fine-grained motion\ndifferences between local clips under different modalities. Secondly, the\nintricate nature of highly integrated spatio-temporal modeling can lead to\noptimization difficulties. 
Thirdly, duplicate and unnecessary information can\nadd complexity and complicate entangled spatio-temporal modeling. To address\nthe above issues, we propose an innovative heuristic architecture called\nMulti-stage Factorized Spatio-Temporal (MFST) for RGB-D action and gesture\nrecognition. The proposed MFST model comprises a 3D Central Difference\nConvolution Stem (CDC-Stem) module and multiple factorized spatio-temporal\nstages. The CDC-Stem enriches fine-grained temporal perception, and the\nmultiple hierarchical spatio-temporal stages construct dimension-independent\nhigher-order semantic primitives. Specifically, the CDC-Stem module captures\nbottom-level spatio-temporal features and passes them successively to the\nfollowing spatio-temporal factored stages to capture the hierarchical spatial\nand temporal features through the Multi- Scale Convolution and Transformer\n(MSC-Trans) hybrid block and Weight-shared Multi-Scale Transformer (WMS-Trans)\nblock. The seamless integration of these innovative designs results in a robust\nspatio-temporal representation that outperforms state-of-the-art approaches on\nRGB-D action and gesture recognition datasets.\n","authors":["Yujun Ma","Benjia Zhou","Ruili Wang","Pichao Wang"],"pdf_url":"https://arxiv.org/pdf/2308.12006v2.pdf","comment":"ACM MM'23"},{"id":"http://arxiv.org/abs/2211.07108v2","updated":"2023-09-11T03:00:32Z","published":"2022-11-14T04:51:05Z","title":"Recursive Cross-View: Use Only 2D Detectors to Achieve 3D Object\n Detection without 3D Annotations","summary":" Heavily relying on 3D annotations limits the real-world application of 3D\nobject detection. In this paper, we propose a method that does not demand any\n3D annotation, while being able to predict full-oriented 3D bounding boxes. Our\nmethod, called Recursive Cross-View (RCV), transforms 3D detection into several\n2D detection tasks, which only consume some 2D labels, based on the three-view\nprinciple. We propose a recursive paradigm, in which instance segmentation and\n3D bounding box generation by Cross-View are implemented recursively until\nconvergence. Specifically, a frustum is proposed via a 2D detector, followed by\nthe recursive paradigm that finally outputs a full-oriented 3D box, class, and\nscore. To justify that our method can be quickly used to new tasks in\nreal-world scenarios, we do three experiments, namely indoor 3D human\ndetection, full-oriented 3D hand detection, and real-time detection on a real\n3D sensor. RCV achieves decent performance in these experiments. Once trained,\nour method can be viewed as a 3D annotation tool. Consequently, we formulate\ntwo 3D labeled dataset, namely '3D_HUMAN' and 'D_HAND', based on RCV, which\ncould be used to pre-train other 3D detectors. Furthermore, estimated on the\nSUN RGB-D benchmark, our method achieves comparable performance with some full\n3D supervised learning methods. 
RCV is the first 3D detection method that does\nnot consume 3D labels and yields full-oriented 3D boxes on point clouds.\n","authors":["Shun Gui","Yan Luximon"],"pdf_url":"https://arxiv.org/pdf/2211.07108v2.pdf","comment":"10 pages, 7 figures"},{"id":"http://arxiv.org/abs/2309.05209v1","updated":"2023-09-11T02:56:56Z","published":"2023-09-11T02:56:56Z","title":"Phase-Specific Augmented Reality Guidance for Microscopic Cataract\n Surgery Using Long-Short Spatiotemporal Aggregation Transformer","summary":" Phacoemulsification cataract surgery (PCS) is a routine procedure conducted\nusing a surgical microscope, heavily reliant on the skill of the\nophthalmologist. While existing PCS guidance systems extract valuable\ninformation from surgical microscopic videos to enhance intraoperative\nproficiency, they suffer from non-phase-specific guidance, leading to redundant\nvisual information. In this study, our major contribution is the development of\na novel phase-specific augmented reality (AR) guidance system, which offers\ntailored AR information corresponding to the recognized surgical phase.\nLeveraging the inherent quasi-standardized nature of PCS procedures, we propose\na two-stage surgical microscopic video recognition network. In the first stage,\nwe implement a multi-task learning structure to segment the surgical limbus\nregion and extract limbus region-focused spatial features for each frame. In the\nsecond stage, we propose the long-short spatiotemporal aggregation transformer\n(LS-SAT) network to model local fine-grained and global temporal relationships,\nand combine the extracted spatial features to recognize the current surgical\nphase. Additionally, we collaborate closely with ophthalmologists to design AR\nvisual cues by utilizing techniques such as limbus ellipse fitting and regional\nrestricted normal cross-correlation rotation computation. We evaluated the\nnetwork on publicly available and in-house datasets, with comparison results\ndemonstrating its superior performance compared to related works. Ablation\nresults further validated the effectiveness of the limbus region-focused\nspatial feature extractor and the combination of temporal features.\nFurthermore, the developed system was evaluated in a clinical setup, with\nresults indicating remarkable accuracy and real-time performance, underscoring\nits potential for clinical applications.\n","authors":["Puxun Tu","Hongfei Ye","Haochen Shi","Jeff Young","Meng Xie","Peiquan Zhao","Ce Zheng","Xiaoyi Jiang","Xiaojun Chen"],"pdf_url":"https://arxiv.org/pdf/2309.05209v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2303.09026v6","updated":"2023-09-11T02:52:12Z","published":"2023-03-16T01:39:11Z","title":"Commonsense Knowledge Assisted Deep Learning with Application to\n Size-Related Fine-Grained Object Detection","summary":" In this paper, we focus on a scenario where a single image contains objects\nof the same category but varying sizes, and we propose a lightweight approach\nthat can not only recognize their category labels but also their real sizes.\nOur approach utilizes commonsense knowledge to assist a deep neural network\n(DNN) based coarse-grained object detector to achieve accurate size-related\nfine-grained detection. Specifically, we introduce a commonsense knowledge\ninference module (CKIM) that maps the coarse-grained labels produced by the DL\ndetector to size-related fine-grained labels.
Experimental results demonstrate\nthat our approach achieves accurate fine-grained detections with a reduced\namount of annotated data, and smaller model size, compared with baseline\nmethods. Our code is available at: https://github.com/ZJLAB-AMMI/CKIM.\n","authors":["Pu Zhang","Bin Liu"],"pdf_url":"https://arxiv.org/pdf/2303.09026v6.pdf","comment":"10 pages"},{"id":"http://arxiv.org/abs/2309.05197v1","updated":"2023-09-11T02:20:28Z","published":"2023-09-11T02:20:28Z","title":"Learning Sequential Acquisition Policies for Robot-Assisted Feeding","summary":" A robot providing mealtime assistance must perform specialized maneuvers with\nvarious utensils in order to pick up and feed a range of food items. Beyond\nthese dexterous low-level skills, an assistive robot must also plan these\nstrategies in sequence over a long horizon to clear a plate and complete a\nmeal. Previous methods in robot-assisted feeding introduce highly specialized\nprimitives for food handling without a means to compose them together.\nMeanwhile, existing approaches to long-horizon manipulation lack the\nflexibility to embed highly specialized primitives into their frameworks. We\npropose Visual Action Planning OveR Sequences (VAPORS), a framework for\nlong-horizon food acquisition. VAPORS learns a policy for high-level action\nselection by leveraging learned latent plate dynamics in simulation. To carry\nout sequential plans in the real world, VAPORS delegates action execution to\nvisually parameterized primitives. We validate our approach on complex\nreal-world acquisition trials involving noodle acquisition and bimanual\nscooping of jelly beans. Across 38 plates, VAPORS acquires much more\nefficiently than baselines, generalizes across realistic plate variations such\nas toppings and sauces, and qualitatively appeals to user feeding preferences\nin a survey conducted across 49 individuals. Code, datasets, videos, and\nsupplementary materials can be found on our website:\nhttps://sites.google.com/view/vaporsbot.\n","authors":["Priya Sundaresan","Jiajun Wu","Dorsa Sadigh"],"pdf_url":"https://arxiv.org/pdf/2309.05197v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.01961v3","updated":"2023-09-11T02:15:30Z","published":"2023-09-05T05:32:19Z","title":"NICE: CVPR 2023 Challenge on Zero-shot Image Captioning","summary":" In this report, we introduce NICE (New frontiers for zero-shot Image\nCaptioning Evaluation) project and share the results and outcomes of 2023\nchallenge. This project is designed to challenge the computer vision community\nto develop robust image captioning models that advance the state-of-the-art\nboth in terms of accuracy and fairness. Through the challenge, the image\ncaptioning models were tested using a new evaluation dataset that includes a\nlarge variety of visual concepts from many domains. There was no specific\ntraining data provided for the challenge, and therefore the challenge entries\nwere required to adapt to new types of image descriptions that had not been\nseen during training. This report includes information on the newly proposed\nNICE dataset, evaluation methods, challenge results, and technical details of\ntop-ranking entries. 
We expect that the outcomes of the challenge will\ncontribute to the improvement of AI models on various vision-language tasks.\n","authors":["Taehoon Kim","Pyunghwan Ahn","Sangyun Kim","Sihaeng Lee","Mark Marsden","Alessandra Sala","Seung Hwan Kim","Bohyung Han","Kyoung Mu Lee","Honglak Lee","Kyounghoon Bae","Xiangyu Wu","Yi Gao","Hailiang Zhang","Yang Yang","Weili Guo","Jianfeng Lu","Youngtaek Oh","Jae Won Cho","Dong-jin Kim","In So Kweon","Junmo Kim","Wooyoung Kang","Won Young Jhoo","Byungseok Roh","Jonghwan Mun","Solgil Oh","Kenan Emir Ak","Gwang-Gook Lee","Yan Xu","Mingwei Shen","Kyomin Hwang","Wonsik Shin","Kamin Lee","Wonhark Park","Dongkwan Lee","Nojun Kwak","Yujin Wang","Yimu Wang","Tiancheng Gu","Xingchang Lv","Mingmao Sun"],"pdf_url":"https://arxiv.org/pdf/2309.01961v3.pdf","comment":"Tech report, project page https://nice.lgresearch.ai/"},{"id":"http://arxiv.org/abs/2309.05192v1","updated":"2023-09-11T02:10:07Z","published":"2023-09-11T02:10:07Z","title":"Towards Viewpoint Robustness in Bird's Eye View Segmentation","summary":" Autonomous vehicles (AV) require that neural networks used for perception be\nrobust to different viewpoints if they are to be deployed across many types of\nvehicles without the repeated cost of data collection and labeling for each. AV\ncompanies typically focus on collecting data from diverse scenarios and\nlocations, but not camera rig configurations, due to cost. As a result, only a\nsmall number of rig variations exist across most fleets. In this paper, we\nstudy how AV perception models are affected by changes in camera viewpoint and\npropose a way to scale them across vehicle types without repeated data\ncollection and labeling. Using bird's eye view (BEV) segmentation as a\nmotivating task, we find through extensive experiments that existing perception\nmodels are surprisingly sensitive to changes in camera viewpoint. When trained\nwith data from one camera rig, small changes to pitch, yaw, depth, or height of\nthe camera at inference time lead to large drops in performance. We introduce a\ntechnique for novel view synthesis and use it to transform collected data to\nthe viewpoint of target rigs, allowing us to train BEV segmentation models for\ndiverse target rigs without any additional data collection or labeling cost. To\nanalyze the impact of viewpoint changes, we leverage synthetic data to mitigate\nother gaps (content, ISP, etc). Our approach is then trained on real data and\nevaluated on synthetic data, enabling evaluation on diverse target rigs. We\nrelease all data for use in future work. Our method is able to recover an\naverage of 14.7% of the IoU that is otherwise lost when deploying to new rigs.\n","authors":["Tzofi Klinghoffer","Jonah Philion","Wenzheng Chen","Or Litany","Zan Gojcic","Jungseock Joo","Ramesh Raskar","Sanja Fidler","Jose M. Alvarez"],"pdf_url":"https://arxiv.org/pdf/2309.05192v1.pdf","comment":"ICCV 2023. Project Page:\n https://nvlabs.github.io/viewpoint-robustness"},{"id":"http://arxiv.org/abs/2202.06997v5","updated":"2023-09-11T02:06:28Z","published":"2022-02-14T19:29:08Z","title":"Cross-Modality Neuroimage Synthesis: A Survey","summary":" The existence of completely aligned and paired multi-modal neuroimaging data\nhas proved its effectiveness in diagnosis of brain diseases. However,\ncollecting the full set of well-aligned and paired data is expensive or even\nimpractical, since the practical difficulties may include high cost, long time\nacquisition, image corruption, and privacy issues. 
A realistic solution is to\nexplore either unsupervised or semi-supervised learning to\nsynthesize the absent neuroimaging data. In this paper, we are the first to\ncomprehensively approach the cross-modality neuroimage synthesis task from\ndifferent perspectives, which include the level of supervision (especially\nfor weakly-supervised and unsupervised), loss functions, evaluation metrics, the\nrange of modality synthesis, datasets (aligned, private and public) and the\nsynthesis-based downstream tasks. To begin with, we highlight several open\nchallenges for cross-modality neuroimage synthesis. Then we summarize the\narchitecture of cross-modality synthesis under various levels of supervision. In\naddition, we provide an in-depth analysis of how cross-modality neuroimage\nsynthesis can improve the performance of different downstream tasks. Finally,\nwe re-evaluate the open challenges and point out the future directions for the\nremaining challenges. All resources are available at\nhttps://github.com/M-3LAB/awesome-multimodal-brain-image-systhesis\n","authors":["Guoyang Xie","Jinbao Wang","Yawen Huang","Jiayi Lyu","Feng Zheng","Yefeng Zheng","Yaochu Jin"],"pdf_url":"https://arxiv.org/pdf/2202.06997v5.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.04699v2","updated":"2023-09-11T02:00:51Z","published":"2023-08-09T04:34:21Z","title":"GIFD: A Generative Gradient Inversion Method with Feature Domain\n Optimization","summary":" Federated Learning (FL) has recently emerged as a promising distributed\nmachine learning framework to preserve clients' privacy, by allowing multiple\nclients to upload the gradients calculated from their local data to a central\nserver. Recent studies find that the exchanged gradients also carry the risk of\nprivacy leakage, e.g., an attacker can invert the shared gradients and recover\nsensitive data against an FL system by leveraging pre-trained generative\nadversarial networks (GAN) as prior knowledge. However, performing gradient\ninversion attacks in the latent space of the GAN model limits their expression\nability and generalizability. To tackle these challenges, we propose\n\\textbf{G}radient \\textbf{I}nversion over \\textbf{F}eature \\textbf{D}omains\n(GIFD), which disassembles the GAN model and searches the feature domains of\nthe intermediate layers. Instead of optimizing only over the initial latent\ncode, we progressively change the optimized layer, from the initial latent\nspace to intermediate layers closer to the output images. In addition, we\ndesign a regularizer to avoid unreal image generation by adding a small ${l_1}$\nball constraint to the searching range. We also extend GIFD to the\nout-of-distribution (OOD) setting, which weakens the assumption that the\ntraining sets of GANs and FL tasks obey the same data distribution. Extensive\nexperiments demonstrate that our method can achieve pixel-level reconstruction\nand is superior to the existing methods.
Notably, GIFD also shows great\ngeneralizability under different defense strategy settings and batch sizes.\n","authors":["Hao Fang","Bin Chen","Xuan Wang","Zhi Wang","Shu-Tao Xia"],"pdf_url":"https://arxiv.org/pdf/2308.04699v2.pdf","comment":"ICCV 2023"},{"id":"http://arxiv.org/abs/2309.05186v1","updated":"2023-09-11T01:24:13Z","published":"2023-09-11T01:24:13Z","title":"HiLM-D: Towards High-Resolution Understanding in Multimodal Large\n Language Models for Autonomous Driving","summary":" Autonomous driving systems generally employ separate models for different\ntasks, resulting in intricate designs. For the first time, we leverage singular\nmultimodal large language models (MLLMs) to consolidate multiple autonomous\ndriving tasks from videos, i.e., the Risk Object Localization and Intention and\nSuggestion Prediction (ROLISP) task. ROLISP uses natural language to\nsimultaneously identify and interpret risk objects, understand ego-vehicle\nintentions, and provide motion suggestions, eliminating the necessity for\ntask-specific architectures. However, lacking high-resolution (HR) information,\nexisting MLLMs often miss small objects (e.g., traffic cones) and overly focus\non salient ones (e.g., large trucks) when applied to ROLISP. We propose HiLM-D\n(Towards High-Resolution Understanding in MLLMs for Autonomous Driving), an\nefficient method to incorporate HR information into MLLMs for the ROLISP task.\nSpecifically, HiLM-D integrates two branches: (i) the low-resolution reasoning\nbranch, which can be any MLLM, processes low-resolution videos to caption risk\nobjects and discern ego-vehicle intentions/suggestions; (ii) the\nhigh-resolution perception branch (HR-PB), specific to HiLM-D, ingests HR\nimages to enhance detection by capturing vision-specific HR feature maps and\nprioritizing all potential risks over merely salient objects. Our HR-PB serves\nas a plug-and-play module, seamlessly fitting into current MLLMs. Experiments\non the ROLISP benchmark reveal HiLM-D's notable advantage over leading MLLMs,\nwith improvements of 4.8% in BLEU-4 for captioning and 17.2% in mIoU for\ndetection.\n","authors":["Xinpeng Ding","Jianhua Han","Hang Xu","Wei Zhang","Xiaomeng Li"],"pdf_url":"https://arxiv.org/pdf/2309.05186v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.05180v1","updated":"2023-09-11T00:32:26Z","published":"2023-09-11T00:32:26Z","title":"Our Deep CNN Face Matchers Have Developed Achromatopsia","summary":" Modern deep CNN face matchers are trained on datasets containing color\nimages. We show that such matchers achieve essentially the same accuracy on the\ngrayscale or the color version of a set of test images. We then consider\npossible causes for deep CNN face matchers ``not seeing color''. Popular\nweb-scraped face datasets actually have 30 to 60\\% of their identities with one\nor more grayscale images. We analyze whether this grayscale element in the\ntraining set impacts the accuracy achieved, and conclude that it does not.\nFurther, we show that even with a 100\\% grayscale training set, comparable\naccuracy is achieved on color or grayscale test images. Then we show that the\nskin regions of an individual's images in a web-scraped training set exhibit\nsignificant variation in their mapping to color space. This suggests that\ncolor, at least for web-scraped, in-the-wild face datasets, carries limited\nidentity-related information for training state-of-the-art matchers.
Finally,\nwe verify that comparable accuracy is achieved from training using\nsingle-channel grayscale images, implying that a larger dataset can be used\nwithin the same memory limit, with a less computationally intensive early\nlayer.\n","authors":["Aman Bhatta","Domingo Mery","Haiyu Wu","Joyce Annan","Micheal C. King","Kevin W. Bowyer"],"pdf_url":"https://arxiv.org/pdf/2309.05180v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2302.06900v2","updated":"2023-09-11T00:29:12Z","published":"2023-02-14T08:35:33Z","title":"Over-Sampling Strategy in Feature Space for Graphs based\n Class-imbalanced Bot Detection","summary":" The presence of a large number of bots in Online Social Networks (OSN) leads\nto undesirable social effects. Graph neural networks (GNNs) are effective in\ndetecting bots as they utilize user interactions. However, class-imbalanced\nissues can affect bot detection performance. To address this, we propose an\nover-sampling strategy for GNNs (OS-GNN) that generates samples for the\nminority class without edge synthesis. First, node features are mapped to a\nfeature space through neighborhood aggregation. Then, we generate samples for\nthe minority class in the feature space. Finally, the augmented features are\nused to train the classifiers. This framework is general and can be easily\nextended into different GNN architectures. The proposed framework is evaluated\nusing three real-world bot detection benchmark datasets, and it consistently\nexhibits superiority over the baselines.\n","authors":["Shuhao Shi","Kai Qiao","Jie Yang","Baojie Song","Jian Chen","Bin Yan"],"pdf_url":"https://arxiv.org/pdf/2302.06900v2.pdf","comment":"5 pages, 4 figures"},{"id":"http://arxiv.org/abs/2309.05173v1","updated":"2023-09-11T00:02:05Z","published":"2023-09-11T00:02:05Z","title":"DePT: Decomposed Prompt Tuning for Parameter-Efficient Fine-tuning","summary":" Prompt tuning (PT), where a small amount of trainable soft (continuous)\nprompt vectors is affixed to the input of language models (LM), has shown\npromising results across various tasks and models for parameter-efficient\nfine-tuning (PEFT). PT stands out from other PEFT approaches because it\nmaintains competitive performance with fewer trainable parameters and does not\ndrastically scale up its parameters as the model size expands. However, PT\nintroduces additional soft prompt tokens, leading to longer input sequences,\nwhich significantly impacts training and inference time and memory usage due to\nthe Transformer's quadratic complexity. Particularly concerning for Large\nLanguage Models (LLMs) that face heavy daily querying. To address this issue,\nwe propose Decomposed Prompt Tuning (DePT), which decomposes the soft prompt\ninto a shorter soft prompt and a pair of low-rank matrices that are then\noptimised with two different learning rates. This allows DePT to achieve better\nperformance while saving over 20% memory and time costs compared to vanilla PT\nand its variants, without changing trainable parameter sizes. Through extensive\nexperiments on 23 natural language processing (NLP) and vision-language (VL)\ntasks, we demonstrate that DePT outperforms state-of-the-art PEFT approaches,\nincluding the full fine-tuning baseline in some scenarios. 
Additionally, we\nempirically show that DEPT grows more efficient as the model size increases.\nOur further study reveals that DePT integrates seamlessly with\nparameter-efficient transfer learning in the few-shot learning setting and\nhighlights its adaptability to various model architectures and sizes.\n","authors":["Zhengxiang Shi","Aldo Lipani"],"pdf_url":"https://arxiv.org/pdf/2309.05173v1.pdf","comment":"Code is available at https://github.com/ZhengxiangShi/DePT"},{"id":"http://arxiv.org/abs/2308.09437v2","updated":"2023-09-11T22:49:08Z","published":"2023-08-18T10:07:46Z","title":"From Hope to Safety: Unlearning Biases of Deep Models by Enforcing the\n Right Reasons in Latent Space","summary":" Deep Neural Networks are prone to learning spurious correlations embedded in\nthe training data, leading to potentially biased predictions. This poses risks\nwhen deploying these models for high-stake decision-making, such as in medical\napplications. Current methods for post-hoc model correction either require\ninput-level annotations, which are only possible for spatially localized\nbiases, or augment the latent feature space, thereby hoping to enforce the\nright reasons. We present a novel method ensuring the right reasons on the\nconcept level by reducing the model's sensitivity towards biases through the\ngradient. When modeling biases via Concept Activation Vectors, we highlight the\nimportance of choosing robust directions, as traditional regression-based\napproaches such as Support Vector Machines tend to result in diverging\ndirections. We effectively mitigate biases in controlled and real-world\nsettings on the ISIC, Bone Age, ImageNet and CelebA datasets using VGG, ResNet\nand EfficientNet architectures.\n","authors":["Maximilian Dreyer","Frederik Pahde","Christopher J. Anders","Wojciech Samek","Sebastian Lapuschkin"],"pdf_url":"https://arxiv.org/pdf/2308.09437v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.05857v1","updated":"2023-09-11T22:41:52Z","published":"2023-09-11T22:41:52Z","title":"Radiomics Boosts Deep Learning Model for IPMN Classification","summary":" Intraductal Papillary Mucinous Neoplasm (IPMN) cysts are pre-malignant\npancreas lesions, and they can progress into pancreatic cancer. Therefore,\ndetecting and stratifying their risk level is of ultimate importance for\neffective treatment planning and disease control. However, this is a highly\nchallenging task because of the diverse and irregular shape, texture, and size\nof the IPMN cysts as well as the pancreas. In this study, we propose a novel\ncomputer-aided diagnosis pipeline for IPMN risk classification from\nmulti-contrast MRI scans. Our proposed analysis framework includes an efficient\nvolumetric self-adapting segmentation strategy for pancreas delineation,\nfollowed by a newly designed deep learning-based classification scheme with a\nradiomics-based predictive approach. We test our proposed decision-fusion model\nin multi-center data sets of 246 multi-contrast MRI scans and obtain superior\nperformance to the state of the art (SOTA) in this field. Our ablation studies\ndemonstrate the significance of both radiomics and deep learning modules for\nachieving the new SOTA performance compared to international guidelines and\npublished studies (81.9\\% vs 61.3\\% in accuracy). Our findings have important\nimplications for clinical decision-making. 
In a series of rigorous experiments\non multi-center data sets (246 MRI scans from five centers), we achieved\nunprecedented performance (81.9\\% accuracy).\n","authors":["Lanhong Yao","Zheyuan Zhang","Ugur Demir","Elif Keles","Camila Vendrami","Emil Agarunov","Candice Bolan","Ivo Schoots","Marc Bruno","Rajesh Keswani","Frank Miller","Tamas Gonda","Cemal Yazici","Temel Tirkes","Michael Wallace","Concetto Spampinato","Ulas Bagci"],"pdf_url":"https://arxiv.org/pdf/2309.05857v1.pdf","comment":"10 pages, MICCAI MLMI 2023"},{"id":"http://arxiv.org/abs/2309.05840v1","updated":"2023-09-11T21:53:34Z","published":"2023-09-11T21:53:34Z","title":"Self-Correlation and Cross-Correlation Learning for Few-Shot Remote\n Sensing Image Semantic Segmentation","summary":" Remote sensing image semantic segmentation is an important problem for remote\nsensing image interpretation. Although remarkable progress has been achieved,\nexisting deep neural network methods suffer from the reliance on massive\ntraining data. Few-shot remote sensing semantic segmentation aims at learning\nto segment target objects from a query image using only a few annotated support\nimages of the target class. Most existing few-shot learning methods stem\nprimarily from their sole focus on extracting information from support images,\nthereby failing to effectively address the large variance in appearance and\nscales of geographic objects. To tackle these challenges, we propose a\nSelf-Correlation and Cross-Correlation Learning Network for the few-shot remote\nsensing image semantic segmentation. Our model enhances the generalization by\nconsidering both self-correlation and cross-correlation between support and\nquery images to make segmentation predictions. To further explore the\nself-correlation with the query image, we propose to adopt a classical spectral\nmethod to produce a class-agnostic segmentation mask based on the basic visual\ninformation of the image. Extensive experiments on two remote sensing image\ndatasets demonstrate the effectiveness and superiority of our model in few-shot\nremote sensing image semantic segmentation. Code and models will be accessed at\nhttps://github.com/linhanwang/SCCNe.\n","authors":["Linhan Wang","Shuo Lei","Jianfeng He","Shengkun Wang","Min Zhang","Chang-Tien Lu"],"pdf_url":"https://arxiv.org/pdf/2309.05840v1.pdf","comment":"10 pages, 6 figures. Accepted to Sigspatial 2023. arXiv admin note:\n text overlap with arXiv:2104.01538 by other authors"},{"id":"http://arxiv.org/abs/2309.05834v1","updated":"2023-09-11T21:32:13Z","published":"2023-09-11T21:32:13Z","title":"SCD-Net: Spatiotemporal Clues Disentanglement Network for\n Self-supervised Skeleton-based Action Recognition","summary":" Contrastive learning has achieved great success in skeleton-based action\nrecognition. However, most existing approaches encode the skeleton sequences as\nentangled spatiotemporal representations and confine the contrasts to the same\nlevel of representation. Instead, this paper introduces a novel contrastive\nlearning framework, namely Spatiotemporal Clues Disentanglement Network\n(SCD-Net). Specifically, we integrate the decoupling module with a feature\nextractor to derive explicit clues from spatial and temporal domains\nrespectively. As for the training of SCD-Net, with a constructed global anchor,\nwe encourage the interaction between the anchor and extracted clues. 
Further,\nwe propose a new masking strategy with structural constraints to strengthen the\ncontextual associations, leveraging the latest development from masked image\nmodelling into the proposed SCD-Net. We conduct extensive evaluations on the\nNTU-RGB+D (60&120) and PKU-MMD (I&II) datasets, covering various downstream\ntasks such as action recognition, action retrieval, transfer learning, and\nsemi-supervised learning. The experimental results demonstrate the\neffectiveness of our method, which outperforms the existing state-of-the-art\n(SOTA) approaches significantly.\n","authors":["Cong Wu","Xiao-Jun Wu","Josef Kittler","Tianyang Xu","Sara Atito","Muhammad Awais","Zhenhua Feng"],"pdf_url":"https://arxiv.org/pdf/2309.05834v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2010.15675v4","updated":"2023-09-11T21:25:14Z","published":"2020-10-28T03:20:34Z","title":"Deep DA for Ordinal Regression of Pain Intensity Estimation Using\n Weakly-Labeled Videos","summary":" Automatic estimation of pain intensity from facial expressions in videos has\nan immense potential in health care applications. However, domain adaptation\n(DA) is needed to alleviate the problem of domain shifts that typically occurs\nbetween video data captured in source and target domains. Given the laborious\ntask of collecting and annotating videos, and the subjective bias due to\nambiguity among adjacent intensity levels, weakly-supervised learning (WSL) is\ngaining attention in such applications. Yet, most state-of-the-art WSL models\nare typically formulated as regression problems, and do not leverage the\nordinal relation between intensity levels, nor the temporal coherence of\nmultiple consecutive frames. This paper introduces a new deep learning model\nfor weakly-supervised DA with ordinal regression (WSDA-OR), where videos in\ntarget domain have coarse labels provided on a periodic basis. The WSDA-OR\nmodel enforces ordinal relationships among the intensity levels assigned to\nthe target sequences, and associates multiple relevant frames to sequence-level\nlabels (instead of a single frame). In particular, it learns discriminant and\ndomain-invariant feature representations by integrating multiple instance\nlearning with deep adversarial DA, where soft Gaussian labels are used to\nefficiently represent the weak ordinal sequence-level labels from the target\ndomain. The proposed approach was validated on the RECOLA video dataset as\nfully-labeled source domain, and UNBC-McMaster video data as weakly-labeled\ntarget domain. We have also validated WSDA-OR on BIOVID and Fatigue (private)\ndatasets for sequence level estimation. Experimental results indicate that our\napproach can provide a significant improvement over the state-of-the-art\nmodels, allowing to achieve a greater localization accuracy.\n","authors":["Gnana Praveen R","Eric Granger","Patrick Cardinal"],"pdf_url":"https://arxiv.org/pdf/2010.15675v4.pdf","comment":"due to multiple submission"},{"id":"http://arxiv.org/abs/2309.05832v1","updated":"2023-09-11T21:18:15Z","published":"2023-09-11T21:18:15Z","title":"Instance-Agnostic Geometry and Contact Dynamics Learning","summary":" This work presents an instance-agnostic learning framework that fuses vision\nwith dynamics to simultaneously learn shape, pose trajectories and physical\nproperties via the use of geometry as a shared representation. 
Unlike many\ncontact learning approaches that assume motion capture input and a known shape\nprior for the collision model, our proposed framework learns an object's\ngeometric and dynamic properties from RGBD video, without requiring either\ncategory-level or instance-level shape priors. We integrate a vision system,\nBundleSDF, with a dynamics system, ContactNets and propose a cyclic training\npipeline to use the output from the dynamics module to refine the poses and the\ngeometry from the vision module, using perspective reprojection. Experiments\ndemonstrate our framework's ability to learn the geometry and dynamics of rigid\nand convex objects and improve upon the current tracking framework.\n","authors":["Mengti Sun","Bowen Jiang","Bibit Bianchini","Camillo Jose Taylor","Michael Posa"],"pdf_url":"https://arxiv.org/pdf/2309.05832v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.05829v1","updated":"2023-09-11T21:16:41Z","published":"2023-09-11T21:16:41Z","title":"Mobile Vision Transformer-based Visual Object Tracking","summary":" The introduction of robust backbones, such as Vision Transformers, has\nimproved the performance of object tracking algorithms in recent years.\nHowever, these state-of-the-art trackers are computationally expensive since\nthey have a large number of model parameters and rely on specialized hardware\n(e.g., GPU) for faster inference. On the other hand, recent lightweight\ntrackers are fast but are less accurate, especially on large-scale datasets. We\npropose a lightweight, accurate, and fast tracking algorithm using Mobile\nVision Transformers (MobileViT) as the backbone for the first time. We also\npresent a novel approach of fusing the template and search region\nrepresentations in the MobileViT backbone, thereby generating superior feature\nencoding for target localization. The experimental results show that our\nMobileViT-based Tracker, MVT, surpasses the performance of recent lightweight\ntrackers on the large-scale datasets GOT10k and TrackingNet, and with a high\ninference speed. In addition, our method outperforms the popular DiMP-50\ntracker despite having 4.7 times fewer model parameters and running at 2.8\ntimes its speed on a GPU. The tracker code and models are available at\nhttps://github.com/goutamyg/MVT\n","authors":["Goutam Yelluru Gopal","Maria A. Amer"],"pdf_url":"https://arxiv.org/pdf/2309.05829v1.pdf","comment":"Accepted by BMVC2023. Code available at\n https://github.com/goutamyg/MVT"},{"id":"http://arxiv.org/abs/2309.05826v1","updated":"2023-09-11T21:11:48Z","published":"2023-09-11T21:11:48Z","title":"KD-FixMatch: Knowledge Distillation Siamese Neural Networks","summary":" Semi-supervised learning (SSL) has become a crucial approach in deep learning\nas a way to address the challenge of limited labeled data. The success of deep\nneural networks heavily relies on the availability of large-scale high-quality\nlabeled data. However, the process of data labeling is time-consuming and\nunscalable, leading to shortages in labeled data. SSL aims to tackle this\nproblem by leveraging additional unlabeled data in the training process. One of\nthe popular SSL algorithms, FixMatch, trains identical weight-sharing teacher\nand student networks simultaneously using a siamese neural network (SNN).\nHowever, it is prone to performance degradation when the pseudo labels are\nheavily noisy in the early training stage. 
We present KD-FixMatch, a novel SSL\nalgorithm that addresses the limitations of FixMatch by incorporating knowledge\ndistillation. The algorithm utilizes a combination of sequential and\nsimultaneous training of SNNs to enhance performance and reduce performance\ndegradation. Firstly, an outer SNN is trained using labeled and unlabeled data.\nAfter that, the network of the well-trained outer SNN generates pseudo labels\nfor the unlabeled data, from which a subset of unlabeled data with trusted\npseudo labels is then carefully created through high-confidence sampling and\ndeep embedding clustering. Finally, an inner SNN is trained with the labeled\ndata, the unlabeled data, and the subset of unlabeled data with trusted pseudo\nlabels. Experiments on four public data sets demonstrate that KD-FixMatch\noutperforms FixMatch in all cases. Our results indicate that KD-FixMatch has a\nbetter training starting point that leads to improved model performance\ncompared to FixMatch.\n","authors":["Chien-Chih Wang","Shaoyuan Xu","Jinmiao Fu","Yang Liu","Bryan Wang"],"pdf_url":"https://arxiv.org/pdf/2309.05826v1.pdf","comment":"5 pages, 1 figure, 5 tables. To be published in ICIP 2023"},{"id":"http://arxiv.org/abs/2309.05818v1","updated":"2023-09-11T20:51:21Z","published":"2023-09-11T20:51:21Z","title":"Rice Plant Disease Detection and Diagnosis using Deep Convolutional\n Neural Networks and Multispectral Imaging","summary":" Rice is considered a strategic crop in Egypt as it is regularly consumed in\nthe Egyptian people's diet. Even though Egypt is the highest rice producer in\nAfrica with a share of 6 million tons per year, it still imports rice to\nsatisfy its local needs due to production loss, especially due to rice disease.\nRice blast disease is responsible for 30% loss in rice production worldwide.\nTherefore, it is crucial to target limiting yield damage by detecting rice\ncrop diseases in their early stages. This paper introduces a public\nmultispectral and RGB images dataset and a deep learning pipeline for rice\nplant disease detection using multi-modal data. The collected multispectral\nimages consist of Red, Green and Near-Infrared channels and we show that using\nmultispectral along with RGB channels as input achieves a higher F1 accuracy\ncompared to using RGB input only.\n","authors":["Yara Ali Alnaggar","Ahmad Sebaq","Karim Amer","ElSayed Naeem","Mohamed Elhelw"],"pdf_url":"https://arxiv.org/pdf/2309.05818v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.05810v1","updated":"2023-09-11T20:28:18Z","published":"2023-09-11T20:28:18Z","title":"SHIFT3D: Synthesizing Hard Inputs For Tricking 3D Detectors","summary":" We present SHIFT3D, a differentiable pipeline for generating 3D shapes that\nare structurally plausible yet challenging to 3D object detectors. In\nsafety-critical applications like autonomous driving, discovering such novel\nchallenging objects can offer insight into unknown vulnerabilities of 3D\ndetectors. By representing objects with a signed distance function (SDF), we\nshow that gradient error signals allow us to smoothly deform the shape or pose\nof a 3D object in order to confuse a downstream 3D detector. Importantly, the\nobjects generated by SHIFT3D physically differ from the baseline object yet\nretain a semantically recognizable shape. 
Our approach provides interpretable\nfailure modes for modern 3D object detectors, and can aid in preemptive\ndiscovery of potential safety risks within 3D perception systems before these\nrisks become critical failures.\n","authors":["Hongge Chen","Zhao Chen","Gregory P. Meyer","Dennis Park","Carl Vondrick","Ashish Shrivastava","Yuning Chai"],"pdf_url":"https://arxiv.org/pdf/2309.05810v1.pdf","comment":"Accepted by ICCV 2023"},{"id":"http://arxiv.org/abs/2309.05809v1","updated":"2023-09-11T20:26:40Z","published":"2023-09-11T20:26:40Z","title":"Divergences in Color Perception between Deep Neural Networks and Humans","summary":" Deep neural networks (DNNs) are increasingly proposed as models of human\nvision, bolstered by their impressive performance on image classification and\nobject recognition tasks. Yet, the extent to which DNNs capture fundamental\naspects of human vision such as color perception remains unclear. Here, we\ndevelop novel experiments for evaluating the perceptual coherence of color\nembeddings in DNNs, and we assess how well these algorithms predict human color\nsimilarity judgments collected via an online survey. We find that\nstate-of-the-art DNN architectures $-$ including convolutional neural networks\nand vision transformers $-$ provide color similarity judgments that strikingly\ndiverge from human color judgments of (i) images with controlled color\nproperties, (ii) images generated from online searches, and (iii) real-world\nimages from the canonical CIFAR-10 dataset. We compare DNN performance against\nan interpretable and cognitively plausible model of color perception based on\nwavelet decomposition, inspired by foundational theories in computational\nneuroscience. While one deep learning model $-$ a convolutional DNN trained on\na style transfer task $-$ captures some aspects of human color perception, our\nwavelet algorithm provides more coherent color embeddings that better predict\nhuman color judgments compared to all DNNs we examine. These results hold when\naltering the high-level visual task used to train similar DNN architectures\n(e.g., image classification versus image segmentation), as well as when\nexamining the color embeddings of different layers in a given DNN architecture.\nThese findings break new ground in the effort to analyze the perceptual\nrepresentations of machine learning algorithms and to improve their ability to\nserve as cognitively plausible models of human vision. Implications for machine\nlearning, human perception, and embodied cognition are discussed.\n","authors":["Ethan O. Nadler","Elise Darragh-Ford","Bhargav Srinivasa Desikan","Christian Conaway","Mark Chu","Tasker Hull","Douglas Guilbeault"],"pdf_url":"https://arxiv.org/pdf/2309.05809v1.pdf","comment":"22 pages, 8 figures + SI Appendix; to appear in Cognition"},{"id":"http://arxiv.org/abs/2309.03905v2","updated":"2023-09-11T20:25:16Z","published":"2023-09-07T17:59:45Z","title":"ImageBind-LLM: Multi-modality Instruction Tuning","summary":" We present ImageBind-LLM, a multi-modality instruction tuning method of large\nlanguage models (LLMs) via ImageBind. Existing works mainly focus on language\nand image instruction tuning, different from which, our ImageBind-LLM can\nrespond to multi-modality conditions, including audio, 3D point clouds, video,\nand their embedding-space arithmetic by only image-text alignment training.\nDuring training, we adopt a learnable bind network to align the embedding space\nbetween LLaMA and ImageBind's image encoder. 
Then, the image features\ntransformed by the bind network are added to word tokens of all layers in\nLLaMA, which progressively injects visual instructions via an attention-free\nand zero-initialized gating mechanism. Aided by the joint embedding of\nImageBind, the simple image-text training enables our model to exhibit superior\nmulti-modality instruction-following capabilities. During inference, the\nmulti-modality inputs are fed into the corresponding ImageBind encoders, and\nprocessed by a proposed visual cache model for further cross-modal embedding\nenhancement. The training-free cache model retrieves from three million image\nfeatures extracted by ImageBind, which effectively mitigates the\ntraining-inference modality discrepancy. Notably, with our approach,\nImageBind-LLM can respond to instructions of diverse modalities and demonstrate\nsignificant language generation quality. Code is released at\nhttps://github.com/OpenGVLab/LLaMA-Adapter.\n","authors":["Jiaming Han","Renrui Zhang","Wenqi Shao","Peng Gao","Peng Xu","Han Xiao","Kaipeng Zhang","Chris Liu","Song Wen","Ziyu Guo","Xudong Lu","Shuai Ren","Yafei Wen","Xiaoxin Chen","Xiangyu Yue","Hongsheng Li","Yu Qiao"],"pdf_url":"https://arxiv.org/pdf/2309.03905v2.pdf","comment":"Code is available at https://github.com/OpenGVLab/LLaMA-Adapter"},{"id":"http://arxiv.org/abs/2108.00596v6","updated":"2023-09-11T20:10:55Z","published":"2021-08-02T02:06:33Z","title":"GTNet:Guided Transformer Network for Detecting Human-Object Interactions","summary":" The human-object interaction (HOI) detection task refers to localizing\nhumans, localizing objects, and predicting the interactions between each\nhuman-object pair. HOI is considered one of the fundamental steps in truly\nunderstanding complex visual scenes. For detecting HOI, it is important to\nutilize relative spatial configurations and object semantics to find salient\nspatial regions of images that highlight the interactions between human object\npairs. This issue is addressed by the novel self-attention based guided\ntransformer network, GTNet. GTNet encodes this spatial contextual information\nin human and object visual features via self-attention while achieving state of\nthe art results on both the V-COCO and HICO-DET datasets. Code will be made\navailable online.\n","authors":["A S M Iftekhar","Satish Kumar","R. Austin McEver","Suya You","B. S. Manjunath"],"pdf_url":"https://arxiv.org/pdf/2108.00596v6.pdf","comment":"accepted for presentation in Pattern Recognition and Tracking XXXIV\n at SPIE commerce+ defence Program"},{"id":"http://arxiv.org/abs/2101.09858v2","updated":"2023-09-11T20:10:49Z","published":"2021-01-25T02:31:49Z","title":"Weakly Supervised Learning for Facial Behavior Analysis : A Review","summary":" In the recent years, there has been a shift in facial behavior analysis from\nthe laboratory-controlled conditions to the challenging in-the-wild conditions\ndue to the superior performance of deep learning based approaches for many real\nworld applications.However, the performance of deep learning approaches relies\non the amount of training data. One of the major problems with data acquisition\nis the requirement of annotations for large amount of training data. 
The labeling\nprocess of huge training data demands a lot of human support with strong domain\nexpertise for facial expressions or action units, which is difficult to obtain\nin real-time environments. Moreover, the labeling process is highly vulnerable to\nambiguity of expressions or action units, especially for intensities due to the\nbias induced by the domain experts. Therefore, there is an imperative need to\naddress the problem of facial behavior analysis with weak annotations. In this\npaper, we provide a comprehensive review of weakly supervised learning (WSL)\napproaches for facial behavior analysis with both categorical and\ndimensional labels along with the challenges and potential research directions\nassociated with it. First, we introduce various types of weak annotations in\nthe context of facial behavior analysis and the corresponding challenges\nassociated with them. We then systematically review the existing state-of-the-art\napproaches and provide a taxonomy of these approaches along with their insights\nand limitations. In addition, widely used datasets in the reviewed literature\nand the performance of these approaches along with evaluation principles are\nsummarized. Finally, we discuss the remaining challenges and opportunities\nalong with the potential research directions in order to apply facial behavior\nanalysis with weak labels in real life situations.\n","authors":["Gnana Praveen R","Eric Granger","Patrick Cardinal"],"pdf_url":"https://arxiv.org/pdf/2101.09858v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.05793v1","updated":"2023-09-11T19:59:43Z","published":"2023-09-11T19:59:43Z","title":"PhotoVerse: Tuning-Free Image Customization with Text-to-Image Diffusion\n Models","summary":" Personalized text-to-image generation has emerged as a powerful and\nsought-after tool, empowering users to create customized images based on their\nspecific concepts and prompts. However, existing approaches to personalization\nencounter multiple challenges, including long tuning times, large storage\nrequirements, the necessity for multiple input images per identity, and\nlimitations in preserving identity and editability. To address these obstacles,\nwe present PhotoVerse, an innovative methodology that incorporates a\ndual-branch conditioning mechanism in both text and image domains, providing\neffective control over the image generation process. Furthermore, we introduce\nfacial identity loss as a novel component to enhance the preservation of\nidentity during training. Remarkably, our proposed PhotoVerse eliminates the\nneed for test time tuning and relies solely on a single facial photo of the\ntarget identity, significantly reducing the resource cost associated with image\ngeneration. After a single training phase, our approach enables generating\nhigh-quality images within only a few seconds. Moreover, our method can produce\ndiverse images that encompass various scenes and styles. The extensive\nevaluation demonstrates the superior performance of our approach, which\nachieves the dual objectives of preserving identity and facilitating\neditability. 
Project page: https://photoverse2d.github.io/\n","authors":["Li Chen","Mengyi Zhao","Yiheng Liu","Mingxu Ding","Yangyang Song","Shizun Wang","Xu Wang","Hao Yang","Jing Liu","Kang Du","Min Zheng"],"pdf_url":"https://arxiv.org/pdf/2309.05793v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.04001v2","updated":"2023-09-11T19:34:43Z","published":"2023-09-07T20:07:57Z","title":"Multimodal Transformer for Material Segmentation","summary":" Leveraging information across diverse modalities is known to enhance\nperformance on multimodal segmentation tasks. However, effectively fusing\ninformation from different modalities remains challenging due to the unique\ncharacteristics of each modality. In this paper, we propose a novel fusion\nstrategy that can effectively fuse information from different combinations of\nfour different modalities: RGB, Angle of Linear Polarization (AoLP), Degree of\nLinear Polarization (DoLP) and Near-Infrared (NIR). We also propose a new model\nnamed Multi-Modal Segmentation Transformer (MMSFormer) that incorporates the\nproposed fusion strategy to perform multimodal material segmentation. MMSFormer\nachieves 52.05% mIoU outperforming the current state-of-the-art on Multimodal\nMaterial Segmentation (MCubeS) dataset. For instance, our method provides\nsignificant improvement in detecting gravel (+10.4%) and human (+9.1%) classes.\nAblation studies show that different modules in the fusion block are crucial\nfor overall model performance. Furthermore, our ablation studies also highlight\nthe capacity of different input modalities to improve performance in the\nidentification of different types of materials. The code and pretrained models\nwill be made available at https://github.com/csiplab/MMSFormer.\n","authors":["Md Kaykobad Reza","Ashley Prater-Bennette","M. Salman Asif"],"pdf_url":"https://arxiv.org/pdf/2309.04001v2.pdf","comment":"9 pages, 3 figures"},{"id":"http://arxiv.org/abs/2309.05782v1","updated":"2023-09-11T19:29:26Z","published":"2023-09-11T19:29:26Z","title":"Blendshapes GHUM: Real-time Monocular Facial Blendshape Prediction","summary":" We present Blendshapes GHUM, an on-device ML pipeline that predicts 52 facial\nblendshape coefficients at 30+ FPS on modern mobile phones, from a single\nmonocular RGB image and enables facial motion capture applications like virtual\navatars. Our main contributions are: i) an annotation-free offline method for\nobtaining blendshape coefficients from real-world human scans, ii) a\nlightweight real-time model that predicts blendshape coefficients based on\nfacial landmarks.\n","authors":["Ivan Grishchenko","Geng Yan","Eduard Gabriel Bazavan","Andrei Zanfir","Nikolai Chinaev","Karthik Raveendran","Matthias Grundmann","Cristian Sminchisescu"],"pdf_url":"https://arxiv.org/pdf/2309.05782v1.pdf","comment":"4 pages, 3 figures"},{"id":"http://arxiv.org/abs/2309.05780v1","updated":"2023-09-11T19:24:40Z","published":"2023-09-11T19:24:40Z","title":"LUNet: Deep Learning for the Segmentation of Arterioles and Venules in\n High Resolution Fundus Images","summary":" The retina is the only part of the human body in which blood vessels can be\naccessed non-invasively using imaging techniques such as digital fundus images\n(DFI). The spatial distribution of the retinal microvasculature may change with\ncardiovascular diseases and thus the eyes may be regarded as a window to our\nhearts. Computerized segmentation of the retinal arterioles and venules (A/V)\nis essential for automated microvasculature analysis. 
Using active learning, we\ncreated a new DFI dataset containing 240 crowd-sourced manual A/V segmentations\nperformed by fifteen medical students and reviewed by an ophthalmologist, and\ndeveloped LUNet, a novel deep learning architecture for high resolution A/V\nsegmentation. LUNet architecture includes a double dilated convolutional block\nthat aims to enhance the receptive field of the model and reduce its parameter\ncount. Furthermore, LUNet has a long tail that operates at high resolution to\nrefine the segmentation. The custom loss function emphasizes the continuity of\nthe blood vessels. LUNet is shown to significantly outperform two\nstate-of-the-art segmentation algorithms on the local test set as well as on\nfour external test sets simulating distribution shifts across ethnicity,\ncomorbidities, and annotators. We make the newly created dataset open access\n(upon publication).\n","authors":["Jonathan Fhima","Jan Van Eijgen","Hana Kulenovic","Valérie Debeuf","Marie Vangilbergen","Marie-Isaline Billen","Heloïse Brackenier","Moti Freiman","Ingeborg Stalmans","Joachim A. Behar"],"pdf_url":"https://arxiv.org/pdf/2309.05780v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2305.06456v3","updated":"2023-09-11T19:05:13Z","published":"2023-05-10T20:51:37Z","title":"Perpetual Humanoid Control for Real-time Simulated Avatars","summary":" We present a physics-based humanoid controller that achieves high-fidelity\nmotion imitation and fault-tolerant behavior in the presence of noisy input\n(e.g. pose estimates from video or generated from language) and unexpected\nfalls. Our controller scales up to learning ten thousand motion clips without\nusing any external stabilizing forces and learns to naturally recover from\nfail-state. Given reference motion, our controller can perpetually control\nsimulated avatars without requiring resets. At its core, we propose the\nprogressive multiplicative control policy (PMCP), which dynamically allocates\nnew network capacity to learn harder and harder motion sequences. PMCP allows\nefficient scaling for learning from large-scale motion databases and adding new\ntasks, such as fail-state recovery, without catastrophic forgetting. We\ndemonstrate the effectiveness of our controller by using it to imitate noisy\nposes from video-based pose estimators and language-based motion generators in\na live and real-time multi-person avatar use case.\n","authors":["Zhengyi Luo","Jinkun Cao","Alexander Winkler","Kris Kitani","Weipeng Xu"],"pdf_url":"https://arxiv.org/pdf/2305.06456v3.pdf","comment":"ICCV 2023. Project page: https://zhengyiluo.github.io/PHC/"},{"id":"http://arxiv.org/abs/2309.05756v1","updated":"2023-09-11T18:35:14Z","published":"2023-09-11T18:35:14Z","title":"TransferDoc: A Self-Supervised Transferable Document Representation\n Learning Model Unifying Vision and Language","summary":" The field of visual document understanding has witnessed a rapid growth in\nemerging challenges and powerful multi-modal strategies. However, they rely on\nan extensive amount of document data to learn their pretext objectives in a\n``pre-train-then-fine-tune'' paradigm and thus, suffer a significant\nperformance drop in real-world online industrial settings. One major reason is\nthe over-reliance on OCR engines to extract local positional information within\na document page. Therefore, this hinders the model's generalizability,\nflexibility and robustness due to the lack of capturing global information\nwithin a document image. 
We introduce TransferDoc, a cross-modal\ntransformer-based architecture pre-trained in a self-supervised fashion using\nthree novel pretext objectives. TransferDoc learns richer semantic concepts by\nunifying language and visual representations, which enables the production of\nmore transferable models. Besides, two novel downstream tasks have been\nintroduced for a ``closer-to-real'' industrial evaluation scenario where\nTransferDoc outperforms other state-of-the-art approaches.\n","authors":["Souhail Bakkali","Sanket Biswas","Zuheng Ming","Mickael Coustaty","Marçal Rusiñol","Oriol Ramos Terrades","Josep Lladós"],"pdf_url":"https://arxiv.org/pdf/2309.05756v1.pdf","comment":"Preprint to Pattern Recognition"},{"id":"http://arxiv.org/abs/2309.05747v1","updated":"2023-09-11T18:11:38Z","published":"2023-09-11T18:11:38Z","title":"Evaluating the Reliability of CNN Models on Classifying Traffic and Road\n Signs using LIME","summary":" The objective of this investigation is to evaluate and contrast the\neffectiveness of four state-of-the-art pre-trained models, ResNet-34, VGG-19,\nDenseNet-121, and Inception V3, in classifying traffic and road signs with the\nutilization of the GTSRB public dataset. The study focuses on evaluating the\naccuracy of these models' predictions as well as their ability to employ\nappropriate features for image categorization. To gain insights into the\nstrengths and limitations of the model's predictions, the study employs the\nlocal interpretable model-agnostic explanations (LIME) framework. The findings\nof this experiment indicate that LIME is a crucial tool for improving the\ninterpretability and dependability of machine learning models for image\nidentification, regardless of the models achieving an f1 score of 0.99 on\nclassifying traffic and road signs. The conclusion of this study has important\nramifications for how these models are used in practice, as it is crucial to\nensure that model predictions are founded on the pertinent image features.\n","authors":["Md. Atiqur Rahman","Ahmed Saad Tanim","Sanjid Islam","Fahim Pranto","G. M. Shahariar","Md. Tanvir Rouf Shawon"],"pdf_url":"https://arxiv.org/pdf/2309.05747v1.pdf","comment":"Accepted for publication in the 2nd International Conference on Big\n Data, IoT and Machine Learning (BIM 2023), 16 pages, 8 figures"},{"id":"http://arxiv.org/abs/2211.06560v3","updated":"2023-09-11T17:00:56Z","published":"2022-11-12T03:49:22Z","title":"PatchRefineNet: Improving Binary Segmentation by Incorporating Signals\n from Optimal Patch-wise Binarization","summary":" The purpose of binary segmentation models is to determine which pixels belong\nto an object of interest (e.g., which pixels in an image are part of roads).\nThe models assign a logit score (i.e., probability) to each pixel and these are\nconverted into predictions by thresholding (i.e., each pixel with logit score\n$\\geq \\tau$ is predicted to be part of a road). However, a common phenomenon in\ncurrent and former state-of-the-art segmentation models is spatial bias -- in\nsome patches, the logit scores are consistently biased upwards and in others\nthey are consistently biased downwards. These biases cause false positives and\nfalse negatives in the final predictions. In this paper, we propose\nPatchRefineNet (PRN), a small network that sits on top of a base segmentation\nmodel and learns to correct its patch-specific biases. Across a wide variety of\nbase models, PRN consistently helps them improve mIoU by 2-3\\%. 
One of the key\nideas behind PRN is the addition of a novel supervision signal during training.\nGiven the logit scores produced by the base segmentation model, each pixel is\ngiven a pseudo-label that is obtained by optimally thresholding the logit\nscores in each image patch. Incorporating these pseudo-labels into the loss\nfunction of PRN helps correct systematic biases and reduce false\npositives/negatives. Although we mainly focus on binary segmentation, we also\nshow how PRN can be extended to saliency detection and few-shot segmentation.\nWe also discuss how the ideas can be extended to multiclass segmentation.\n","authors":["Savinay Nagendra","Chaopeng Shen","Daniel Kifer"],"pdf_url":"https://arxiv.org/pdf/2211.06560v3.pdf","comment":"16 pages, 12 figures, 7 tables (Added supplementary material)"}],"Information Retrieval":[{"id":"http://arxiv.org/abs/2309.05537v1","updated":"2023-09-11T15:19:57Z","published":"2023-09-11T15:19:57Z","title":"D2WFP: A Novel Protocol for Forensically Identifying, Extracting, and\n Analysing Deep and Dark Web Browsing Activities","summary":" The use of the un-indexed web, commonly known as the deep web and dark web,\nto commit or facilitate criminal activity has drastically increased over the\npast decade. The dark web is an infamously dangerous place where all kinds of\ncriminal activities take place [1-2]. Despite advances in web forensics\ntechniques, tools, and methodologies, few studies have formally tackled\ndark and deep web forensics and the technical differences in terms of\ninvestigative techniques and artefacts identification and extraction. This\nresearch proposes a novel and comprehensive protocol to guide and assist\ndigital forensics professionals in investigating crimes committed on or via the\ndeep and dark web. The protocol, named D2WFP, establishes a new sequential\napproach for performing investigative activities by observing the order of\nvolatility and implementing a systemic approach covering all browsing-related\nhives and artefacts, which ultimately resulted in improving the accuracy and\neffectiveness. Rigorous quantitative and qualitative research has been\nconducted by assessing D2WFP following a scientifically-sound and comprehensive\nprocess in different scenarios, and the obtained results show an apparent\nincrease in the number of artefacts recovered when adopting D2WFP, which\noutperforms any current industry or open-source browsing forensics tools. The\nsecond contribution of D2WFP is the robust formulation of artefact correlation\nand cross-validation within D2WFP, which enables digital forensics professionals\nto better document and structure their analysis of host-based deep and dark web\nbrowsing artefacts.\n","authors":["Mohamed Chahine Ghanem","Patrick Mulvihill","Karim Ouazzane","Ramzi Djemai","Dipo Dunsin"],"pdf_url":"https://arxiv.org/pdf/2309.05537v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.05521v1","updated":"2023-09-11T15:04:46Z","published":"2023-09-11T15:04:46Z","title":"Re-formalization of Individual Fairness","summary":" The notion of individual fairness is a formalization of an ethical principle,\n\"Treating like cases alike,\" which has been argued by thinkers such as Aristotle. In a\nfairness-aware machine learning context, Dwork et al. first formalized the\nnotion. In their formalization, a similar pair of data in an unfair space\nshould be mapped to similar positions in a fair space. We propose to\nre-formalize individual fairness by the statistical independence conditioned by\nindividuals. 
This re-formalization has the following merits. First, our\nformalization is compatible with that of Dwork et al. Second, our formalization\nenables to combine individual fairness with the fairness notion, equalized odds\nor sufficiency, as well as statistical parity. Third, though their\nformalization implicitly assumes a pre-process approach for making fair\nprediction, our formalization is applicable to an in-process or post-process\napproach.\n","authors":["Toshihiro Kamishima"],"pdf_url":"https://arxiv.org/pdf/2309.05521v1.pdf","comment":"Published at the 6th FAccTRec Workshop: Responsible Recommendation"},{"id":"http://arxiv.org/abs/2309.05438v1","updated":"2023-09-11T13:21:26Z","published":"2023-09-11T13:21:26Z","title":"Towards Content-based Pixel Retrieval in Revisited Oxford and Paris","summary":" This paper introduces the first two pixel retrieval benchmarks. Pixel\nretrieval is segmented instance retrieval. Like semantic segmentation extends\nclassification to the pixel level, pixel retrieval is an extension of image\nretrieval and offers information about which pixels are related to the query\nobject. In addition to retrieving images for the given query, it helps users\nquickly identify the query object in true positive images and exclude false\npositive images by denoting the correlated pixels. Our user study results show\npixel-level annotation can significantly improve the user experience.\n Compared with semantic and instance segmentation, pixel retrieval requires a\nfine-grained recognition capability for variable-granularity targets. To this\nend, we propose pixel retrieval benchmarks named PROxford and PRParis, which\nare based on the widely used image retrieval datasets, ROxford and RParis.\nThree professional annotators label 5,942 images with two rounds of\ndouble-checking and refinement. Furthermore, we conduct extensive experiments\nand analysis on the SOTA methods in image search, image matching, detection,\nsegmentation, and dense matching using our pixel retrieval benchmarks. Results\nshow that the pixel retrieval task is challenging to these approaches and\ndistinctive from existing problems, suggesting that further research can\nadvance the content-based pixel-retrieval and thus user search experience. The\ndatasets can be downloaded from\n\\href{https://github.com/anguoyuan/Pixel_retrieval-Segmented_instance_retrieval}{this\nlink}.\n","authors":["Guoyuan An","Woo Jae Kim","Saelyne Yang","Rong Li","Yuchi Huo","Sung-Eui Yoon"],"pdf_url":"https://arxiv.org/pdf/2309.05438v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.05273v1","updated":"2023-09-11T07:12:17Z","published":"2023-09-11T07:12:17Z","title":"Formalizing Multimedia Recommendation through Multimodal Deep Learning","summary":" Recommender systems (RSs) offer personalized navigation experiences on online\nplatforms, but recommendation remains a challenging task, particularly in\nspecific scenarios and domains. Multimodality can help tap into richer\ninformation sources and construct more refined user/item profiles for\nrecommendations. However, existing literature lacks a shared and universal\nschema for modeling and solving the recommendation problem through the lens of\nmultimodality. This work aims to formalize a general multimodal schema for\nmultimedia recommendation. 
It provides a comprehensive literature review of\nmultimodal approaches for multimedia recommendation from the last eight years,\noutlines the theoretical foundations of a multimodal pipeline, and demonstrates\nits rationale by applying it to selected state-of-the-art approaches. The work\nalso conducts a benchmarking analysis of recent algorithms for multimedia\nrecommendation within Elliot, a rigorous framework for evaluating recommender\nsystems. The main aim is to provide guidelines for designing and implementing\nthe next generation of multimodal approaches in multimedia recommendation.\n","authors":["Daniele Malitesta","Giandomenico Cornacchia","Claudio Pomo","Felice Antonio Merra","Tommaso Di Noia","Eugenio Di Sciascio"],"pdf_url":"https://arxiv.org/pdf/2309.05273v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.13032v2","updated":"2023-09-11T05:39:05Z","published":"2023-08-24T18:58:10Z","title":"Financial News Analytics Using Fine-Tuned Llama 2 GPT Model","summary":" The paper considers the possibility of fine-tuning the Llama 2 GPT large language\nmodel (LLM) for the multitask analysis of financial news. For fine-tuning, the\nPEFT/LoRA based approach was used. In the study, the model was fine-tuned for\nthe following tasks: analysing a text from financial market perspectives,\nhighlighting main points of a text, summarizing a text and extracting named\nentities with appropriate sentiments. The obtained results show that the\nfine-tuned Llama 2 model can perform a multitask financial news analysis with a\nspecified structure of response: part of the response can be a structured text and\nanother part of the data can have JSON format for further processing. Extracted\nsentiments for named entities can be considered as predictive features in\nsupervised machine learning models with quantitative target variables.\n","authors":["Bohdan M. Pavlyshenko"],"pdf_url":"https://arxiv.org/pdf/2308.13032v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.05238v1","updated":"2023-09-11T05:12:14Z","published":"2023-09-11T05:12:14Z","title":"Generating Natural Language Queries for More Effective Systematic Review\n Screening Prioritisation","summary":" Screening prioritisation in medical systematic reviews aims to rank the set\nof documents retrieved by complex Boolean queries. The goal is to prioritise\nthe most important documents so that subsequent review steps can be carried out\nmore efficiently and effectively. The current state of the art uses the final\ntitle of the review to rank documents using BERT-based neural rankers.\nHowever, the final title is only formulated at the end of the review process,\nwhich makes this approach impractical as it relies on ex post facto\ninformation. At the time of screening, only a rough working title is available,\nwith which the BERT-based ranker performs significantly worse than with the final\ntitle. In this paper, we explore alternative sources of queries for screening\nprioritisation, such as the Boolean query used to retrieve the set of documents\nto be screened, and queries generated by instruction-based generative large\nlanguage models such as ChatGPT and Alpaca. 
Our best approach is not only\npractical based on the information available at screening time, but is similar\nin effectiveness with the final title.\n","authors":["Shuai Wang","Harrisen Scells","Martin Potthast","Bevan Koopman","Guido Zuccon"],"pdf_url":"https://arxiv.org/pdf/2309.05238v1.pdf","comment":"Preprints for Accepted paper in SIGIR-AP-2023"},{"id":"http://arxiv.org/abs/2309.05871v1","updated":"2023-09-11T23:39:13Z","published":"2023-09-11T23:39:13Z","title":"Generalized Rainbow Differential Privacy","summary":" We study a new framework for designing differentially private (DP) mechanisms\nvia randomized graph colorings, called rainbow differential privacy. In this\nframework, datasets are nodes in a graph, and two neighboring datasets are\nconnected by an edge. Each dataset in the graph has a preferential ordering for\nthe possible outputs of the mechanism, and these orderings are called rainbows.\nDifferent rainbows partition the graph of connected datasets into different\nregions. We show that if a DP mechanism at the boundary of such regions is\nfixed and it behaves identically for all same-rainbow boundary datasets, then a\nunique optimal $(\\epsilon,\\delta)$-DP mechanism exists (as long as the boundary\ncondition is valid) and can be expressed in closed-form. Our proof technique is\nbased on an interesting relationship between dominance ordering and DP, which\napplies to any finite number of colors and for $(\\epsilon,\\delta)$-DP,\nimproving upon previous results that only apply to at most three colors and for\n$\\epsilon$-DP. We justify the homogeneous boundary condition assumption by\ngiving an example with non-homogeneous boundary condition, for which there\nexists no optimal DP mechanism.\n","authors":["Yuzhou Gu","Ziqi Zhou","Onur Günlü","Rafael G. L. D'Oliveira","Parastoo Sadeghi","Muriel Médard","Rafael F. Schaefer"],"pdf_url":"https://arxiv.org/pdf/2309.05871v1.pdf","comment":"arXiv admin note: text overlap with arXiv:2202.03974"},{"id":"http://arxiv.org/abs/2309.05786v1","updated":"2023-09-11T19:32:30Z","published":"2023-09-11T19:32:30Z","title":"Stringesthesia: Dynamically Shifting Musical Agency Between Audience and\n Performer Based on Trust in an Interactive and Improvised Performance","summary":" This paper introduces Stringesthesia, an interactive and improvised\nperformance paradigm. Stringesthesia uses real-time neuroimaging to connect\nperformers and audiences, enabling direct access to the performers mental state\nand determining audience participation during the performance. Functional\nnear-infrared spectroscopy, or fNIRS, a noninvasive neuroimaging tool, was used\nto assess metabolic activity of brain areas collectively associated with a\nmetric we call trust. A visualization representing the real-time measurement of\nthe performers level of trust was projected behind the performer and used to\ndynamically restrict or promote audience participation. 
Throughout the paper we\ndiscuss prior work that heavily influenced our design, conceptual and\nmethodological issues with using fNIRS technology, system architecture, and\nfeedback from the audience and performer.\n","authors":["Torin Hopkins","Emily Doherty","Netta Ofer","Suibi Che Chuan Weng","Peter Gyrory","Chad Tobin","Leanne Hirshfield","Ellen Yi-Luen Do"],"pdf_url":"https://arxiv.org/pdf/2309.05786v1.pdf","comment":null}],"Machine Learning":[{"id":"http://arxiv.org/abs/2309.05665v1","updated":"2023-09-11T17:59:17Z","published":"2023-09-11T17:59:17Z","title":"Robot Parkour Learning","summary":" Parkour is a grand challenge for legged locomotion that requires robots to\novercome various obstacles rapidly in complex environments. Existing methods\ncan generate either diverse but blind locomotion skills or vision-based but\nspecialized skills by using reference animal data or complex rewards. However,\nautonomous parkour requires robots to learn generalizable skills that are both\nvision-based and diverse to perceive and react to various scenarios. In this\nwork, we propose a system for learning a single end-to-end vision-based parkour\npolicy of diverse parkour skills using a simple reward without any reference\nmotion data. We develop a reinforcement learning method inspired by direct\ncollocation to generate parkour skills, including climbing over high obstacles,\nleaping over large gaps, crawling beneath low barriers, squeezing through thin\nslits, and running. We distill these skills into a single vision-based parkour\npolicy and transfer it to a quadrupedal robot using its egocentric depth\ncamera. We demonstrate that our system can empower two different low-cost\nrobots to autonomously select and execute appropriate parkour skills to\ntraverse challenging real-world environments.\n","authors":["Ziwen Zhuang","Zipeng Fu","Jianren Wang","Christopher Atkeson","Soeren Schwertfeger","Chelsea Finn","Hang Zhao"],"pdf_url":"https://arxiv.org/pdf/2309.05665v1.pdf","comment":"CoRL 2023 (Oral). Project website at https://robot-parkour.github.io"},{"id":"http://arxiv.org/abs/2309.05660v1","updated":"2023-09-11T17:56:57Z","published":"2023-09-11T17:56:57Z","title":"Hypothesis Search: Inductive Reasoning with Language Models","summary":" Inductive reasoning is a core problem-solving capacity: humans can identify\nunderlying principles from a few examples, which can then be robustly\ngeneralized to novel scenarios. Recent work has evaluated large language models\n(LLMs) on inductive reasoning tasks by directly prompting them yielding \"in\ncontext learning.\" This can work well for straightforward inductive tasks, but\nperforms very poorly on more complex tasks such as the Abstraction and\nReasoning Corpus (ARC). In this work, we propose to improve the inductive\nreasoning ability of LLMs by generating explicit hypotheses at multiple levels\nof abstraction: we prompt the LLM to propose multiple abstract hypotheses about\nthe problem, in natural language, then implement the natural language\nhypotheses as concrete Python programs. These programs can be directly verified\nby running on the observed examples and generalized to novel inputs. Because of\nthe prohibitive cost of generation with state-of-the-art LLMs, we consider a\nmiddle step to filter the set of hypotheses that will be implemented into\nprograms: we either ask the LLM to summarize into a smaller set of hypotheses,\nor ask human annotators to select a subset of the hypotheses. 
We verify our\npipeline's effectiveness on the ARC visual inductive reasoning benchmark, its\nvariant 1D-ARC, and string transformation dataset SyGuS. On a random 40-problem\nsubset of ARC, our automated pipeline using LLM summaries achieves 27.5%\naccuracy, significantly outperforming the direct prompting baseline (accuracy\nof 12.5%). With the minimal human input of selecting from LLM-generated\ncandidates, the performance is boosted to 37.5%. (And we argue this is a lower\nbound on the performance of our approach without filtering.) Our ablation\nstudies show that abstract hypothesis generation and concrete program\nrepresentations are both beneficial for LLMs to perform inductive reasoning\ntasks.\n","authors":["Ruocheng Wang","Eric Zelikman","Gabriel Poesia","Yewen Pu","Nick Haber","Noah D. Goodman"],"pdf_url":"https://arxiv.org/pdf/2309.05660v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2305.04940v2","updated":"2023-09-11T17:56:10Z","published":"2023-05-08T16:47:28Z","title":"The EarlyBIRD Catches the Bug: On Exploiting Early Layers of Encoder\n Models for More Efficient Code Classification","summary":" The use of modern Natural Language Processing (NLP) techniques has shown to\nbe beneficial for software engineering tasks, such as vulnerability detection\nand type inference. However, training deep NLP models requires significant\ncomputational resources. This paper explores techniques that aim at achieving\nthe best usage of resources and available information in these models.\n We propose a generic approach, EarlyBIRD, to build composite representations\nof code from the early layers of a pre-trained transformer model. We\nempirically investigate the viability of this approach on the CodeBERT model by\ncomparing the performance of 12 strategies for creating composite\nrepresentations with the standard practice of only using the last encoder\nlayer.\n Our evaluation on four datasets shows that several early layer combinations\nyield better performance on defect detection, and some combinations improve\nmulti-class classification. More specifically, we obtain a +2 average\nimprovement of detection accuracy on Devign with only 3 out of 12 layers of\nCodeBERT and a 3.3x speed-up of fine-tuning. These findings show that early\nlayers can be used to obtain better results using the same resources, as well\nas to reduce resource usage during fine-tuning and inference.\n","authors":["Anastasiia Grishina","Max Hort","Leon Moonen"],"pdf_url":"https://arxiv.org/pdf/2305.04940v2.pdf","comment":"The content in this pre-print is the same as in the CRC accepted for\n publication in the ACM Joint European Software Engineering Conference and\n Symposium on the Foundations of Software Engineering (ESEC/FSE 2023)"},{"id":"http://arxiv.org/abs/2306.12001v5","updated":"2023-09-11T17:53:43Z","published":"2023-06-21T03:35:06Z","title":"An Overview of Catastrophic AI Risks","summary":" Rapid advancements in artificial intelligence (AI) have sparked growing\nconcerns among experts, policymakers, and world leaders regarding the potential\nfor increasingly advanced AI systems to pose catastrophic risks. Although\nnumerous risks have been detailed separately, there is a pressing need for a\nsystematic discussion and illustration of the potential dangers to better\ninform efforts to mitigate them. 
This paper provides an overview of the main\nsources of catastrophic AI risks, which we organize into four categories:\nmalicious use, in which individuals or groups intentionally use AIs to cause\nharm; AI race, in which competitive environments compel actors to deploy unsafe\nAIs or cede control to AIs; organizational risks, highlighting how human\nfactors and complex systems can increase the chances of catastrophic accidents;\nand rogue AIs, describing the inherent difficulty in controlling agents far\nmore intelligent than humans. For each category of risk, we describe specific\nhazards, present illustrative stories, envision ideal scenarios, and propose\npractical suggestions for mitigating these dangers. Our goal is to foster a\ncomprehensive understanding of these risks and inspire collective and proactive\nefforts to ensure that AIs are developed and deployed in a safe manner.\nUltimately, we hope this will allow us to realize the benefits of this powerful\ntechnology while minimizing the potential for catastrophic outcomes.\n","authors":["Dan Hendrycks","Mantas Mazeika","Thomas Woodside"],"pdf_url":"https://arxiv.org/pdf/2306.12001v5.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.05657v1","updated":"2023-09-11T17:52:28Z","published":"2023-09-11T17:52:28Z","title":"On the quality of randomized approximations of Tukey's depth","summary":" Tukey's depth (or halfspace depth) is a widely used measure of centrality for\nmultivariate data. However, exact computation of Tukey's depth is known to be a\nhard problem in high dimensions. As a remedy, randomized approximations of\nTukey's depth have been proposed. In this paper we explore when such randomized\nalgorithms return a good approximation of Tukey's depth. We study the case when\nthe data are sampled from a log-concave isotropic distribution. We prove that,\nif one requires that the algorithm runs in polynomial time in the dimension,\nthe randomized algorithm correctly approximates the maximal depth $1/2$ and\ndepths close to zero. On the other hand, for any point of intermediate depth,\nany good approximation requires exponential complexity.\n","authors":["Simon Briend","Gábor Lugosi","Roberto Imbuzeiro Oliveira"],"pdf_url":"https://arxiv.org/pdf/2309.05657v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.05655v1","updated":"2023-09-11T17:49:25Z","published":"2023-09-11T17:49:25Z","title":"Dynamic Handover: Throw and Catch with Bimanual Hands","summary":" Humans throw and catch objects all the time. However, such a seemingly common\nskill introduces a lot of challenges for robots to achieve: The robots need to\noperate such dynamic actions at high-speed, collaborate precisely, and interact\nwith diverse objects. In this paper, we design a system with two multi-finger\nhands attached to robot arms to solve this problem. We train our system using\nMulti-Agent Reinforcement Learning in simulation and perform Sim2Real transfer\nto deploy on the real robots. To overcome the Sim2Real gap, we provide multiple\nnovel algorithm designs including learning a trajectory prediction model for\nthe object. Such a model can help the robot catcher has a real-time estimation\nof where the object will be heading, and then react accordingly. We conduct our\nexperiments with multiple objects in the real-world system, and show\nsignificant improvements over multiple baselines. 
Our project page is available\nat \\url{https://binghao-huang.github.io/dynamic_handover/}.\n","authors":["Binghao Huang","Yuanpei Chen","Tianyu Wang","Yuzhe Qin","Yaodong Yang","Nikolay Atanasov","Xiaolong Wang"],"pdf_url":"https://arxiv.org/pdf/2309.05655v1.pdf","comment":"Accepted at CoRL 2023.\n https://binghao-huang.github.io/dynamic_handover/"},{"id":"http://arxiv.org/abs/2309.05646v1","updated":"2023-09-11T17:37:35Z","published":"2023-09-11T17:37:35Z","title":"A Novel Supervised Deep Learning Solution to Detect Distributed Denial\n of Service (DDoS) attacks on Edge Systems using Convolutional Neural Networks\n (CNN)","summary":" Cybersecurity attacks are becoming increasingly sophisticated and pose a\ngrowing threat to individuals and the private and public sectors. Distributed\nDenial of Service attacks are one of the most harmful of these threats in\ntoday's internet, disrupting the availability of essential services. This\nproject presents a novel deep learning-based approach for detecting DDoS\nattacks in network traffic using the industry-recognized DDoS evaluation\ndataset from the University of New Brunswick, which contains packet captures\nfrom real-time DDoS attacks, creating a broader and more applicable model for\nthe real world. The algorithm employed in this study exploits the properties of\nConvolutional Neural Networks (CNN) and common deep learning algorithms to\nbuild a novel mitigation technique that classifies benign and malicious\ntraffic. The proposed model preprocesses the data by extracting packet flows\nand normalizing them to a fixed length, which is fed into a custom architecture\ncontaining layers regulating node dropout, normalization, and a sigmoid\nactivation function to output a binary classification. This allows the model\nto process the flows effectively and look for the nodes that contribute to DDoS\nattacks while dropping the \"noise\" or the distractors. The results of this\nstudy demonstrate the effectiveness of the proposed algorithm in detecting DDoS\nattacks, achieving an accuracy of 0.9883 on 2000 unseen flows in network\ntraffic, while being scalable for any network environment.\n","authors":["Vedanth Ramanathan","Krish Mahadevan","Sejal Dua"],"pdf_url":"https://arxiv.org/pdf/2309.05646v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.15217v2","updated":"2023-09-11T17:25:24Z","published":"2023-07-27T22:29:25Z","title":"Open Problems and Fundamental Limitations of Reinforcement Learning from\n Human Feedback","summary":" Reinforcement learning from human feedback (RLHF) is a technique for training\nAI systems to align with human goals. RLHF has emerged as the central method\nused to finetune state-of-the-art large language models (LLMs). Despite this\npopularity, there has been relatively little public work systematizing its\nflaws. In this paper, we (1) survey open problems and fundamental limitations\nof RLHF and related methods; (2) overview techniques to understand, improve,\nand complement RLHF in practice; and (3) propose auditing and disclosure\nstandards to improve societal oversight of RLHF systems. 
Our work emphasizes\nthe limitations of RLHF and highlights the importance of a multi-faceted\napproach to the development of safer AI systems.\n","authors":["Stephen Casper","Xander Davies","Claudia Shi","Thomas Krendl Gilbert","Jérémy Scheurer","Javier Rando","Rachel Freedman","Tomasz Korbak","David Lindner","Pedro Freire","Tony Wang","Samuel Marks","Charbel-Raphaël Segerie","Micah Carroll","Andi Peng","Phillip Christoffersen","Mehul Damani","Stewart Slocum","Usman Anwar","Anand Siththaranjan","Max Nadeau","Eric J. Michaud","Jacob Pfau","Dmitrii Krasheninnikov","Xin Chen","Lauro Langosco","Peter Hase","Erdem Bıyık","Anca Dragan","David Krueger","Dorsa Sadigh","Dylan Hadfield-Menell"],"pdf_url":"https://arxiv.org/pdf/2307.15217v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.05630v1","updated":"2023-09-11T17:19:07Z","published":"2023-09-11T17:19:07Z","title":"Boundary Peeling: Outlier Detection Method Using One-Class Peeling","summary":" Unsupervised outlier detection constitutes a crucial phase within data\nanalysis and remains a dynamic realm of research. A good outlier detection\nalgorithm should be computationally efficient, robust to tuning parameter\nselection, and perform consistently well across diverse underlying data\ndistributions. We introduce One-Class Boundary Peeling, an unsupervised outlier\ndetection algorithm. One-class Boundary Peeling uses the average signed\ndistance from iteratively-peeled, flexible boundaries generated by one-class\nsupport vector machines. One-class Boundary Peeling has robust hyperparameter\nsettings and, for increased flexibility, can be cast as an ensemble method. In\nsynthetic data simulations One-Class Boundary Peeling outperforms all state of\nthe art methods when no outliers are present while maintaining comparable or\nsuperior performance in the presence of outliers, as compared to benchmark\nmethods. One-Class Boundary Peeling performs competitively in terms of correct\nclassification, AUC, and processing time using common benchmark data sets.\n","authors":["Sheikh Arafat","Na Sun","Maria L. Weese","Waldyn G. Martinez"],"pdf_url":"https://arxiv.org/pdf/2309.05630v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2212.05102v3","updated":"2023-09-11T17:09:03Z","published":"2022-12-09T20:03:59Z","title":"A soft nearest-neighbor framework for continual semi-supervised learning","summary":" Despite significant advances, the performance of state-of-the-art continual\nlearning approaches hinges on the unrealistic scenario of fully labeled data.\nIn this paper, we tackle this challenge and propose an approach for continual\nsemi-supervised learning--a setting where not all the data samples are labeled.\nA primary issue in this scenario is the model forgetting representations of\nunlabeled data and overfitting the labeled samples. We leverage the power of\nnearest-neighbor classifiers to nonlinearly partition the feature space and\nflexibly model the underlying data distribution thanks to its non-parametric\nnature. This enables the model to learn a strong representation for the current\ntask, and distill relevant information from previous tasks. We perform a\nthorough experimental evaluation and show that our method outperforms all the\nexisting approaches by large margins, setting a solid state of the art on the\ncontinual semi-supervised learning paradigm. For example, on CIFAR-100 we\nsurpass several others even when using at least 30 times less supervision (0.8%\nvs. 25% of annotations). 
Finally, our method works well on both low and high\nresolution images and scales seamlessly to more complex datasets such as\nImageNet-100. The code is publicly available on\nhttps://github.com/kangzhiq/NNCSL\n","authors":["Zhiqi Kang","Enrico Fini","Moin Nabi","Elisa Ricci","Karteek Alahari"],"pdf_url":"https://arxiv.org/pdf/2212.05102v3.pdf","comment":"Accepted at ICCV 2023"},{"id":"http://arxiv.org/abs/2307.12586v2","updated":"2023-09-11T17:08:05Z","published":"2023-07-24T07:58:18Z","title":"InVAErt networks: a data-driven framework for model synthesis and\n identifiability analysis","summary":" Use of generative models and deep learning for physics-based systems is\ncurrently dominated by the task of emulation. However, the remarkable\nflexibility offered by data-driven architectures would suggest to extend this\nrepresentation to other aspects of system synthesis including model inversion\nand identifiability. We introduce inVAErt (pronounced \"invert\") networks, a\ncomprehensive framework for data-driven analysis and synthesis of parametric\nphysical systems which uses a deterministic encoder and decoder to represent\nthe forward and inverse solution maps, a normalizing flow to capture the\nprobabilistic distribution of system outputs, and a variational encoder\ndesigned to learn a compact latent representation for the lack of bijectivity\nbetween inputs and outputs. We formally investigate the selection of penalty\ncoefficients in the loss function and strategies for latent space sampling,\nsince we find that these significantly affect both training and testing\nperformance. We validate our framework through extensive numerical examples,\nincluding simple linear, nonlinear, and periodic maps, dynamical systems, and\nspatio-temporal PDEs.\n","authors":["Guoxiang Grayson Tong","Carlos A. Sing Long","Daniele E. Schiavazzi"],"pdf_url":"https://arxiv.org/pdf/2307.12586v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2212.01833v2","updated":"2023-09-11T17:02:33Z","published":"2022-12-04T14:50:22Z","title":"Understanding Sinusoidal Neural Networks","summary":" In this work, we investigate the structure and representation capacity of\nsinusoidal MLPs - multilayer perceptron networks that use sine as the\nactivation function. These neural networks (known as neural fields) have become\nfundamental in representing common signals in computer graphics, such as\nimages, signed distance functions, and radiance fields. This success can be\nprimarily attributed to two key properties of sinusoidal MLPs: smoothness and\ncompactness. These functions are smooth because they arise from the composition\nof affine maps with the sine function. This work provides theoretical results\nto justify the compactness property of sinusoidal MLPs and provides control\nmechanisms in the definition and training of these networks.\n We propose to study a sinusoidal MLP by expanding it as a harmonic sum.\nFirst, we observe that its first layer can be seen as a harmonic dictionary,\nwhich we call the input sinusoidal neurons. Then, a hidden layer combines this\ndictionary using an affine map and modulates the outputs using the sine, this\nresults in a special dictionary of sinusoidal neurons. We prove that each of\nthese sinusoidal neurons expands as a harmonic sum producing a large number of\nnew frequencies expressed as integer linear combinations of the input\nfrequencies. Thus, each hidden neuron produces the same frequencies, and the\ncorresponding amplitudes are completely determined by the hidden affine map. 
We\nalso provide an upper bound and a way of sorting these amplitudes that can\ncontrol the resulting approximation, allowing us to truncate the corresponding\nseries. Finally, we present applications for training and initialization of\nsinusoidal MLPs. Additionally, we show that if the input neurons are periodic,\nthen the entire network will be periodic with the same period. We relate these\nperiodic networks with the Fourier series representation.\n","authors":["Tiago Novello"],"pdf_url":"https://arxiv.org/pdf/2212.01833v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2210.17546v3","updated":"2023-09-11T16:58:48Z","published":"2022-10-31T17:57:55Z","title":"Preventing Verbatim Memorization in Language Models Gives a False Sense\n of Privacy","summary":" Studying data memorization in neural language models helps us understand the\nrisks (e.g., to privacy or copyright) associated with models regurgitating\ntraining data and aids in the development of countermeasures. Many prior works\n-- and some recently deployed defenses -- focus on \"verbatim memorization\",\ndefined as a model generation that exactly matches a substring from the\ntraining set. We argue that verbatim memorization definitions are too\nrestrictive and fail to capture more subtle forms of memorization.\nSpecifically, we design and implement an efficient defense that perfectly\nprevents all verbatim memorization. And yet, we demonstrate that this \"perfect\"\nfilter does not prevent the leakage of training data. Indeed, it is easily\ncircumvented by plausible and minimally modified \"style-transfer\" prompts --\nand in some cases even the non-modified original prompts -- to extract\nmemorized information. We conclude by discussing potential alternative\ndefinitions and why defining memorization is a difficult yet crucial open\nquestion for neural language models.\n","authors":["Daphne Ippolito","Florian Tramèr","Milad Nasr","Chiyuan Zhang","Matthew Jagielski","Katherine Lee","Christopher A. Choquette-Choo","Nicholas Carlini"],"pdf_url":"https://arxiv.org/pdf/2210.17546v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.05610v1","updated":"2023-09-11T16:49:05Z","published":"2023-09-11T16:49:05Z","title":"Privacy Side Channels in Machine Learning Systems","summary":" Most current approaches for protecting privacy in machine learning (ML)\nassume that models exist in a vacuum, when in reality, ML models are part of\nlarger systems that include components for training data filtering, output\nmonitoring, and more. In this work, we introduce privacy side channels: attacks\nthat exploit these system-level components to extract private information at\nfar higher rates than is otherwise possible for standalone models. We propose\nfour categories of side channels that span the entire ML lifecycle (training\ndata filtering, input preprocessing, output post-processing, and query\nfiltering) and allow for either enhanced membership inference attacks or even\nnovel threats such as extracting users' test queries. For example, we show that\ndeduplicating training data before applying differentially-private training\ncreates a side-channel that completely invalidates any provable privacy\nguarantees. Moreover, we show that systems which block language models from\nregenerating training data can be exploited to allow exact reconstruction of\nprivate keys contained in the training set -- even if the model did not\nmemorize these keys. 
Taken together, our results demonstrate the need for a\nholistic, end-to-end privacy analysis of machine learning.\n","authors":["Edoardo Debenedetti","Giorgio Severi","Nicholas Carlini","Christopher A. Choquette-Choo","Matthew Jagielski","Milad Nasr","Eric Wallace","Florian Tramèr"],"pdf_url":"https://arxiv.org/pdf/2309.05610v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.05605v1","updated":"2023-09-11T16:39:30Z","published":"2023-09-11T16:39:30Z","title":"Memory Injections: Correcting Multi-Hop Reasoning Failures during\n Inference in Transformer-Based Language Models","summary":" Answering multi-hop reasoning questions requires retrieving and synthesizing\ninformation from diverse sources. Large Language Models (LLMs) struggle to\nperform such reasoning consistently. Here we propose an approach to pinpoint\nand rectify multi-hop reasoning failures through targeted memory injections on\nLLM attention heads. First, we analyze the per-layer activations of GPT-2\nmodels in response to single and multi-hop prompts. We then propose a mechanism\nthat allows users to inject pertinent prompt-specific information, which we\nrefer to as \"memories,\" at critical LLM locations during inference. By thus\nenabling the LLM to incorporate additional relevant information during\ninference, we enhance the quality of multi-hop prompt completions. We show\nempirically that a simple, efficient, and targeted memory injection into a key\nattention layer can often increase the probability of the desired next token in\nmulti-hop tasks, by up to 424%.\n","authors":["Mansi Sakarvadia","Aswathy Ajith","Arham Khan","Daniel Grzenda","Nathaniel Hudson","André Bauer","Kyle Chard","Ian Foster"],"pdf_url":"https://arxiv.org/pdf/2309.05605v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2110.03605v7","updated":"2023-09-11T16:31:55Z","published":"2021-10-07T16:33:11Z","title":"Robust Feature-Level Adversaries are Interpretability Tools","summary":" The literature on adversarial attacks in computer vision typically focuses on\npixel-level perturbations. These tend to be very difficult to interpret. Recent\nwork that manipulates the latent representations of image generators to create\n\"feature-level\" adversarial perturbations gives us an opportunity to explore\nperceptible, interpretable adversarial attacks. We make three contributions.\nFirst, we observe that feature-level attacks provide useful classes of inputs\nfor studying representations in models. Second, we show that these adversaries\nare uniquely versatile and highly robust. We demonstrate that they can be used\nto produce targeted, universal, disguised, physically-realizable, and black-box\nattacks at the ImageNet scale. Third, we show how these adversarial images can\nbe used as a practical interpretability tool for identifying bugs in networks.\nWe use these adversaries to make predictions about spurious associations\nbetween features and classes which we then test by designing \"copy/paste\"\nattacks in which one natural image is pasted into another to cause a targeted\nmisclassification. Our results suggest that feature-level attacks are a\npromising approach for rigorous interpretability research. They support the\ndesign of tools to better understand what a model has learned and diagnose\nbrittle feature associations. 
Code is available at\nhttps://github.com/thestephencasper/feature_level_adv\n","authors":["Stephen Casper","Max Nadeau","Dylan Hadfield-Menell","Gabriel Kreiman"],"pdf_url":"https://arxiv.org/pdf/2110.03605v7.pdf","comment":"NeurIPS 2022, code available at\n https://github.com/thestephencasper/feature_level_adv"},{"id":"http://arxiv.org/abs/2309.05589v1","updated":"2023-09-11T16:17:24Z","published":"2023-09-11T16:17:24Z","title":"Quantitative Analysis of Forecasting Models:In the Aspect of Online\n Political Bias","summary":" Understanding and mitigating political bias in online social media platforms\nare crucial tasks to combat misinformation and echo chamber effects. However,\ncharacterizing political bias temporally using computational methods presents\nchallenges due to the high frequency of noise in social media datasets. While\nexisting research has explored various approaches to political bias\ncharacterization, the ability to forecast political bias and anticipate how\npolitical conversations might evolve in the near future has not been\nextensively studied. In this paper, we propose a heuristic approach to classify\nsocial media posts into five distinct political leaning categories. Since there\nis a lack of prior work on forecasting political bias, we conduct an in-depth\nanalysis of existing baseline models to identify which model best fits to\nforecast political leaning time series. Our approach involves utilizing\nexisting time series forecasting models on two social media datasets with\ndifferent political ideologies, specifically Twitter and Gab. Through our\nexperiments and analyses, we seek to shed light on the challenges and\nopportunities in forecasting political bias in social media platforms.\nUltimately, our work aims to pave the way for developing more effective\nstrategies to mitigate the negative impact of political bias in the digital\nrealm.\n","authors":["Srinath Sai Tripuraneni","Sadia Kamal","Arunkumar Bagavathi"],"pdf_url":"https://arxiv.org/pdf/2309.05589v1.pdf","comment":"This is a preprint version of a paper that is accepted to be\n presented as a poster at the ICMLA conference on December 15-17 2023"},{"id":"http://arxiv.org/abs/2309.05582v1","updated":"2023-09-11T16:10:58Z","published":"2023-09-11T16:10:58Z","title":"Mind the Uncertainty: Risk-Aware and Actively Exploring Model-Based\n Reinforcement Learning","summary":" We introduce a simple but effective method for managing risk in model-based\nreinforcement learning with trajectory sampling that involves probabilistic\nsafety constraints and balancing of optimism in the face of epistemic\nuncertainty and pessimism in the face of aleatoric uncertainty of an ensemble\nof stochastic neural networks.Various experiments indicate that the separation\nof uncertainties is essential to performing well with data-driven MPC\napproaches in uncertain and safety-critical control environments.\n","authors":["Marin Vlastelica","Sebastian Blaes","Cristina Pineri","Georg Martius"],"pdf_url":"https://arxiv.org/pdf/2309.05582v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.05775v2","updated":"2023-09-11T16:06:50Z","published":"2023-07-11T20:06:12Z","title":"Weisfeiler and Lehman Go Measurement Modeling: Probing the Validity of\n the WL Test","summary":" The expressive power of graph neural networks is usually measured by\ncomparing how many pairs of graphs or nodes an architecture can possibly\ndistinguish as non-isomorphic to those distinguishable by the $k$-dimensional\nWeisfeiler-Lehman ($k$-WL) test. 
In this paper, we uncover misalignments\nbetween graph machine learning practitioners' conceptualizations of expressive\npower and $k$-WL through a systematic analysis of the reliability and validity\nof $k$-WL. We conduct a survey ($n = 18$) of practitioners to surface their\nconceptualizations of expressive power and their assumptions about $k$-WL. In\ncontrast to practitioners' opinions, our analysis (which draws from graph\ntheory and benchmark auditing) reveals that $k$-WL does not guarantee isometry,\ncan be irrelevant to real-world graph tasks, and may not promote generalization\nor trustworthiness. We argue for extensional definitions and measurement of\nexpressive power based on benchmarks. We further contribute guiding questions\nfor constructing such benchmarks, which is critical for graph machine learning\npractitioners to develop and transparently communicate our understandings of\nexpressive power.\n","authors":["Arjun Subramonian","Adina Williams","Maximilian Nickel","Yizhou Sun","Levent Sagun"],"pdf_url":"https://arxiv.org/pdf/2307.05775v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.05575v1","updated":"2023-09-11T16:03:00Z","published":"2023-09-11T16:03:00Z","title":"Anisotropic Diffusion Stencils: From Simple Derivations over Stability\n Estimates to ResNet Implementations","summary":" Anisotropic diffusion processes with a diffusion tensor are important in\nimage analysis, physics, and engineering. However, their numerical\napproximation has a strong impact on dissipative artefacts and deviations from\nrotation invariance. In this work, we study a large family of finite difference\ndiscretisations on a 3 x 3 stencil. We derive it by splitting 2-D anisotropic\ndiffusion into four 1-D diffusions. The resulting stencil class involves one\nfree parameter and covers a wide range of existing discretisations. It\ncomprises the full stencil family of Weickert et al. (2013) and shows that\ntheir two parameters contain redundancy. Furthermore, we establish a bound on\nthe spectral norm of the matrix corresponding to the stencil. This gives time\nstep size limits that guarantee stability of an explicit scheme in the\nEuclidean norm. Our directional splitting also allows a very natural\ntranslation of the explicit scheme into ResNet blocks. Employing neural network\nlibraries enables simple and highly efficient parallel implementations on GPUs.\n","authors":["Karl Schrader","Joachim Weickert","Michael Krause"],"pdf_url":"https://arxiv.org/pdf/2309.05575v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.05569v1","updated":"2023-09-11T15:54:30Z","published":"2023-09-11T15:54:30Z","title":"ITI-GEN: Inclusive Text-to-Image Generation","summary":" Text-to-image generative models often reflect the biases of the training\ndata, leading to unequal representations of underrepresented groups. This study\ninvestigates inclusive text-to-image generative models that generate images\nbased on human-written prompts and ensure the resulting images are uniformly\ndistributed across attributes of interest. Unfortunately, directly expressing\nthe desired attributes in the prompt often leads to sub-optimal results due to\nlinguistic ambiguity or model misrepresentation. Hence, this paper proposes a\ndrastically different approach that adheres to the maxim that \"a picture is\nworth a thousand words\". We show that, for some attributes, images can\nrepresent concepts more expressively than text. 
For instance, categories of\nskin tones are typically hard to specify by text but can be easily represented\nby example images. Building upon these insights, we propose a novel approach,\nITI-GEN, that leverages readily available reference images for Inclusive\nText-to-Image GENeration. The key idea is learning a set of prompt embeddings\nto generate images that can effectively represent all desired attribute\ncategories. More importantly, ITI-GEN requires no model fine-tuning, making it\ncomputationally efficient to augment existing text-to-image models. Extensive\nexperiments demonstrate that ITI-GEN largely improves over state-of-the-art\nmodels to generate inclusive images from a prompt. Project page:\nhttps://czhang0528.github.io/iti-gen.\n","authors":["Cheng Zhang","Xuanbai Chen","Siqi Chai","Chen Henry Wu","Dmitry Lagun","Thabo Beeler","Fernando De la Torre"],"pdf_url":"https://arxiv.org/pdf/2309.05569v1.pdf","comment":"Accepted to ICCV 2023 (Oral Presentation)"},{"id":"http://arxiv.org/abs/2103.07155v2","updated":"2023-09-11T15:49:30Z","published":"2021-03-12T09:03:51Z","title":"Explainable AI by BAPC -- Before and After correction Parameter\n Comparison","summary":" A local surrogate for an AI-model correcting a simpler 'base' model is\nintroduced representing an analytical method to yield explanations of\nAI-predictions. The approach is studied here in the context of the base model\nbeing linear regression. The AI-model approximates the residual error of the\nlinear model and the explanations are formulated in terms of the change of the\ninterpretable base model's parameters. Criteria are formulated for the precise\nrelation between lost accuracy of the surrogate, the accuracy of the AI-model,\nand the surrogate fidelity. It is shown that, assuming a certain maximal amount\nof noise in the observed data, these criteria induce neighborhoods of the\ninstances to be explained which have an ideal size in terms of maximal accuracy\nand fidelity.\n","authors":["Florian Sobieczky","Manuela Geiß"],"pdf_url":"https://arxiv.org/pdf/2103.07155v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.07896v3","updated":"2023-09-11T15:39:42Z","published":"2023-08-15T17:37:44Z","title":"SciRE-Solver: Accelerating Diffusion Models Sampling by Score-integrand\n Solver with Recursive Difference","summary":" Diffusion models (DMs) have made significant progress in the fields of image,\naudio, and video generation. One downside of DMs is their slow iterative\nprocess. Recent algorithms for fast sampling are designed from the perspective\nof differential equations. However, in higher-order algorithms based on Taylor\nexpansion, estimating the derivative of the score function becomes intractable\ndue to the complexity of large-scale, well-trained neural networks. Driven by\nthis motivation, in this work, we introduce the recursive difference (RD)\nmethod to calculate the derivative of the score function in the realm of DMs.\nBased on the RD method and the truncated Taylor expansion of score-integrand,\nwe propose SciRE-Solver with the convergence order guarantee for accelerating\nsampling of DMs. To further investigate the effectiveness of the RD method, we\nalso propose a variant named SciREI-Solver based on the RD method and\nexponential integrator. 
Our proposed sampling algorithms with RD method attain\nstate-of-the-art (SOTA) FIDs in comparison to existing training-free sampling\nalgorithms, across both discrete-time and continuous-time pre-trained DMs,\nunder various number of score function evaluations (NFE). Remarkably,\nSciRE-Solver using a small NFEs demonstrates promising potential to surpass the\nFID achieved by some pre-trained models in their original papers using no fewer\nthan $1000$ NFEs. For example, we reach SOTA value of $2.40$ FID with $100$ NFE\nfor continuous-time DM and of $3.15$ FID with $84$ NFE for discrete-time DM on\nCIFAR-10, as well as of $2.17$ (2.02) FID with $18$ (50) NFE for discrete-time\nDM on CelebA 64$\\times$64.\n","authors":["Shigui Li","Wei Chen","Delu Zeng"],"pdf_url":"https://arxiv.org/pdf/2308.07896v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2211.08796v3","updated":"2023-09-11T15:34:43Z","published":"2022-11-16T09:48:14Z","title":"Model Based Residual Policy Learning with Applications to Antenna\n Control","summary":" Non-differentiable controllers and rule-based policies are widely used for\ncontrolling real systems such as telecommunication networks and robots.\nSpecifically, parameters of mobile network base station antennas can be\ndynamically configured by these policies to improve users coverage and quality\nof service. Motivated by the antenna tilt control problem, we introduce\nModel-Based Residual Policy Learning (MBRPL), a practical reinforcement\nlearning (RL) method. MBRPL enhances existing policies through a model-based\napproach, leading to improved sample efficiency and a decreased number of\ninteractions with the actual environment when compared to off-the-shelf RL\nmethods.To the best of our knowledge, this is the first paper that examines a\nmodel-based approach for antenna control. Experimental results reveal that our\nmethod delivers strong initial performance while improving sample efficiency\nover previous RL methods, which is one step towards deploying these algorithms\nin real networks.\n","authors":["Viktor Eriksson Möllerstedt","Alessio Russo","Maxime Bouton"],"pdf_url":"https://arxiv.org/pdf/2211.08796v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.05548v1","updated":"2023-09-11T15:33:00Z","published":"2023-09-11T15:33:00Z","title":"Distance-Aware eXplanation Based Learning","summary":" eXplanation Based Learning (XBL) is an interactive learning approach that\nprovides a transparent method of training deep learning models by interacting\nwith their explanations. XBL augments loss functions to penalize a model based\non deviation of its explanations from user annotation of image features. The\nliterature on XBL mostly depends on the intersection of visual model\nexplanations and image feature annotations. We present a method to add a\ndistance-aware explanation loss to categorical losses that trains a learner to\nfocus on important regions of a training dataset. Distance is an appropriate\napproach for calculating explanation loss since visual model explanations such\nas Gradient-weighted Class Activation Mapping (Grad-CAMs) are not strictly\nbounded as annotations and their intersections may not provide complete\ninformation on the deviation of a model's focus from relevant image regions. In\naddition to assessing our model using existing metrics, we propose an\ninterpretability metric for evaluating visual feature-attribution based model\nexplanations that is more informative of the model's performance than existing\nmetrics. 
We demonstrate performance of our proposed method on three image\nclassification tasks.\n","authors":["Misgina Tsighe Hagos","Niamh Belton","Kathleen M. Curran","Brian Mac Namee"],"pdf_url":"https://arxiv.org/pdf/2309.05548v1.pdf","comment":"Accepted at the 35th IEEE International Conference on Tools with\n Artificial Intelligence, ICTAI 2023"},{"id":"http://arxiv.org/abs/2301.00776v2","updated":"2023-09-11T15:30:41Z","published":"2023-01-02T17:51:23Z","title":"Physics-Informed Neural Networks for Prognostics and Health Management\n of Lithium-Ion Batteries","summary":" For Prognostics and Health Management (PHM) of Lithium-ion (Li-ion)\nbatteries, many models have been established to characterize their degradation\nprocess. The existing empirical or physical models can reveal important\ninformation regarding the degradation dynamics. However, there are no general\nand flexible methods to fuse the information represented by those models.\nPhysics-Informed Neural Network (PINN) is an efficient tool to fuse empirical\nor physical dynamic models with data-driven models. To take full advantage of\nvarious information sources, we propose a model fusion scheme based on PINN. It\nis implemented by developing a semi-empirical semi-physical Partial\nDifferential Equation (PDE) to model the degradation dynamics of Li-ion\nbatteries. When there is little prior knowledge about the dynamics, we leverage\nthe data-driven Deep Hidden Physics Model (DeepHPM) to discover the underlying\ngoverning dynamic models. The uncovered dynamics information is then fused with\nthat mined by the surrogate neural network in the PINN framework. Moreover, an\nuncertainty-based adaptive weighting method is employed to balance the multiple\nlearning tasks when training the PINN. The proposed methods are verified on a\npublic dataset of Li-ion Phosphate (LFP)/graphite batteries.\n","authors":["Pengfei Wen","Zhi-Sheng Ye","Yong Li","Shaowei Chen","Pu Xie","Shuai Zhao"],"pdf_url":"https://arxiv.org/pdf/2301.00776v2.pdf","comment":"14 pages, 10 figures"},{"id":"http://arxiv.org/abs/2309.05525v1","updated":"2023-09-11T15:10:41Z","published":"2023-09-11T15:10:41Z","title":"Advancing Federated Learning in 6G: A Trusted Architecture with\n Graph-based Analysis","summary":" Integrating native AI support into the network architecture is an essential\nobjective of 6G. Federated Learning (FL) emerges as a potential paradigm,\nfacilitating decentralized AI model training across a diverse range of devices\nunder the coordination of a central server. However, several challenges hinder\nits wide application in the 6G context, such as malicious attacks and privacy\nsnooping on local model updates, and centralization pitfalls. This work\nproposes a trusted architecture for supporting FL, which utilizes Distributed\nLedger Technology (DLT) and Graph Neural Network (GNN), including three key\nfeatures. First, a pre-processing layer employing homomorphic encryption is\nincorporated to securely aggregate local models, preserving the privacy of\nindividual models. Second, given the distributed nature and graph structure\nbetween clients and nodes in the pre-processing layer, GNN is leveraged to\nidentify abnormal local models, enhancing system security. Third, DLT is\nutilized to decentralize the system by selecting one of the candidates to\nperform the central server's functions. 
Additionally, DLT ensures reliable data\nmanagement by recording data exchanges in an immutable and transparent ledger.\nThe feasibility of the novel architecture is validated through simulations,\ndemonstrating improved performance in anomalous model detection and global\nmodel accuracy compared to relevant baselines.\n","authors":["Wenxuan Ye","Chendi Qian","Xueli An","Xueqiang Yan","Georg Carle"],"pdf_url":"https://arxiv.org/pdf/2309.05525v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.05521v1","updated":"2023-09-11T15:04:46Z","published":"2023-09-11T15:04:46Z","title":"Re-formalization of Individual Fairness","summary":" The notion of individual fairness is a formalization of an ethical principle,\n\"Treating like cases alike,\" which has been argued such as by Aristotle. In a\nfairness-aware machine learning context, Dwork et al. firstly formalized the\nnotion. In their formalization, a similar pair of data in an unfair space\nshould be mapped to similar positions in a fair space. We propose to\nre-formalize individual fairness by the statistical independence conditioned by\nindividuals. This re-formalization has the following merits. First, our\nformalization is compatible with that of Dwork et al. Second, our formalization\nenables to combine individual fairness with the fairness notion, equalized odds\nor sufficiency, as well as statistical parity. Third, though their\nformalization implicitly assumes a pre-process approach for making fair\nprediction, our formalization is applicable to an in-process or post-process\napproach.\n","authors":["Toshihiro Kamishima"],"pdf_url":"https://arxiv.org/pdf/2309.05521v1.pdf","comment":"Published at the 6th FAccTRec Workshop: Responsible Recommendation"},{"id":"http://arxiv.org/abs/2309.05519v1","updated":"2023-09-11T15:02:25Z","published":"2023-09-11T15:02:25Z","title":"NExT-GPT: Any-to-Any Multimodal LLM","summary":" While recently Multimodal Large Language Models (MM-LLMs) have made exciting\nstrides, they mostly fall prey to the limitation of only input-side multimodal\nunderstanding, without the ability to produce content in multiple modalities.\nAs we humans always perceive the world and communicate with people through\nvarious modalities, developing any-to-any MM-LLMs capable of accepting and\ndelivering content in any modality becomes essential to human-level AI. To fill\nthe gap, we present an end-to-end general-purpose any-to-any MM-LLM system,\nNExT-GPT. We connect an LLM with multimodal adaptors and different diffusion\ndecoders, enabling NExT-GPT to perceive inputs and generate outputs in\narbitrary combinations of text, images, videos, and audio. By leveraging the\nexisting well-trained highly-performing encoders and decoders, NExT-GPT is\ntuned with only a small amount of parameter (1%) of certain projection layers,\nwhich not only benefits low-cost training and also facilitates convenient\nexpansion to more potential modalities. Moreover, we introduce a\nmodality-switching instruction tuning (MosIT) and manually curate a\nhigh-quality dataset for MosIT, based on which NExT-GPT is empowered with\ncomplex cross-modal semantic understanding and content generation. 
Overall, our\nresearch showcases the promising possibility of building an AI agent capable of\nmodeling universal modalities, paving the way for more human-like AI research\nin the community.\n","authors":["Shengqiong Wu","Hao Fei","Leigang Qu","Wei Ji","Tat-Seng Chua"],"pdf_url":"https://arxiv.org/pdf/2309.05519v1.pdf","comment":"work in progress"},{"id":"http://arxiv.org/abs/2309.05517v1","updated":"2023-09-11T15:00:01Z","published":"2023-09-11T15:00:01Z","title":"Stream-based Active Learning by Exploiting Temporal Properties in\n Perception with Temporal Predicted Loss","summary":" Active learning (AL) reduces the amount of labeled data needed to train a\nmachine learning model by intelligently choosing which instances to label.\nClassic pool-based AL requires all data to be present in a datacenter, which\ncan be challenging with the increasing amounts of data needed in deep learning.\nHowever, AL on mobile devices and robots, like autonomous cars, can filter the\ndata from perception sensor streams before reaching the datacenter. We\nexploited the temporal properties for such image streams in our work and\nproposed the novel temporal predicted loss (TPL) method. To evaluate the\nstream-based setting properly, we introduced the GTA V streets and the A2D2\nstreets dataset and made both publicly available. Our experiments showed that\nour approach significantly improves the diversity of the selection while being\nan uncertainty-based method. As pool-based approaches are more common in\nperception applications, we derived a concept for comparing pool-based and\nstream-based AL, where TPL outperformed state-of-the-art pool- or stream-based\napproaches for different models. TPL demonstrated a gain of 2.5 percentage points\n(pp) less required data while being significantly faster than pool-based\nmethods.\n","authors":["Sebastian Schmidt","Stephan Günnemann"],"pdf_url":"https://arxiv.org/pdf/2309.05517v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.05516v1","updated":"2023-09-11T14:58:23Z","published":"2023-09-11T14:58:23Z","title":"Optimize Weight Rounding via Signed Gradient Descent for the\n Quantization of LLMs","summary":" Large Language Models (LLMs) have proven their exceptional capabilities in\nperforming language-related tasks. However, their deployment poses significant\nchallenges due to their considerable memory and storage requirements. In\nresponse to this issue, weight-only quantization, particularly 3 and 4-bit\nweight-only quantization, has emerged as one of the most viable solutions. As\nthe number of bits decreases, the quantization grid broadens, thus emphasizing\nthe importance of up and down rounding. While previous studies have\ndemonstrated that fine-tuning up and down rounding with the addition of\nperturbations can enhance accuracy in some scenarios, our study is driven by\nthe precise and limited boundary of these perturbations, where only the\nthreshold for altering the rounding value is of significance. Consequently, we\npropose a concise and highly effective approach for optimizing the weight\nrounding task. Our method, named SignRound, involves lightweight block-wise\ntuning using signed gradient descent, enabling us to achieve outstanding\nresults within 400 steps. SignRound outperforms the established baseline of\nrounding-to-nearest (RTN) and competes impressively against recent methods,\nwithout introducing additional inference overhead. 
The source code will be\npublicly available at https://github.com/intel/neural-compressor soon.\n","authors":["Wenhua Cheng","Weiwei Zhang","Haihao Shen","Yiyang Cai","Xin He","Kaokao Lv"],"pdf_url":"https://arxiv.org/pdf/2309.05516v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.03307v2","updated":"2023-09-11T14:54:49Z","published":"2023-08-22T18:35:51Z","title":"Several fitness functions and entanglement gates in quantum kernel\n generation","summary":" Quantum machine learning (QML) represents a promising frontier in the realm\nof quantum technologies. In this pursuit of quantum advantage, the quantum\nkernel method for support vector machine has emerged as a powerful approach.\nEntanglement, a fundamental concept in quantum mechanics, assumes a central\nrole in quantum computing. In this paper, we study the necessities of\nentanglement gates in the quantum kernel methods. We present several fitness\nfunctions for a multi-objective genetic algorithm that simultaneously maximizes\nclassification accuracy while minimizing both the local and non-local gate\ncosts of the quantum feature map's circuit. We conduct comparisons with\nclassical classifiers to gain insights into the benefits of employing\nentanglement gates. Surprisingly, our experiments reveal that the optimal\nconfiguration of quantum circuits for the quantum kernel method incorporates a\nproportional number of non-local gates for entanglement, contrary to previous\nliterature where non-local gates were largely suppressed.\n Furthermore, we demonstrate that the separability indexes of data can be\neffectively leveraged to determine the number of non-local gates required for\nthe quantum support vector machine's feature maps. This insight can\nsignificantly aid in selecting appropriate parameters, such as the entanglement\nparameter, in various quantum programming packages like https://qiskit.org/\nbased on data analysis. Our findings offer valuable guidance for enhancing the\nefficiency and accuracy of quantum machine learning algorithm\n","authors":["Haiyan Wang"],"pdf_url":"https://arxiv.org/pdf/2309.03307v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.05505v1","updated":"2023-09-11T14:46:55Z","published":"2023-09-11T14:46:55Z","title":"Share Your Representation Only: Guaranteed Improvement of the\n Privacy-Utility Tradeoff in Federated Learning","summary":" Repeated parameter sharing in federated learning causes significant\ninformation leakage about private data, thus defeating its main purpose: data\nprivacy. Mitigating the risk of this information leakage, using state of the\nart differentially private algorithms, also does not come for free. Randomized\nmechanisms can prevent convergence of models on learning even the useful\nrepresentation functions, especially if there is more disagreement between\nlocal models on the classification functions (due to data heterogeneity). In\nthis paper, we consider a representation federated learning objective that\nencourages various parties to collaboratively refine the consensus part of the\nmodel, with differential privacy guarantees, while separately allowing\nsufficient freedom for local personalization (without releasing it). We prove\nthat in the linear representation setting, while the objective is non-convex,\nour proposed new algorithm \\DPFEDREP\\ converges to a ball centered around the\n\\emph{global optimal} solution at a linear rate, and the radius of the ball is\nproportional to the reciprocal of the privacy budget. 
With this novel utility\nanalysis, we improve the SOTA utility-privacy trade-off for this problem by a\nfactor of $\\sqrt{d}$, where $d$ is the input dimension. We empirically evaluate\nour method with the image classification task on CIFAR10, CIFAR100, and EMNIST,\nand observe a significant performance improvement over the prior work under the\nsame small privacy budget. The code can be found in this link:\nhttps://github.com/shenzebang/CENTAUR-Privacy-Federated-Representation-Learning.\n","authors":["Zebang Shen","Jiayuan Ye","Anmin Kang","Hamed Hassani","Reza Shokri"],"pdf_url":"https://arxiv.org/pdf/2309.05505v1.pdf","comment":"ICLR 2023 revised"},{"id":"http://arxiv.org/abs/2210.13532v2","updated":"2023-09-11T14:37:33Z","published":"2022-10-24T18:33:35Z","title":"Adaptive Top-K in SGD for Communication-Efficient Distributed Learning","summary":" Distributed stochastic gradient descent (SGD) with gradient compression has\nbecome a popular communication-efficient solution for accelerating distributed\nlearning. One commonly used method for gradient compression is Top-K\nsparsification, which sparsifies the gradients by a fixed degree during model\ntraining. However, there has been a lack of an adaptive approach to adjust the\nsparsification degree to maximize the potential of the model's performance or\ntraining speed. This paper proposes a novel adaptive Top-K in SGD framework\nthat enables an adaptive degree of sparsification for each gradient descent\nstep to optimize the convergence performance by balancing the trade-off between\ncommunication cost and convergence error. Firstly, an upper bound of\nconvergence error is derived for the adaptive sparsification scheme and the\nloss function. Secondly, an algorithm is designed to minimize the convergence\nerror under the communication cost constraints. Finally, numerical results on\nthe MNIST and CIFAR-10 datasets demonstrate that the proposed adaptive Top-K\nalgorithm in SGD achieves a significantly better convergence rate compared to\nstate-of-the-art methods, even after considering error compensation.\n","authors":["Mengzhe Ruan","Guangfeng Yan","Yuanzhang Xiao","Linqi Song","Weitao Xu"],"pdf_url":"https://arxiv.org/pdf/2210.13532v2.pdf","comment":"6 pages, 10 figures, has been accepted by GlobeCom 2023"},{"id":"http://arxiv.org/abs/2309.05490v1","updated":"2023-09-11T14:32:04Z","published":"2023-09-11T14:32:04Z","title":"Learning Semantic Segmentation with Query Points Supervision on Aerial\n Images","summary":" Semantic segmentation is crucial in remote sensing, where high-resolution\nsatellite images are segmented into meaningful regions. Recent advancements in\ndeep learning have significantly improved satellite image segmentation.\nHowever, most of these methods are typically trained in fully supervised\nsettings that require high-quality pixel-level annotations, which are expensive\nand time-consuming to obtain. In this work, we present a weakly supervised\nlearning algorithm to train semantic segmentation algorithms that only rely on\nquery point annotations instead of full mask labels. Our proposed approach\nperforms accurate semantic segmentation and improves efficiency by\nsignificantly reducing the cost and time required for manual annotation.\nSpecifically, we generate superpixels and extend the query point labels into\nthose superpixels that group similar meaningful semantics. Then, we train\nsemantic segmentation models, supervised with images partially labeled with the\nsuperpixels pseudo-labels. 
We benchmark our weakly supervised training approach\non an aerial image dataset and different semantic segmentation architectures,\nshowing that we can reach competitive performance compared to fully supervised\ntraining while reducing the annotation effort.\n","authors":["Santiago Rivier","Carlos Hinojosa","Silvio Giancola","Bernard Ghanem"],"pdf_url":"https://arxiv.org/pdf/2309.05490v1.pdf","comment":"Paper presented at the LXCV workshop at ICCV 2023"},{"id":"http://arxiv.org/abs/2302.09700v2","updated":"2023-09-11T14:19:05Z","published":"2023-02-20T00:15:45Z","title":"Leveraging Reviews: Learning to Price with Buyer and Seller Uncertainty","summary":" In online marketplaces, customers have access to hundreds of reviews for a\nsingle product. Buyers often use reviews from other customers that share their\ntype -- such as height for clothing, skin type for skincare products, and\nlocation for outdoor furniture -- to estimate their values, which they may not\nknow a priori. Customers with few relevant reviews may hesitate to make a\npurchase except at a low price, so for the seller, there is a tension between\nsetting high prices and ensuring that there are enough reviews so that buyers\ncan confidently estimate their values. Simultaneously, sellers may use reviews\nto gauge the demand for items they wish to sell.\n In this work, we study this pricing problem in an online setting where the\nseller interacts with a set of buyers of finitely many types, one by one, over\na series of $T$ rounds. At each round, the seller first sets a price. Then a\nbuyer arrives and examines the reviews of the previous buyers with the same\ntype, which reveal those buyers' ex-post values. Based on the reviews, the\nbuyer decides to purchase if they have good reason to believe that their\nex-ante utility is positive. Crucially, the seller does not know the buyer's\ntype when setting the price, nor even the distribution over types. We provide a\nno-regret algorithm that the seller can use to obtain high revenue. When there\nare $d$ types, after $T$ rounds, our algorithm achieves a problem-independent\n$\\tilde O(T^{2/3}d^{1/3})$ regret bound. However, when the smallest probability\n$q_{\\text{min}}$ that any given type appears is large, specifically when\n$q_{\\text{min}} \\in \\Omega(d^{-2/3}T^{-1/3})$, then the same algorithm achieves\na $\\tilde O(T^{1/2}q_{\\text{min}}^{-1/2})$ regret bound. We complement these\nupper bounds with matching lower bounds in both regimes, showing that our\nalgorithm is minimax optimal up to lower-order terms.\n","authors":["Wenshuo Guo","Nika Haghtalab","Kirthevasan Kandasamy","Ellen Vitercik"],"pdf_url":"https://arxiv.org/pdf/2302.09700v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.04504v2","updated":"2023-09-11T14:18:42Z","published":"2023-07-10T11:56:04Z","title":"An Algorithm with Optimal Dimension-Dependence for Zero-Order Nonsmooth\n Nonconvex Stochastic Optimization","summary":" We study the complexity of producing $(\\delta,\\epsilon)$-stationary points of\nLipschitz objectives which are possibly neither smooth nor convex, using only\nnoisy function evaluations. Recent works proposed several stochastic zero-order\nalgorithms that solve this task, all of which suffer from a\ndimension-dependence of $\\Omega(d^{3/2})$ where $d$ is the dimension of the\nproblem, which was conjectured to be optimal. 
We refute this conjecture by\nproviding a faster algorithm that has complexity\n$O(d\\delta^{-1}\\epsilon^{-3})$, which is optimal (up to numerical constants)\nwith respect to $d$ and also optimal with respect to the accuracy parameters\n$\\delta,\\epsilon$, thus solving an open question due to Lin et al.\n(NeurIPS'22). Moreover, the convergence rate achieved by our algorithm is also\noptimal for smooth objectives, proving that in the nonconvex stochastic\nzero-order setting, nonsmooth optimization is as easy as smooth optimization.\nWe provide algorithms that achieve the aforementioned convergence rate in\nexpectation as well as with high probability. Our analysis is based on a simple\nyet powerful geometric lemma regarding the Goldstein-subdifferential set, which\nallows utilizing recent advancements in first-order nonsmooth nonconvex\noptimization.\n","authors":["Guy Kornowski","Ohad Shamir"],"pdf_url":"https://arxiv.org/pdf/2307.04504v2.pdf","comment":"Fixed hyperparameter assignments in main theorems (results\n unaffected); some minor edits"},{"id":"http://arxiv.org/abs/2309.05477v1","updated":"2023-09-11T14:16:37Z","published":"2023-09-11T14:16:37Z","title":"Learning Objective-Specific Active Learning Strategies with Attentive\n Neural Processes","summary":" Pool-based active learning (AL) is a promising technology for increasing\ndata-efficiency of machine learning models. However, surveys show that\nperformance of recent AL methods is very sensitive to the choice of dataset and\ntraining setting, making them unsuitable for general application. In order to\ntackle this problem, the field Learning Active Learning (LAL) suggests to learn\nthe active learning strategy itself, allowing it to adapt to the given setting.\nIn this work, we propose a novel LAL method for classification that exploits\nsymmetry and independence properties of the active learning problem with an\nAttentive Conditional Neural Process model. Our approach is based on learning\nfrom a myopic oracle, which gives our model the ability to adapt to\nnon-standard objectives, such as those that do not equally weight the error on\nall data points. We experimentally verify that our Neural Process model\noutperforms a variety of baselines in these settings. Finally, our experiments\nshow that our model exhibits a tendency towards improved stability to changing\ndatasets. However, performance is sensitive to choice of classifier and more\nwork is necessary to reduce the performance gap with the myopic oracle and\nto improve scalability. We present our work as a proof-of-concept for LAL on\nnonstandard objectives and hope our analysis and modelling considerations\ninspire future LAL work.\n","authors":["Tim Bakker","Herke van Hoof","Max Welling"],"pdf_url":"https://arxiv.org/pdf/2309.05477v1.pdf","comment":"Accepted at ECML 2023"},{"id":"http://arxiv.org/abs/2309.05473v1","updated":"2023-09-11T14:13:30Z","published":"2023-09-11T14:13:30Z","title":"Machine learning the dimension of a Fano variety","summary":" Fano varieties are basic building blocks in geometry - they are `atomic\npieces' of mathematical shapes. Recent progress in the classification of Fano\nvarieties involves analysing an invariant called the quantum period. This is a\nsequence of integers which gives a numerical fingerprint for a Fano variety. It\nis conjectured that a Fano variety is uniquely determined by its quantum\nperiod. If this is true, one should be able to recover geometric properties of\na Fano variety directly from its quantum period. 
We apply machine learning to\nthe question: does the quantum period of X know the dimension of X? Note that\nthere is as yet no theoretical understanding of this. We show that a simple\nfeed-forward neural network can determine the dimension of X with 98% accuracy.\nBuilding on this, we establish rigorous asymptotics for the quantum periods of\na class of Fano varieties. These asymptotics determine the dimension of X from\nits quantum period. Our results demonstrate that machine learning can pick out\nstructure from complex mathematical data in situations where we lack\ntheoretical understanding. They also give positive evidence for the conjecture\nthat the quantum period of a Fano variety determines that variety.\n","authors":["Tom Coates","Alexander M. Kasprzyk","Sara Veneziale"],"pdf_url":"https://arxiv.org/pdf/2309.05473v1.pdf","comment":"28 pages, 5 tables, 23 figures. This version of the article has been\n accepted for publication, after peer review but is not the Version of Record\n and does not reflect post-acceptance improvements, or any corrections"},{"id":"http://arxiv.org/abs/2309.01838v2","updated":"2023-09-11T14:09:53Z","published":"2023-09-04T22:25:49Z","title":"Efficient Defense Against Model Stealing Attacks on Convolutional Neural\n Networks","summary":" Model stealing attacks have become a serious concern for deep learning\nmodels, where an attacker can steal a trained model by querying its black-box\nAPI. This can lead to intellectual property theft and other security and\nprivacy risks. The current state-of-the-art defenses against model stealing\nattacks suggest adding perturbations to the prediction probabilities. However,\nthey suffer from heavy computations and make impracticable assumptions about\nthe adversary. They often require the training of auxiliary models. This can be\ntime-consuming and resource-intensive which hinders the deployment of these\ndefenses in real-world applications. In this paper, we propose a simple yet\neffective and efficient defense alternative. We introduce a heuristic approach\nto perturb the output probabilities. The proposed defense can be easily\nintegrated into models without additional training. We show that our defense is\neffective in defending against three state-of-the-art stealing attacks. We\nevaluate our approach on large and quantized (i.e., compressed) Convolutional\nNeural Networks (CNNs) trained on several vision datasets. Our technique\noutperforms the state-of-the-art defenses with a $\\times37$ faster inference\nlatency without requiring any additional model and with a low impact on the\nmodel's performance. We validate that our defense is also effective for\nquantized CNNs targeting edge devices.\n","authors":["Kacem Khaled","Mouna Dhaouadi","Felipe Gohring de Magalhães","Gabriela Nicolescu"],"pdf_url":"https://arxiv.org/pdf/2309.01838v2.pdf","comment":"Accepted for publication at 2023 International Conference on Machine\n Learning and Applications (ICMLA). Proceedings of ICMLA, Florida, USA\n \\c{opyright}2023 IEEE"},{"id":"http://arxiv.org/abs/2309.02869v2","updated":"2023-09-11T14:09:36Z","published":"2023-09-06T09:47:36Z","title":"On Reducing Undesirable Behavior in Deep Reinforcement Learning Models","summary":" Deep reinforcement learning (DRL) has proven extremely useful in a large\nvariety of application domains. However, even successful DRL-based software can\nexhibit highly undesirable behavior. 
This is due to DRL training being based on\nmaximizing a reward function, which typically captures general trends but\ncannot precisely capture, or rule out, certain behaviors of the system. In this\npaper, we propose a novel framework aimed at drastically reducing the\nundesirable behavior of DRL-based software, while maintaining its excellent\nperformance. In addition, our framework can assist in providing engineers with\na comprehensible characterization of such undesirable behavior. Under the hood,\nour approach is based on extracting decision tree classifiers from erroneous\nstate-action pairs, and then integrating these trees into the DRL training\nloop, penalizing the system whenever it performs an error. We provide a\nproof-of-concept implementation of our approach, and use it to evaluate the\ntechnique on three significant case studies. We find that our approach can\nextend existing frameworks in a straightforward manner, and incurs only a\nslight overhead in training time. Further, it incurs only a very slight hit to\nperformance, or even in some cases - improves it, while significantly reducing\nthe frequency of undesirable behavior.\n","authors":["Ophir M. Carmel","Guy Katz"],"pdf_url":"https://arxiv.org/pdf/2309.02869v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2212.09993v6","updated":"2023-09-11T13:58:44Z","published":"2022-12-20T04:33:32Z","title":"Are Deep Neural Networks SMARTer than Second Graders?","summary":" Recent times have witnessed an increasing number of applications of deep\nneural networks towards solving tasks that require superior cognitive\nabilities, e.g., playing Go, generating art, ChatGPT, etc. Such a dramatic\nprogress raises the question: how generalizable are neural networks in solving\nproblems that demand broad skills? To answer this question, we propose SMART: a\nSimple Multimodal Algorithmic Reasoning Task and the associated SMART-101\ndataset, for evaluating the abstraction, deduction, and generalization\nabilities of neural networks in solving visuo-linguistic puzzles designed\nspecifically for children in the 6--8 age group. Our dataset consists of 101\nunique puzzles; each puzzle comprises a picture and a question, and their\nsolution needs a mix of several elementary skills, including arithmetic,\nalgebra, and spatial reasoning, among others. To scale our dataset towards\ntraining deep neural networks, we programmatically generate entirely new\ninstances for each puzzle, while retaining their solution algorithm. To\nbenchmark performances on SMART-101, we propose a vision and language\nmeta-learning model using varied state-of-the-art backbones. Our experiments\nreveal that while powerful deep models offer reasonable performances on puzzles\nin a supervised setting, they are not better than random accuracy when analyzed\nfor generalization. We also evaluate the recent ChatGPT and other large\nlanguage models on a subset of SMART-101 and find that while these models show\nconvincing reasoning abilities, the answers are often incorrect.\n","authors":["Anoop Cherian","Kuan-Chuan Peng","Suhas Lohit","Kevin A. Smith","Joshua B. Tenenbaum"],"pdf_url":"https://arxiv.org/pdf/2212.09993v6.pdf","comment":"Extended version of CVPR 2023 paper. 
For the SMART-101 dataset, see\n http://smartdataset.github.io/smart101"},{"id":"http://arxiv.org/abs/2309.05457v1","updated":"2023-09-11T13:51:40Z","published":"2023-09-11T13:51:40Z","title":"Unveiling the Sentinels: Assessing AI Performance in Cybersecurity Peer\n Review","summary":" Peer review is the method employed by the scientific community for evaluating\nresearch advancements. In the field of cybersecurity, the practice of\ndouble-blind peer review is the de-facto standard. This paper touches on the\nholy grail of peer reviewing and aims to shed light on the performance of AI in\nreviewing for academic security conferences. Specifically, we investigate the\npredictability of reviewing outcomes by comparing the results obtained from\nhuman reviewers and machine-learning models. To facilitate our study, we\nconstruct a comprehensive dataset by collecting thousands of papers from\nrenowned computer science conferences and the arXiv preprint website. Based on\nthe collected data, we evaluate the prediction capabilities of ChatGPT and a\ntwo-stage classification approach based on the Doc2Vec model with various\nclassifiers. Our experimental evaluation of review outcome prediction using the\nDoc2Vec-based approach performs significantly better than the ChatGPT and\nachieves an accuracy of over 90%. While analyzing the experimental results, we\nidentify the potential advantages and limitations of the tested ML models. We\nexplore areas within the paper-reviewing process that can benefit from\nautomated support approaches, while also recognizing the irreplaceable role of\nhuman intellect in certain aspects that cannot be matched by state-of-the-art\nAI techniques.\n","authors":["Liang Niu","Nian Xue","Christina Pöpper"],"pdf_url":"https://arxiv.org/pdf/2309.05457v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.05455v1","updated":"2023-09-11T13:51:06Z","published":"2023-09-11T13:51:06Z","title":"Diffusion-Based Co-Speech Gesture Generation Using Joint Text and Audio\n Representation","summary":" This paper describes a system developed for the GENEA (Generation and\nEvaluation of Non-verbal Behaviour for Embodied Agents) Challenge 2023. Our\nsolution builds on an existing diffusion-based motion synthesis model. We\npropose a contrastive speech and motion pretraining (CSMP) module, which learns\na joint embedding for speech and gesture with the aim to learn a semantic\ncoupling between these modalities. The output of the CSMP module is used as a\nconditioning signal in the diffusion-based gesture synthesis model in order to\nachieve semantically-aware co-speech gesture generation. Our entry achieved\nhighest human-likeness and highest speech appropriateness rating among the\nsubmitted entries. This indicates that our system is a promising approach to\nachieve human-like co-speech gestures in agents that carry semantic meaning.\n","authors":["Anna Deichler","Shivam Mehta","Simon Alexanderson","Jonas Beskow"],"pdf_url":"https://arxiv.org/pdf/2309.05455v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.05444v1","updated":"2023-09-11T13:31:00Z","published":"2023-09-11T13:31:00Z","title":"Pushing Mixture of Experts to the Limit: Extremely Parameter Efficient\n MoE for Instruction Tuning","summary":" The Mixture of Experts (MoE) is a widely known neural architecture where an\nensemble of specialized sub-models optimizes overall performance with a\nconstant computational cost. However, conventional MoEs pose challenges at\nscale due to the need to store all experts in memory. 
In this paper, we push\nMoE to the limit. We propose extremely parameter-efficient MoE by uniquely\ncombining MoE architecture with lightweight experts.Our MoE architecture\noutperforms standard parameter-efficient fine-tuning (PEFT) methods and is on\npar with full fine-tuning by only updating the lightweight experts -- less than\n1% of an 11B parameters model. Furthermore, our method generalizes to unseen\ntasks as it does not depend on any prior task knowledge. Our research\nunderscores the versatility of the mixture of experts architecture, showcasing\nits ability to deliver robust performance even when subjected to rigorous\nparameter constraints. Our code used in all the experiments is publicly\navailable here: https://github.com/for-ai/parameter-efficient-moe.\n","authors":["Ted Zadouri","Ahmet Üstün","Arash Ahmadian","Beyza Ermiş","Acyr Locatelli","Sara Hooker"],"pdf_url":"https://arxiv.org/pdf/2309.05444v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.05436v1","updated":"2023-09-11T13:18:19Z","published":"2023-09-11T13:18:19Z","title":"Quantized Fourier and Polynomial Features for more Expressive Tensor\n Network Models","summary":" In the context of kernel machines, polynomial and Fourier features are\ncommonly used to provide a nonlinear extension to linear models by mapping the\ndata to a higher-dimensional space. Unless one considers the dual formulation\nof the learning problem, which renders exact large-scale learning unfeasible,\nthe exponential increase of model parameters in the dimensionality of the data\ncaused by their tensor-product structure prohibits to tackle high-dimensional\nproblems. One of the possible approaches to circumvent this exponential scaling\nis to exploit the tensor structure present in the features by constraining the\nmodel weights to be an underparametrized tensor network. In this paper we\nquantize, i.e. further tensorize, polynomial and Fourier features. Based on\nthis feature quantization we propose to quantize the associated model weights,\nyielding quantized models. We show that, for the same number of model\nparameters, the resulting quantized models have a higher bound on the\nVC-dimension as opposed to their non-quantized counterparts, at no additional\ncomputational cost while learning from identical features. We verify\nexperimentally how this additional tensorization regularizes the learning\nproblem by prioritizing the most salient features in the data and how it\nprovides models with increased generalization capabilities. We finally\nbenchmark our approach on large regression task, achieving state-of-the-art\nresults on a laptop computer.\n","authors":["Frederiek Wesel","Kim Batselier"],"pdf_url":"https://arxiv.org/pdf/2309.05436v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.05434v1","updated":"2023-09-11T13:13:54Z","published":"2023-09-11T13:13:54Z","title":"A parameterised model for link prediction using node centrality and\n similarity measure based on graph embedding","summary":" Link prediction is a key aspect of graph machine learning, with applications\nas diverse as disease prediction, social network recommendations, and drug\ndiscovery. It involves predicting new links that may form between network\nnodes. Despite the clear importance of link prediction, existing models have\nsignificant shortcomings. 
Graph Convolutional Networks, for instance, have been\nproven to be highly efficient for link prediction on a variety of datasets.\nHowever, they encounter severe limitations when applied to short-path networks\nand ego networks, resulting in poor performance. This presents a critical\nproblem space that this work aims to address. In this paper, we present the\nNode Centrality and Similarity Based Parameterised Model (NCSM), a novel method\nfor link prediction tasks. NCSM uniquely integrates node centrality and\nsimilarity measures as edge features in a customised Graph Neural Network (GNN)\nlayer, effectively leveraging the topological information of large networks.\nThis model represents the first parameterised GNN-based link prediction model\nthat considers topological information. The proposed model was evaluated on\nfive benchmark graph datasets, each comprising thousands of nodes and edges.\nExperimental results highlight NCSM's superiority over existing\nstate-of-the-art models like Graph Convolutional Networks and Variational Graph\nAutoencoder, as it outperforms them across various metrics and datasets. This\nexceptional performance can be attributed to NCSM's innovative integration of\nnode centrality, similarity measures, and its efficient use of topological\ninformation.\n","authors":["Haohui Lu","Shahadat Uddin"],"pdf_url":"https://arxiv.org/pdf/2309.05434v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.05430v1","updated":"2023-09-11T13:06:19Z","published":"2023-09-11T13:06:19Z","title":"Neuromorphic Auditory Perception by Neural Spiketrum","summary":" Neuromorphic computing holds the promise to achieve the energy efficiency and\nrobust learning performance of biological neural systems. To realize the\npromised brain-like intelligence, it needs to solve the challenges of the\nneuromorphic hardware architecture design of biological neural substrate and\nthe hardware amicable algorithms with spike-based encoding and learning. Here\nwe introduce a neural spike coding model termed spiketrum, to characterize and\ntransform the time-varying analog signals, typically auditory signals, into\ncomputationally efficient spatiotemporal spike patterns. It minimizes the\ninformation loss occurring at the analog-to-spike transformation and possesses\ninformational robustness to neural fluctuations and spike losses. The model\nprovides a sparse and efficient coding scheme with precisely controllable spike\nrate that facilitates training of spiking neural networks in various auditory\nperception tasks. We further investigate the algorithm-hardware co-designs\nthrough a neuromorphic cochlear prototype which demonstrates that our approach\ncan provide a systematic solution for spike-based artificial intelligence by\nfully exploiting its advantages with spike-based computation.\n","authors":["Huajin Tang","Pengjie Gu","Jayawan Wijekoon","MHD Anas Alsakkal","Ziming Wang","Jiangrong Shen","Rui Yan"],"pdf_url":"https://arxiv.org/pdf/2309.05430v1.pdf","comment":"This work has been submitted to the IEEE for possible publication"},{"id":"http://arxiv.org/abs/2301.11936v2","updated":"2023-09-11T13:00:29Z","published":"2023-01-27T19:00:00Z","title":"Quantum Ridgelet Transform: Winning Lottery Ticket of Neural Networks\n with Quantum Computation","summary":" A significant challenge in the field of quantum machine learning (QML) is to\nestablish applications of quantum computation to accelerate common tasks in\nmachine learning such as those for neural networks. 
Ridgelet transform has been\na fundamental mathematical tool in the theoretical studies of neural networks,\nbut the practical applicability of ridgelet transform to conducting learning\ntasks was limited since its numerical implementation by conventional classical\ncomputation requires an exponential runtime $\\exp(O(D))$ as data dimension $D$\nincreases. To address this problem, we develop a quantum ridgelet transform\n(QRT), which implements the ridgelet transform of a quantum state within a\nlinear runtime $O(D)$ of quantum computation. As an application, we also show\nthat one can use QRT as a fundamental subroutine for QML to efficiently find a\nsparse trainable subnetwork of large shallow wide neural networks without\nconducting large-scale optimization of the original network. This application\ndiscovers an efficient way in this regime to demonstrate the lottery ticket\nhypothesis on finding such a sparse trainable neural network. These results\nopen an avenue of QML for accelerating learning tasks with commonly used\nclassical neural networks.\n","authors":["Hayata Yamasaki","Sathyawageeswar Subramanian","Satoshi Hayakawa","Sho Sonoda"],"pdf_url":"https://arxiv.org/pdf/2301.11936v2.pdf","comment":"27 pages, 4 figures"},{"id":"http://arxiv.org/abs/2309.03224v2","updated":"2023-09-11T12:50:49Z","published":"2023-09-01T13:10:54Z","title":"No Train Still Gain. Unleash Mathematical Reasoning of Large Language\n Models with Monte Carlo Tree Search Guided by Energy Function","summary":" Large language models (LLMs) demonstrate impressive language understanding\nand contextual learning abilities, making them suitable for natural language\nprocessing (NLP) tasks and complex mathematical reasoning. However, when\napplied to mathematical reasoning tasks, LLMs often struggle to generate\ncorrect reasoning steps and answers despite having high probabilities for the\nsolutions. To overcome this limitation and enhance the mathematical reasoning\ncapabilities of fine-tuned LLMs without additional fine-tuning steps, we\npropose a method that incorporates Monte Carlo Tree Search (MCTS) and a\nlightweight energy function to rank decision steps and enable immediate\nreaction and precise reasoning. Specifically, we re-formulate the fine-tuned\nLLMs into a Residual-based Energy Model (Residual-EBM) and employ noise\ncontrastive estimation to estimate the energy function's parameters. We then\nutilize MCTS with the energy function as a path verifier to search the output\nspace and evaluate the reasoning path. Through extensive experiments on two\nmathematical reasoning benchmarks, GSM8k and AQUA-RAT, we demonstrate the\nexceptional capabilities of our method, which significantly improves the pass@1\nmetric of the fine-tuned model without requiring additional fine-tuning or\nreinforcement learning with human feedback alignment.\n","authors":["Haotian Xu"],"pdf_url":"https://arxiv.org/pdf/2309.03224v2.pdf","comment":"still in progress"},{"id":"http://arxiv.org/abs/2309.05413v1","updated":"2023-09-11T12:26:36Z","published":"2023-09-11T12:26:36Z","title":"Learning noise-induced transitions by multi-scaling reservoir computing","summary":" Noise is usually regarded as adversarial to extract the effective dynamics\nfrom time series, such that the conventional data-driven approaches usually aim\nat learning the dynamics by mitigating the noisy effect. However, noise can\nhave a functional role of driving transitions between stable states underlying\nmany natural and engineered stochastic dynamics. 
To capture such stochastic\ntransitions from data, we find that leveraging a machine learning model,\nreservoir computing as a type of recurrent neural network, can learn\nnoise-induced transitions. We develop a concise training protocol for tuning\nhyperparameters, with a focus on a pivotal hyperparameter controlling the time\nscale of the reservoir dynamics. The trained model generates accurate\nstatistics of transition time and the number of transitions. The approach is\napplicable to a wide class of systems, including a bistable system under a\ndouble-well potential, with either white noise or colored noise. It is also\naware of the asymmetry of the double-well potential, the rotational dynamics\ncaused by non-detailed balance, and transitions in multi-stable systems. For\nthe experimental data of protein folding, it learns the transition time between\nfolded states, providing a possibility of predicting transition statistics from\na small dataset. The results demonstrate the capability of machine-learning\nmethods in capturing noise-induced phenomena.\n","authors":["Zequn Lin","Zhaofan Lu","Zengru Di","Ying Tang"],"pdf_url":"https://arxiv.org/pdf/2309.05413v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2305.16325v2","updated":"2023-09-11T12:15:09Z","published":"2023-05-10T13:03:06Z","title":"Graph Neural Network Interatomic Potential Ensembles with Calibrated\n Aleatoric and Epistemic Uncertainty on Energy and Forces","summary":" Inexpensive machine learning potentials are increasingly being used to speed\nup structural optimization and molecular dynamics simulations of materials by\niteratively predicting and applying interatomic forces. In these settings, it\nis crucial to detect when predictions are unreliable to avoid wrong or\nmisleading results. Here, we present a complete framework for training and\nrecalibrating graph neural network ensemble models to produce accurate\npredictions of energy and forces with calibrated uncertainty estimates. The\nproposed method considers both epistemic and aleatoric uncertainty and the\ntotal uncertainties are recalibrated post hoc using a nonlinear scaling\nfunction to achieve good calibration on previously unseen data, without loss of\npredictive accuracy. The method is demonstrated and evaluated on two\nchallenging, publicly available datasets, ANI-1x (Smith et al.) and\nTransition1x (Schreiner et al.), both containing diverse conformations far from\nequilibrium. A detailed analysis of the predictive performance and uncertainty\ncalibration is provided. In all experiments, the proposed method achieved low\nprediction error and good uncertainty calibration, with predicted uncertainty\ncorrelating with expected error, on energy and forces. To the best of our\nknowledge, the method presented in this paper is the first to consider a\ncomplete framework for obtaining calibrated epistemic and aleatoric uncertainty\npredictions on both energy and forces in ML potentials.\n","authors":["Jonas Busk","Mikkel N. Schmidt","Ole Winther","Tejs Vegge","Peter Bjørn Jørgensen"],"pdf_url":"https://arxiv.org/pdf/2305.16325v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.05404v1","updated":"2023-09-11T12:10:19Z","published":"2023-09-11T12:10:19Z","title":"Physics-informed reinforcement learning via probabilistic co-adjustment\n functions","summary":" Reinforcement learning of real-world tasks is very data inefficient, and\nextensive simulation-based modelling has become the dominant approach for\ntraining systems. 
However, in human-robot interaction and many other real-world\nsettings, there is no appropriate one-model-for-all due to differences in\nindividual instances of the system (e.g. different people) or necessary\noversimplifications in the simulation models. This requires two approaches: 1.\neither learning the individual system's dynamics approximately from data which\nrequires data-intensive training or 2. using a complete digital twin of the\ninstances, which may not be realisable in many cases. We introduce two\napproaches: co-kriging adjustments (CKA) and ridge regression adjustment (RRA)\nas novel ways to combine the advantages of both approaches. Our adjustment\nmethods are based on an auto-regressive AR1 co-kriging model that we integrate\nwith GP priors. This yield a data- and simulation-efficient way of using\nsimplistic simulation models (e.g., simple two-link model) and rapidly adapting\nthem to individual instances (e.g., biomechanics of individual people). Using\nCKA and RRA, we obtain more accurate uncertainty quantification of the entire\nsystem's dynamics than pure GP-based and AR1 methods. We demonstrate the\nefficiency of co-kriging adjustment with an interpretable reinforcement\nlearning control example, learning to control a biomechanical human arm using\nonly a two-link arm simulation model (offline part) and CKA derived from a\nsmall amount of interaction data (on-the-fly online). Our method unlocks an\nefficient and uncertainty-aware way to implement reinforcement learning methods\nin real world complex systems for which only imperfect simulation models exist.\n","authors":["Nat Wannawas","A. Aldo Faisal"],"pdf_url":"https://arxiv.org/pdf/2309.05404v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.05395v1","updated":"2023-09-11T11:54:42Z","published":"2023-09-11T11:54:42Z","title":"Practical Homomorphic Aggregation for Byzantine ML","summary":" Due to the large-scale availability of data, machine learning (ML) algorithms\nare being deployed in distributed topologies, where different nodes collaborate\nto train ML models over their individual data by exchanging model-related\ninformation (e.g., gradients) with a central server. However, distributed\nlearning schemes are notably vulnerable to two threats. First, Byzantine nodes\ncan single-handedly corrupt the learning by sending incorrect information to\nthe server, e.g., erroneous gradients. The standard approach to mitigate such\nbehavior is to use a non-linear robust aggregation method at the server.\nSecond, the server can violate the privacy of the nodes. Recent attacks have\nshown that exchanging (unencrypted) gradients enables a curious server to\nrecover the totality of the nodes' data. The use of homomorphic encryption\n(HE), a gold standard security primitive, has extensively been studied as a\nprivacy-preserving solution to distributed learning in non-Byzantine scenarios.\nHowever, due to HE's large computational demand especially for high-dimensional\nML models, there has not yet been any attempt to design purely homomorphic\noperators for non-linear robust aggregators. In this work, we present SABLE,\nthe first completely homomorphic and Byzantine robust distributed learning\nalgorithm. SABLE essentially relies on a novel plaintext encoding method that\nenables us to implement the robust aggregator over batching-friendly BGV.\nMoreover, this encoding scheme also accelerates state-of-the-art homomorphic\nsorting with larger security margins and smaller ciphertext size. 
We perform\nextensive experiments on image classification tasks and show that our algorithm\nachieves practical execution times while matching the ML performance of its\nnon-private counterpart.\n","authors":["Antoine Choffrut","Rachid Guerraoui","Rafael Pinot","Renaud Sirdey","John Stephan","Martin Zuber"],"pdf_url":"https://arxiv.org/pdf/2309.05395v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.05391v1","updated":"2023-09-11T11:42:28Z","published":"2023-09-11T11:42:28Z","title":"Career Path Recommendations for Long-term Income Maximization: A\n Reinforcement Learning Approach","summary":" This study explores the potential of reinforcement learning algorithms to\nenhance career planning processes. Leveraging data from Randstad The\nNetherlands, the study simulates the Dutch job market and develops strategies\nto optimize employees' long-term income. By formulating career planning as a\nMarkov Decision Process (MDP) and utilizing machine learning algorithms such as\nSarsa, Q-Learning, and A2C, we learn optimal policies that recommend career\npaths with high-income occupations and industries. The results demonstrate\nsignificant improvements in employees' income trajectories, with RL models,\nparticularly Q-Learning and Sarsa, achieving an average increase of 5% compared\nto observed career paths. The study acknowledges limitations, including narrow\njob filtering, simplifications in the environment formulation, and assumptions\nregarding employment continuity and zero application costs. Future research can\nexplore additional objectives beyond income optimization and address these\nlimitations to further enhance career planning processes.\n","authors":["Spyros Avlonitis","Dor Lavi","Masoud Mansoury","David Graus"],"pdf_url":"https://arxiv.org/pdf/2309.05391v1.pdf","comment":"accepted for publication at RecSys in HR '23 (at the 17th ACM\n Conference on Recommender Systems)"},{"id":"http://arxiv.org/abs/2211.02678v3","updated":"2023-09-11T11:36:04Z","published":"2022-10-27T14:24:48Z","title":"Efficient ECG-based Atrial Fibrillation Detection via Parameterised\n Hypercomplex Neural Networks","summary":" Atrial fibrillation (AF) is the most common cardiac arrhythmia and associated\nwith a high risk for serious conditions like stroke. The use of wearable\ndevices embedded with automatic and timely AF assessment from\nelectrocardiograms (ECGs) has shown to be promising in preventing\nlife-threatening situations. Although deep neural networks have demonstrated\nsuperiority in model performance, their use on wearable devices is limited by\nthe trade-off between model performance and complexity. In this work, we\npropose to use lightweight convolutional neural networks (CNNs) with\nparameterised hypercomplex (PH) layers for AF detection based on ECGs. The\nproposed approach trains small-scale CNNs, thus overcoming the limited\ncomputing resources on wearable devices. We show comparable performance to\ncorresponding real-valued CNNs on two publicly available ECG datasets using\nsignificantly fewer model parameters. 
PH models are more flexible than other\nhypercomplex neural networks and can operate on any number of input ECG leads.\n","authors":["Leonie Basso","Zhao Ren","Wolfgang Nejdl"],"pdf_url":"https://arxiv.org/pdf/2211.02678v3.pdf","comment":"Published at EUSIPCO 2023"},{"id":"http://arxiv.org/abs/2303.06053v5","updated":"2023-09-11T11:19:49Z","published":"2023-03-10T16:41:24Z","title":"TSMixer: An All-MLP Architecture for Time Series Forecasting","summary":" Real-world time-series datasets are often multivariate with complex dynamics.\nTo capture this complexity, high capacity architectures like recurrent- or\nattention-based sequential deep learning models have become popular. However,\nrecent work demonstrates that simple univariate linear models can outperform\nsuch deep learning models on several commonly used academic benchmarks.\nExtending them, in this paper, we investigate the capabilities of linear models\nfor time-series forecasting and present Time-Series Mixer (TSMixer), a novel\narchitecture designed by stacking multi-layer perceptrons (MLPs). TSMixer is\nbased on mixing operations along both the time and feature dimensions to\nextract information efficiently. On popular academic benchmarks, the\nsimple-to-implement TSMixer is comparable to specialized state-of-the-art\nmodels that leverage the inductive biases of specific benchmarks. On the\nchallenging and large scale M5 benchmark, a real-world retail dataset, TSMixer\ndemonstrates superior performance compared to the state-of-the-art\nalternatives. Our results underline the importance of efficiently utilizing\ncross-variate and auxiliary information for improving the performance of time\nseries forecasting. We present various analyses to shed light into the\ncapabilities of TSMixer. The design paradigms utilized in TSMixer are expected\nto open new horizons for deep learning-based time series forecasting. The\nimplementation is available at\nhttps://github.com/google-research/google-research/tree/master/tsmixer\n","authors":["Si-An Chen","Chun-Liang Li","Nate Yoder","Sercan O. Arik","Tomas Pfister"],"pdf_url":"https://arxiv.org/pdf/2303.06053v5.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.05386v1","updated":"2023-09-11T11:18:16Z","published":"2023-09-11T11:18:16Z","title":"Data-Driven Model Reduction and Nonlinear Model Predictive Control of an\n Air Separation Unit by Applied Koopman Theory","summary":" Achieving real-time capability is an essential prerequisite for the\nindustrial implementation of nonlinear model predictive control (NMPC).\nData-driven model reduction offers a way to obtain low-order control models\nfrom complex digital twins. In particular, data-driven approaches require\nlittle expert knowledge of the particular process and its model, and provide\nreduced models of a well-defined generic structure. Herein, we apply our\nrecently proposed data-driven reduction strategy based on Koopman theory\n[Schulze et al. (2022), Comput. Chem. Eng.] to generate a low-order control\nmodel of an air separation unit (ASU). The reduced Koopman model combines\nautoencoders and linear latent dynamics and is constructed using machine\nlearning. Further, we present an NMPC implementation that uses derivative\ncomputation tailored to the fixed block structure of reduced Koopman models.\nOur reduction approach with tailored NMPC implementation enables real-time NMPC\nof an ASU at an average CPU time decrease by 98 %.\n","authors":["Jan C. Schulze","Danimir T. 
Doncevic","Nils Erwes","Alexander Mitsos"],"pdf_url":"https://arxiv.org/pdf/2309.05386v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2305.08455v3","updated":"2023-09-11T10:36:41Z","published":"2023-05-15T08:54:32Z","title":"Document Understanding Dataset and Evaluation (DUDE)","summary":" We call on the Document AI (DocAI) community to reevaluate current\nmethodologies and embrace the challenge of creating more practically-oriented\nbenchmarks. Document Understanding Dataset and Evaluation (DUDE) seeks to\nremediate the halted research progress in understanding visually-rich documents\n(VRDs). We present a new dataset with novelties related to types of questions,\nanswers, and document layouts based on multi-industry, multi-domain, and\nmulti-page VRDs of various origins, and dates. Moreover, we are pushing the\nboundaries of current methods by creating multi-task and multi-domain\nevaluation setups that more accurately simulate real-world situations where\npowerful generalization and adaptation under low-resource settings are desired.\nDUDE aims to set a new standard as a more practical, long-standing benchmark\nfor the community, and we hope that it will lead to future extensions and\ncontributions that address real-world challenges. Finally, our work illustrates\nthe importance of finding more efficient ways to model language, images, and\nlayout in DocAI.\n","authors":["Jordy Van Landeghem","Rubén Tito","Łukasz Borchmann","Michał Pietruszka","Paweł Józiak","Rafał Powalski","Dawid Jurkiewicz","Mickaël Coustaty","Bertrand Ackaert","Ernest Valveny","Matthew Blaschko","Sien Moens","Tomasz Stanisławek"],"pdf_url":"https://arxiv.org/pdf/2305.08455v3.pdf","comment":"Accepted at ICCV 2023"},{"id":"http://arxiv.org/abs/2308.07491v2","updated":"2023-09-11T10:32:02Z","published":"2023-08-14T22:58:54Z","title":"Adaptive Tracking of a Single-Rigid-Body Character in Various\n Environments","summary":" Since the introduction of DeepMimic [Peng et al. 2018], subsequent research\nhas focused on expanding the repertoire of simulated motions across various\nscenarios. In this study, we propose an alternative approach for this goal, a\ndeep reinforcement learning method based on the simulation of a\nsingle-rigid-body character. Using the centroidal dynamics model (CDM) to\nexpress the full-body character as a single rigid body (SRB) and training a\npolicy to track a reference motion, we can obtain a policy that is capable of\nadapting to various unobserved environmental changes and controller transitions\nwithout requiring any additional learning. Due to the reduced dimension of\nstate and action space, the learning process is sample-efficient. The final\nfull-body motion is kinematically generated in a physically plausible way,\nbased on the state of the simulated SRB character. The SRB simulation is\nformulated as a quadratic programming (QP) problem, and the policy outputs an\naction that allows the SRB character to follow the reference motion. 
We\ndemonstrate that our policy, efficiently trained within 30 minutes on an\nultraportable laptop, has the ability to cope with environments that have not\nbeen experienced during learning, such as running on uneven terrain or pushing\na box, and transitions between learned policies, without any additional\nlearning.\n","authors":["Taesoo Kwon","Taehong Gu","Jaewon Ahn","Yoonsang Lee"],"pdf_url":"https://arxiv.org/pdf/2308.07491v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.05361v1","updated":"2023-09-11T10:13:30Z","published":"2023-09-11T10:13:30Z","title":"Feature-based Transferable Disruption Prediction for future tokamaks\n using domain adaptation","summary":" The high acquisition cost and the significant demand for disruptive\ndischarges for data-driven disruption prediction models in future tokamaks pose\nan inherent contradiction in disruption prediction research. In this paper, we\ndemonstrated a novel approach to predict disruption in a future tokamak only\nusing a few discharges based on a domain adaptation algorithm called CORAL. It\nis the first attempt at applying domain adaptation in the disruption prediction\ntask. In this paper, this disruption prediction approach aligns a few data from\nthe future tokamak (target domain) and a large amount of data from the existing\ntokamak (source domain) to train a machine learning model in the existing\ntokamak. To simulate the existing and future tokamak case, we selected J-TEXT\nas the existing tokamak and EAST as the future tokamak. To simulate the lack of\ndisruptive data in future tokamak, we only selected 100 non-disruptive\ndischarges and 10 disruptive discharges from EAST as the target domain training\ndata. We have improved CORAL to make it more suitable for the disruption\nprediction task, called supervised CORAL. Compared to the model trained by\nmixing data from the two tokamaks, the supervised CORAL model can enhance the\ndisruption prediction performance for future tokamaks (AUC value from 0.764 to\n0.890). Through interpretable analysis, we discovered that using the supervised\nCORAL enables the transformation of data distribution to be more similar to\nfuture tokamak. An assessment method for evaluating whether a model has learned\na trend of similar features is designed based on SHAP analysis. It demonstrates\nthat the supervised CORAL model exhibits more similarities to the model trained\non large data sizes of EAST. FTDP provides a light, interpretable, and\nfew-data-required way by aligning features to predict disruption using small\ndata sizes from the future tokamak.\n","authors":["Chengshuo Shen","Wei Zheng","Bihao Guo","Dalong Chen","Xinkun Ai","Fengming Xue","Yu Zhong","Nengchao Wang","Biao Shen","Binjia Xiao","Yonghua Ding","Zhongyong Chen","Yuan Pan","J-TEXT team"],"pdf_url":"https://arxiv.org/pdf/2309.05361v1.pdf","comment":"15 pages, 9 figures"},{"id":"http://arxiv.org/abs/2309.05357v1","updated":"2023-09-11T10:07:51Z","published":"2023-09-11T10:07:51Z","title":"EDAC: Efficient Deployment of Audio Classification Models For COVID-19\n Detection","summary":" The global spread of COVID-19 had severe consequences for public health and\nthe world economy. The quick onset of the pandemic highlighted the potential\nbenefits of cheap and deployable pre-screening methods to monitor the\nprevalence of the disease in a population. Various researchers made use of\nmachine learning methods in an attempt to detect COVID-19. 
The solutions\nleverage various input features, such as CT scans or cough audio signals, with\nstate-of-the-art results arising from deep neural network architectures.\nHowever, larger models require more compute; a pertinent consideration when\ndeploying to the edge. To address this, we first recreated two models that use\ncough audio recordings to detect COVID-19. Through applying network pruning and\nquantisation, we were able to compress these two architectures without reducing\nthe model's predictive performance. Specifically, we were able to achieve an\n105.76x and an 19.34x reduction in the compressed model file size with\ncorresponding 1.37x and 1.71x reductions in the inference times of the two\nmodels.\n","authors":["Andrej Jovanović","Mario Mihaly","Lennon Donaldson"],"pdf_url":"https://arxiv.org/pdf/2309.05357v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.05352v1","updated":"2023-09-11T09:53:28Z","published":"2023-09-11T09:53:28Z","title":"Neural Discovery of Permutation Subgroups","summary":" We consider the problem of discovering subgroup $H$ of permutation group\n$S_{n}$. Unlike the traditional $H$-invariant networks wherein $H$ is assumed\nto be known, we present a method to discover the underlying subgroup, given\nthat it satisfies certain conditions. Our results show that one could discover\nany subgroup of type $S_{k} (k \\leq n)$ by learning an $S_{n}$-invariant\nfunction and a linear transformation. We also prove similar results for cyclic\nand dihedral subgroups. Finally, we provide a general theorem that can be\nextended to discover other subgroups of $S_{n}$. We also demonstrate the\napplicability of our results through numerical experiments on image-digit sum\nand symmetric polynomial regression tasks.\n","authors":["Pavan Karjol","Rohan Kashyap","Prathosh A P"],"pdf_url":"https://arxiv.org/pdf/2309.05352v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.05346v1","updated":"2023-09-11T09:45:22Z","published":"2023-09-11T09:45:22Z","title":"Learning Geometric Representations of Objects via Interaction","summary":" We address the problem of learning representations from observations of a\nscene involving an agent and an external object the agent interacts with. To\nthis end, we propose a representation learning framework extracting the\nlocation in physical space of both the agent and the object from unstructured\nobservations of arbitrary nature. Our framework relies on the actions performed\nby the agent as the only source of supervision, while assuming that the object\nis displaced by the agent via unknown dynamics. We provide a theoretical\nfoundation and formally prove that an ideal learner is guaranteed to infer an\nisometric representation, disentangling the agent from the object and correctly\nextracting their locations. We evaluate empirically our framework on a variety\nof scenarios, showing that it outperforms vision-based approaches such as a\nstate-of-the-art keypoint extractor. 
We moreover demonstrate how the extracted\nrepresentations enable the agent to solve downstream tasks via reinforcement\nlearning in an efficient manner.\n","authors":["Alfredo Reichlin","Giovanni Luca Marchetti","Hang Yin","Anastasiia Varava","Danica Kragic"],"pdf_url":"https://arxiv.org/pdf/2309.05346v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.05343v1","updated":"2023-09-11T09:43:59Z","published":"2023-09-11T09:43:59Z","title":"A DRL-based Reflection Enhancement Method for RIS-assisted\n Multi-receiver Communications","summary":" In reconfigurable intelligent surface (RIS)-assisted wireless communication\nsystems, the pointing accuracy and intensity of reflections depend crucially on\nthe 'profile,' representing the amplitude/phase state information of all\nelements in a RIS array. The superposition of multiple single-reflection\nprofiles enables multi-reflection for distributed users. However, the\noptimization challenges from periodic element arrangements in single-reflection\nand multi-reflection profiles are understudied. The combination of periodical\nsingle-reflection profiles leads to amplitude/phase counteractions, affecting\nthe performance of each reflection beam. This paper focuses on a\ndual-reflection optimization scenario and investigates the far-field\nperformance deterioration caused by the misalignment of overlapped profiles. To\naddress this issue, we introduce a novel deep reinforcement learning\n(DRL)-based optimization method. Comparative experiments against random and\nexhaustive searches demonstrate that our proposed DRL method outperforms both\nalternatives, achieving the shortest optimization time. Remarkably, our\napproach achieves a 1.2 dB gain in the reflection peak gain and a broader beam\nwithout any hardware modifications.\n","authors":["Wei Wang","Peizheng Li","Angela Doufexi","Mark A Beach"],"pdf_url":"https://arxiv.org/pdf/2309.05343v1.pdf","comment":"6 pages, 6 figures. This paper has been accepted for presentation at\n the VTC2023-Fall"},{"id":"http://arxiv.org/abs/2011.05001v8","updated":"2023-09-11T09:42:27Z","published":"2020-11-10T09:32:50Z","title":"MMD-Regularized Unbalanced Optimal Transport","summary":" We study the unbalanced optimal transport (UOT) problem, where the marginal\nconstraints are enforced using Maximum Mean Discrepancy (MMD) regularization.\nOur work is motivated by the observation that the literature on UOT is focused\non regularization based on $\\phi$-divergence (e.g., KL divergence). Despite the\npopularity of MMD, its role as a regularizer in the context of UOT seems less\nunderstood. We begin by deriving a specific dual of MMD-regularized UOT\n(MMD-UOT), which helps us prove several useful properties. One interesting\noutcome of this duality result is that MMD-UOT induces novel metrics, which not\nonly lift the ground metric like the Wasserstein but are also sample-wise\nefficient to estimate like the MMD. Further, for real-world applications\ninvolving non-discrete measures, we present an estimator for the transport plan\nthat is supported only on the given ($m$) samples. Under mild conditions, we\nprove that the estimation error with this finitely-supported transport plan is\nalso $\\mathcal{O}(1/\\sqrt{m})$. As far as we know, such error bounds that are\nfree from the curse of dimensionality are not known for $\\phi$-divergence\nregularized UOT. Finally, we discuss how the proposed estimator can be computed\nefficiently using accelerated gradient descent. 
Our experiments show that\nMMD-UOT consistently outperforms popular baselines, including KL-regularized\nUOT and MMD, in diverse machine learning applications.\n","authors":["Piyushi Manupriya","J. Saketha Nath","Pratik Jawanpuria"],"pdf_url":"https://arxiv.org/pdf/2011.05001v8.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.05339v1","updated":"2023-09-11T09:35:51Z","published":"2023-09-11T09:35:51Z","title":"PAg-NeRF: Towards fast and efficient end-to-end panoptic 3D\n representations for agricultural robotics","summary":" Precise scene understanding is key for most robot monitoring and intervention\ntasks in agriculture. In this work we present PAg-NeRF which is a novel\nNeRF-based system that enables 3D panoptic scene understanding. Our\nrepresentation is trained using an image sequence with noisy robot odometry\nposes and automatic panoptic predictions with inconsistent IDs between frames.\nDespite this noisy input, our system is able to output scene geometry,\nphoto-realistic renders and 3D consistent panoptic representations with\nconsistent instance IDs. We evaluate this novel system in a very challenging\nhorticultural scenario and in doing so demonstrate an end-to-end trainable\nsystem that can make use of noisy robot poses rather than precise poses that\nhave to be pre-calculated. Compared to a baseline approach the peak signal to\nnoise ratio is improved from 21.34dB to 23.37dB while the panoptic quality\nimproves from 56.65% to 70.08%. Furthermore, our approach is faster and can be\ntuned to improve inference time by more than a factor of 2 while being memory\nefficient with approximately 12 times fewer parameters.\n","authors":["Claus Smitt","Michael Halstead","Patrick Zimmer","Thomas Läbe","Esra Guclu","Cyrill Stachniss","Chris McCool"],"pdf_url":"https://arxiv.org/pdf/2309.05339v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.05337v1","updated":"2023-09-11T09:34:44Z","published":"2023-09-11T09:34:44Z","title":"Stochastic Gradient Descent-like relaxation is equivalent to Glauber\n dynamics in discrete optimization and inference problems","summary":" Is Stochastic Gradient Descent (SGD) substantially different from Glauber\ndynamics? This is a fundamental question at the time of understanding the most\nused training algorithm in the field of Machine Learning, but it received no\nanswer until now. Here we show that in discrete optimization and inference\nproblems, the dynamics of an SGD-like algorithm resemble very closely that of\nMetropolis Monte Carlo with a properly chosen temperature, which depends on the\nmini-batch size. This quantitative matching holds both at equilibrium and in\nthe out-of-equilibrium regime, despite the two algorithms having fundamental\ndifferences (e.g.\\ SGD does not satisfy detailed balance). Such equivalence\nallows us to use results about performances and limits of Monte Carlo\nalgorithms to optimize the mini-batch size in the SGD-like algorithm and make\nit efficient at recovering the signal in hard inference problems.\n","authors":["Maria Chiara Angelini","Angelo Giorgio Cavaliere","Raffaele Marino","Federico Ricci-Tersenghi"],"pdf_url":"https://arxiv.org/pdf/2309.05337v1.pdf","comment":"14 pages, 5 figures"},{"id":"http://arxiv.org/abs/2305.19958v2","updated":"2023-09-11T09:14:49Z","published":"2023-05-31T15:44:55Z","title":"Analysing high resolution digital Mars images using machine learning","summary":" The search for ephemeral liquid water on Mars is an ongoing activity. 
After\nthe recession of the seasonal polar ice cap on Mars, small water ice patches\nmay be left behind in shady places due to the low thermal conductivity of the\nMartian surface and atmosphere. During late spring and early summer, these\npatches may be exposed to direct sunlight and warm up rapidly enough for the\nliquid phase to emerge. To see the spatial and temporal occurrence of such ice\npatches, optical images should be searched for and checked. Previously a manual\nimage analysis was conducted on 110 images from the southern hemisphere,\ncaptured by the High Resolution Imaging Science Experiment (HiRISE) camera\nonboard the Mars Reconnaissance Orbiter space mission. Out of these, 37 images\nwere identified with smaller ice patches, which were distinguishable by their\nbrightness, colour and strong connection to local topographic shading. In this\nstudy, a convolutional neural network (CNN) is applied to find further images\nwith potential water ice patches in the latitude band between -40{\\deg} and\n-60{\\deg}, where the seasonal retreat of the polar ice cap happens. Previously\nanalysed HiRISE images were used to train the model, where each image was split\ninto hundreds of pieces (chunks), expanding the training dataset to 6240\nimages. A test run conducted on 38 new HiRISE images indicates that the program\ncan generally recognise small bright patches, however further training might be\nneeded for more precise identification. This further training has been\nconducted now, incorporating the results of the previous test run. To retrain\nthe model, 18646 chunks were analysed and 48 additional epochs were ran. In the\nend the model produced a 94% accuracy in recognising ice, 58% of these images\nshowed small enough ice patches on them. The rest of the images was covered by\ntoo much ice or showed CO2 ice sublimation in some places.\n","authors":["Mira Gergácz","Ákos Kereszturi"],"pdf_url":"https://arxiv.org/pdf/2305.19958v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.05317v1","updated":"2023-09-11T09:04:36Z","published":"2023-09-11T09:04:36Z","title":"Neural Koopman prior for data assimilation","summary":" With the increasing availability of large scale datasets, computational power\nand tools like automatic differentiation and expressive neural network\narchitectures, sequential data are now often treated in a data-driven way, with\na dynamical model trained from the observation data. While neural networks are\noften seen as uninterpretable black-box architectures, they can still benefit\nfrom physical priors on the data and from mathematical knowledge. In this\npaper, we use a neural network architecture which leverages the long-known\nKoopman operator theory to embed dynamical systems in latent spaces where their\ndynamics can be described linearly, enabling a number of appealing features. We\nintroduce methods that enable to train such a model for long-term continuous\nreconstruction, even in difficult contexts where the data comes in\nirregularly-sampled time series. 
The potential for self-supervised learning is\nalso demonstrated, as we show the promising use of trained dynamical models as\npriors for variational data assimilation techniques, with applications to e.g.\ntime series interpolation and forecasting.\n","authors":["Anthony Frion","Lucas Drumetz","Mauro Dalla Mura","Guillaume Tochon","Abdeldjalil Aïssa El Bey"],"pdf_url":"https://arxiv.org/pdf/2309.05317v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.05305v1","updated":"2023-09-11T08:44:07Z","published":"2023-09-11T08:44:07Z","title":"Fully-Connected Spatial-Temporal Graph for Multivariate Time Series Data","summary":" Multivariate Time-Series (MTS) data is crucial in various application fields.\nWith its sequential and multi-source (multiple sensors) properties, MTS data\ninherently exhibits Spatial-Temporal (ST) dependencies, involving temporal\ncorrelations between timestamps and spatial correlations between sensors in\neach timestamp. To effectively leverage this information, Graph Neural\nNetwork-based methods (GNNs) have been widely adopted. However, existing\napproaches separately capture spatial dependency and temporal dependency and\nfail to capture the correlations between Different sEnsors at Different\nTimestamps (DEDT). Overlooking such correlations hinders the comprehensive\nmodelling of ST dependencies within MTS data, thus restricting existing GNNs\nfrom learning effective representations. To address this limitation, we propose\na novel method called Fully-Connected Spatial-Temporal Graph Neural Network\n(FC-STGNN), including two key components namely FC graph construction and FC\ngraph convolution. For graph construction, we design a decay graph to connect\nsensors across all timestamps based on their temporal distances, enabling us to\nfully model the ST dependencies by considering the correlations between DEDT.\nFurther, we devise FC graph convolution with a moving-pooling GNN layer to\neffectively capture the ST dependencies for learning effective representations.\nExtensive experiments show the effectiveness of FC-STGNN on multiple MTS\ndatasets compared to SOTA methods.\n","authors":["Yucheng Wang","Yuecong Xu","Jianfei Yang","Min Wu","Xiaoli Li","Lihua Xie","Zhenghua Chen"],"pdf_url":"https://arxiv.org/pdf/2309.05305v1.pdf","comment":"9 pages, 8 figures"},{"id":"http://arxiv.org/abs/2308.13566v2","updated":"2023-09-11T08:28:40Z","published":"2023-08-25T01:41:04Z","title":"MLLM-DataEngine: An Iterative Refinement Approach for MLLM","summary":" Despite the great advance of Multimodal Large Language Models (MLLMs) in both\ninstruction dataset building and benchmarking, the independence of training and\nevaluation makes current MLLMs hard to further improve their capability under\nthe guidance of evaluation results with a relatively low human cost. In this\npaper, we propose MLLM-DataEngine, a novel closed-loop system that bridges data\ngeneration, model training, and evaluation. Within each loop iteration, the\nMLLM-DataEngine first analyze the weakness of the model based on the evaluation\nresults, then generate a proper incremental dataset for the next training\niteration and enhance the model capability iteratively. 
Compared with previous\ndata collection methods which are separate from the benchmarking, the data\ngenerated by MLLM-DataEngine shows better targeting, quality, and correctness.\nFor targeting, we propose an Adaptive Bad-case Sampling module, which adjusts\nthe ratio of different types of data within each incremental dataset based on\nthe benchmarking results. For quality, we resort to GPT-4 to generate\nhigh-quality data with each given data type. For correctness, prompt design is\ncritical for the data generation results. Rather than previous hand-crafted\nprompt, we propose an Interactive Prompt Optimization strategy, which optimizes\nthe prompt with the multi-round interaction between human and GPT, and improve\nthe correctness of generated data greatly. Through extensive experiments, we\nfind our MLLM-DataEngine could boost the MLLM capability in a targeted and\nautomatic manner, with only a few human participation. We hope it could be a\ngeneral solution for the following MLLMs building. The MLLM-DataEngine has been\nopen-sourced and is now available at\nhttps://github.com/opendatalab/MLLM-DataEngine.\n","authors":["Zhiyuan Zhao","Linke Ouyang","Bin Wang","Siyuan Huang","Pan Zhang","Xiaoyi Dong","Jiaqi Wang","Conghui He"],"pdf_url":"https://arxiv.org/pdf/2308.13566v2.pdf","comment":"Code and models are available at\n https://github.com/opendatalab/MLLM-DataEngine"},{"id":"http://arxiv.org/abs/2309.05295v1","updated":"2023-09-11T08:26:08Z","published":"2023-09-11T08:26:08Z","title":"Discrete Denoising Diffusion Approach to Integer Factorization","summary":" Integer factorization is a famous computational problem unknown whether being\nsolvable in the polynomial time. With the rise of deep neural networks, it is\ninteresting whether they can facilitate faster factorization. We present an\napproach to factorization utilizing deep neural networks and discrete denoising\ndiffusion that works by iteratively correcting errors in a partially-correct\nsolution. To this end, we develop a new seq2seq neural network architecture,\nemploy relaxed categorical distribution and adapt the reverse diffusion process\nto cope better with inaccuracies in the denoising step. The approach is able to\nfind factors for integers of up to 56 bits long. Our analysis indicates that\ninvestment in training leads to an exponential decrease of sampling steps\nrequired at inference to achieve a given success rate, thus counteracting an\nexponential run-time increase depending on the bit-length.\n","authors":["Karlis Freivalds","Emils Ozolins","Guntis Barzdins"],"pdf_url":"https://arxiv.org/pdf/2309.05295v1.pdf","comment":"International Conference on Artificial Neural Networks ICANN 2023"},{"id":"http://arxiv.org/abs/2309.05292v1","updated":"2023-09-11T08:21:42Z","published":"2023-09-11T08:21:42Z","title":"The fine print on tempered posteriors","summary":" We conduct a detailed investigation of tempered posteriors and uncover a\nnumber of crucial and previously undiscussed points. Contrary to previous\nresults, we first show that for realistic models and datasets and the tightly\ncontrolled case of the Laplace approximation to the posterior, stochasticity\ndoes not in general improve test accuracy. The coldest temperature is often\noptimal. One might think that Bayesian models with some stochasticity can at\nleast obtain improvements in terms of calibration. However, we show empirically\nthat when gains are obtained this comes at the cost of degradation in test\naccuracy. 
We then discuss how targeting Frequentist metrics using Bayesian\nmodels provides a simple explanation of the need for a temperature parameter\n$\\lambda$ in the optimization objective. Contrary to prior works, we finally\nshow through a PAC-Bayesian analysis that the temperature $\\lambda$ cannot be\nseen as simply fixing a misspecified prior or likelihood.\n","authors":["Konstantinos Pitas","Julyan Arbel"],"pdf_url":"https://arxiv.org/pdf/2309.05292v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2302.00911v3","updated":"2023-09-11T07:41:52Z","published":"2023-02-02T06:59:15Z","title":"Conditional expectation with regularization for missing data imputation","summary":" Missing data frequently occurs in datasets across various domains, such as\nmedicine, sports, and finance. In many cases, to enable proper and reliable\nanalyses of such data, the missing values are often imputed, and it is\nnecessary that the method used has a low root mean square error (RMSE) between\nthe imputed and the true values. In addition, for some critical applications,\nit is also often a requirement that the imputation method is scalable and the\nlogic behind the imputation is explainable, which is especially difficult for\ncomplex methods that are, for example, based on deep learning. Based on these\nconsiderations, we propose a new algorithm named \"conditional\nDistribution-based Imputation of Missing Values with Regularization\" (DIMV).\nDIMV operates by determining the conditional distribution of a feature that has\nmissing entries, using the information from the fully observed features as a\nbasis. As will be illustrated via experiments in the paper, DIMV (i) gives a\nlow RMSE for the imputed values compared to state-of-the-art methods; (ii) fast\nand scalable; (iii) is explainable as coefficients in a regression model,\nallowing reliable and trustable analysis, makes it a suitable choice for\ncritical domains where understanding is important such as in medical fields,\nfinance, etc; (iv) can provide an approximated confidence region for the\nmissing values in a given sample; (v) suitable for both small and large scale\ndata; (vi) in many scenarios, does not require a huge number of parameters as\ndeep learning approaches; (vii) handle multicollinearity in imputation\neffectively; and (viii) is robust to the normally distributed assumption that\nits theoretical grounds rely on.\n","authors":["Mai Anh Vu","Thu Nguyen","Tu T. Do","Nhan Phan","Nitesh V. Chawla","Pål Halvorsen","Michael A. Riegler","Binh T. Nguyen"],"pdf_url":"https://arxiv.org/pdf/2302.00911v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.05282v1","updated":"2023-09-11T07:37:10Z","published":"2023-09-11T07:37:10Z","title":"Can you text what is happening? Integrating pre-trained language\n encoders into trajectory prediction models for autonomous driving","summary":" In autonomous driving tasks, scene understanding is the first step towards\npredicting the future behavior of the surrounding traffic participants. Yet,\nhow to represent a given scene and extract its features are still open research\nquestions. In this study, we propose a novel text-based representation of\ntraffic scenes and process it with a pre-trained language encoder.\n First, we show that text-based representations, combined with classical\nrasterized image representations, lead to descriptive scene embeddings. Second,\nwe benchmark our predictions on the nuScenes dataset and show significant\nimprovements compared to baselines. 
Third, we show in an ablation study that a\njoint encoder of text and rasterized images outperforms the individual encoders\nconfirming that both representations have their complementary strengths.\n","authors":["Ali Keysan","Andreas Look","Eitan Kosman","Gonca Gürsun","Jörg Wagner","Yao Yu","Barbara Rakitsch"],"pdf_url":"https://arxiv.org/pdf/2309.05282v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.05281v1","updated":"2023-09-11T07:36:16Z","published":"2023-09-11T07:36:16Z","title":"Class-Incremental Grouping Network for Continual Audio-Visual Learning","summary":" Continual learning is a challenging problem in which models need to be\ntrained on non-stationary data across sequential tasks for class-incremental\nlearning. While previous methods have focused on using either regularization or\nrehearsal-based frameworks to alleviate catastrophic forgetting in image\nclassification, they are limited to a single modality and cannot learn compact\nclass-aware cross-modal representations for continual audio-visual learning. To\naddress this gap, we propose a novel class-incremental grouping network (CIGN)\nthat can learn category-wise semantic features to achieve continual\naudio-visual learning. Our CIGN leverages learnable audio-visual class tokens\nand audio-visual grouping to continually aggregate class-aware features.\nAdditionally, it utilizes class tokens distillation and continual grouping to\nprevent forgetting parameters learned from previous tasks, thereby improving\nthe model's ability to capture discriminative audio-visual categories. We\nconduct extensive experiments on VGGSound-Instruments, VGGSound-100, and\nVGG-Sound Sources benchmarks. Our experimental results demonstrate that the\nCIGN achieves state-of-the-art audio-visual class-incremental learning\nperformance. Code is available at https://github.com/stoneMo/CIGN.\n","authors":["Shentong Mo","Weiguo Pian","Yapeng Tian"],"pdf_url":"https://arxiv.org/pdf/2309.05281v1.pdf","comment":"ICCV 2023. arXiv admin note: text overlap with arXiv:2303.17056"},{"id":"http://arxiv.org/abs/2210.06171v2","updated":"2023-09-11T07:27:05Z","published":"2022-10-11T03:47:14Z","title":"Learning to Optimize Quasi-Newton Methods","summary":" Fast gradient-based optimization algorithms have become increasingly\nessential for the computationally efficient training of machine learning\nmodels. One technique is to multiply the gradient by a preconditioner matrix to\nproduce a step, but it is unclear what the best preconditioner matrix is. This\npaper introduces a novel machine learning optimizer called LODO, which tries to\nonline meta-learn the best preconditioner during optimization. Specifically,\nour optimizer merges Learning to Optimize (L2O) techniques with quasi-Newton\nmethods to learn preconditioners parameterized as neural networks; they are\nmore flexible than preconditioners in other quasi-Newton methods. Unlike other\nL2O methods, LODO does not require any meta-training on a training task\ndistribution, and instead learns to optimize on the fly while optimizing on the\ntest task, adapting to the local characteristics of the loss landscape while\ntraversing it. Theoretically, we show that our optimizer approximates the\ninverse Hessian in noisy loss landscapes and is capable of representing a wide\nrange of inverse Hessians. We experimentally verify that our algorithm can\noptimize in noisy settings, and show that simpler alternatives for representing\nthe inverse Hessians worsen performance. 
Lastly, we use our optimizer to train\na semi-realistic deep neural network with 95k parameters at speeds comparable\nto those of standard neural network optimizers.\n","authors":["Isaac Liao","Rumen R. Dangovski","Jakob N. Foerster","Marin Soljačić"],"pdf_url":"https://arxiv.org/pdf/2210.06171v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.10404v2","updated":"2023-09-11T07:22:19Z","published":"2023-07-19T18:19:18Z","title":"Interpreting and Correcting Medical Image Classification with PIP-Net","summary":" Part-prototype models are explainable-by-design image classifiers, and a\npromising alternative to black box AI. This paper explores the applicability\nand potential of interpretable machine learning, in particular PIP-Net, for\nautomated diagnosis support on real-world medical imaging data. PIP-Net learns\nhuman-understandable prototypical image parts and we evaluate its accuracy and\ninterpretability for fracture detection and skin cancer diagnosis. We find that\nPIP-Net's decision making process is in line with medical classification\nstandards, while only provided with image-level class labels. Because of\nPIP-Net's unsupervised pretraining of prototypes, data quality problems such as\nundesired text in an X-ray or labelling errors can be easily identified.\nAdditionally, we are the first to show that humans can manually correct the\nreasoning of PIP-Net by directly disabling undesired prototypes. We conclude\nthat part-prototype models are promising for medical applications due to their\ninterpretability and potential for advanced model debugging.\n","authors":["Meike Nauta","Johannes H. Hegeman","Jeroen Geerdink","Jörg Schlötterer","Maurice van Keulen","Christin Seifert"],"pdf_url":"https://arxiv.org/pdf/2307.10404v2.pdf","comment":"Accepted to the International Workshop on Explainable and\n Interpretable Machine Learning (XI-ML), co-located with ECAI 2023"},{"id":"http://arxiv.org/abs/2309.05276v1","updated":"2023-09-11T07:21:57Z","published":"2023-09-11T07:21:57Z","title":"Beamforming in Wireless Coded-Caching Systems","summary":" Increased capacity in the access network poses capacity challenges on the\ntransport network due to the aggregated traffic. However, there are spatial and\ntime correlation in the user data demands that could potentially be utilized.\nTo that end, we investigate a wireless transport network architecture that\nintegrates beamforming and coded-caching strategies. Especially, our proposed\ndesign entails a server with multiple antennas that broadcasts content to cache\nnodes responsible for serving users. Traditional caching methods face the\nlimitation of relying on the individual memory with additional overhead. Hence,\nwe develop an efficient genetic algorithm-based scheme for beam optimization in\nthe coded-caching system. By exploiting the advantages of beamforming and\ncoded-caching, the architecture achieves gains in terms of multicast\nopportunities, interference mitigation, and reduced peak backhaul traffic. A\ncomparative analysis of this joint design with traditional, un-coded caching\nschemes is also conducted to assess the benefits of the proposed approach.\nAdditionally, we examine the impact of various buffering and decoding methods\non the performance of the coded-caching scheme. 
Our findings suggest that\nproper beamforming is useful in enhancing the effectiveness of the\ncoded-caching technique, resulting in significant reduction in peak backhaul\ntraffic.\n","authors":["Sneha Madhusudan","Charitha Madapatha","Behrooz Makki","Hao Guo","Tommy Svensson"],"pdf_url":"https://arxiv.org/pdf/2309.05276v1.pdf","comment":"Submitted to IEEE Future Networks World Forum, 2023"},{"id":"http://arxiv.org/abs/2206.00979v3","updated":"2023-09-11T07:08:52Z","published":"2022-06-02T10:50:46Z","title":"Multi-scale Wasserstein Shortest-path Filtration Kernels on Graphs","summary":" The traditional shortest-path graph kernel (SP) is one of the most popular\ngraph kernels. It decomposes graphs into shortest paths and computes their\nfrequencies in each graph. However, SP has two main challenges: Firstly, the\ntriplet representation of the shortest path loses information. Secondly, SP\ncompares graphs without considering the multiple different scales of the graph\nstructure which is common in real-world graphs, e.g., the chain-, ring-, and\nstar-structures in social networks. To overcome these two challenges, we\ndevelop a novel shortest-path graph kernel called the Multi-scale Wasserstein\nShortest-Path Filtration graph kernel (MWSPF). It uses a BFS tree of a certain\ndepth rooted at each vertex to restrict the maximum length of the shortest path\nconsidering the small world property. It considers the labels of all the\nvertices in the shortest path. To facilitate the comparison of graphs at\nmultiple different scales, it augments graphs from both the aspects of the\nvertex and the graph structure. The distribution (frequency) of the shortest\npath changes across augmented graphs and the Wasserstein distance is employed\nto track the changes. We conduct experiments on various benchmark graph\ndatasets to evaluate MWSPF's performance. MWSPF is superior to the\nstate-of-the-art on most datasets.\n","authors":["Wei Ye","Hao Tian","Qijun Chen"],"pdf_url":"https://arxiv.org/pdf/2206.00979v3.pdf","comment":"10 pages"},{"id":"http://arxiv.org/abs/2309.05270v1","updated":"2023-09-11T07:02:13Z","published":"2023-09-11T07:02:13Z","title":"CONFLATOR: Incorporating Switching Point based Rotatory Positional\n Encodings for Code-Mixed Language Modeling","summary":" The mixing of two or more languages is called Code-Mixing (CM). CM is a\nsocial norm in multilingual societies. Neural Language Models (NLMs) like\ntransformers have been very effective on many NLP tasks. However, NLM for CM is\nan under-explored area. Though transformers are capable and powerful, they\ncannot always encode positional/sequential information since they are\nnon-recurrent. Therefore, to enrich word information and incorporate positional\ninformation, positional encoding is defined. We hypothesize that Switching\nPoints (SPs), i.e., junctions in the text where the language switches (L1 -> L2\nor L2-> L1), pose a challenge for CM Language Models (LMs), and hence give\nspecial emphasis to switching points in the modeling process. We experiment\nwith several positional encoding mechanisms and show that rotatory positional\nencodings along with switching point information yield the best results.\n We introduce CONFLATOR: a neural language modeling approach for code-mixed\nlanguages. CONFLATOR tries to learn to emphasize switching points using smarter\npositional encoding, both at unigram and bigram levels. 
CONFLATOR outperforms\nthe state-of-the-art on two tasks based on code-mixed Hindi and English\n(Hinglish): (i) sentiment analysis and (ii) machine translation.\n","authors":["Mohsin Ali","Kandukuri Sai Teja","Neeharika Gupta","Parth Patwa","Anubhab Chatterjee","Vinija Jain","Aman Chadha","Amitava Das"],"pdf_url":"https://arxiv.org/pdf/2309.05270v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.05269v1","updated":"2023-09-11T06:56:42Z","published":"2023-09-11T06:56:42Z","title":"UniKG: A Benchmark and Universal Embedding for Large-Scale Knowledge\n Graphs","summary":" Irregular data in real-world are usually organized as heterogeneous graphs\n(HGs) consisting of multiple types of nodes and edges. To explore useful\nknowledge from real-world data, both the large-scale encyclopedic HG datasets\nand corresponding effective learning methods are crucial, but haven't been well\ninvestigated. In this paper, we construct a large-scale HG benchmark dataset\nnamed UniKG from Wikidata to facilitate knowledge mining and heterogeneous\ngraph representation learning. Overall, UniKG contains more than 77 million\nmulti-attribute entities and 2000 diverse association types, which\nsignificantly surpasses the scale of existing HG datasets. To perform effective\nlearning on the large-scale UniKG, two key measures are taken, including (i)\nthe semantic alignment strategy for multi-attribute entities, which projects\nthe feature description of multi-attribute nodes into a common embedding space\nto facilitate node aggregation in a large receptive field; (ii) proposing a\nnovel plug-and-play anisotropy propagation module (APM) to learn effective\nmulti-hop anisotropy propagation kernels, which extends methods of large-scale\nhomogeneous graphs to heterogeneous graphs. These two strategies enable\nefficient information propagation among a tremendous number of multi-attribute\nentities and meantimes adaptively mine multi-attribute association through the\nmulti-hop aggregation in large-scale HGs. We set up a node classification task\non our UniKG dataset, and evaluate multiple baseline methods which are\nconstructed by embedding our APM into large-scale homogenous graph learning\nmethods. Our UniKG dataset and the baseline codes have been released at\nhttps://github.com/Yide-Qiu/UniKG.\n","authors":["Yide Qiu","Shaoxiang Ling","Tong Zhang","Bo Huang","Zhen Cui"],"pdf_url":"https://arxiv.org/pdf/2309.05269v1.pdf","comment":"9 pages, 4 figures"},{"id":"http://arxiv.org/abs/2309.05260v1","updated":"2023-09-11T06:34:46Z","published":"2023-09-11T06:34:46Z","title":"Generalized Graphon Process: Convergence of Graph Frequencies in\n Stretched Cut Distance","summary":" Graphons have traditionally served as limit objects for dense graph\nsequences, with the cut distance serving as the metric for convergence.\nHowever, sparse graph sequences converge to the trivial graphon under the\nconventional definition of cut distance, which make this framework inadequate\nfor many practical applications. In this paper, we utilize the concepts of\ngeneralized graphons and stretched cut distance to describe the convergence of\nsparse graph sequences. Specifically, we consider a random graph process\ngenerated from a generalized graphon. This random graph process converges to\nthe generalized graphon in stretched cut distance. We use this random graph\nprocess to model the growing sparse graph, and prove the convergence of the\nadjacency matrices' eigenvalues. We supplement our findings with experimental\nvalidation. 
Our results indicate the possibility of transfer learning between\nsparse graphs.\n","authors":["Xingchao Jian","Feng Ji","Wee Peng Tay"],"pdf_url":"https://arxiv.org/pdf/2309.05260v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.05259v1","updated":"2023-09-11T06:31:45Z","published":"2023-09-11T06:31:45Z","title":"A physics-informed and attention-based graph learning approach for\n regional electric vehicle charging demand prediction","summary":" Along with the proliferation of electric vehicles (EVs), optimizing the use\nof EV charging space can significantly alleviate the growing load on\nintelligent transportation systems. As the foundation to achieve such an\noptimization, a spatiotemporal method for EV charging demand prediction in\nurban areas is required. Although several solutions have been proposed by using\ndata-driven deep learning methods, it can be found that these\nperformance-oriented methods may suffer from misinterpretations to correctly\nhandle the reverse relationship between charging demands and prices. To tackle\nthe emerging challenges of training an accurate and interpretable prediction\nmodel, this paper proposes a novel approach that enables the integration of\ngraph and temporal attention mechanisms for feature extraction and the usage of\nphysic-informed meta-learning in the model pre-training step for knowledge\ntransfer. Evaluation results on a dataset of 18,013 EV charging piles in\nShenzhen, China, show that the proposed approach, named PAG, can achieve\nstate-of-the-art forecasting performance and the ability in understanding the\nadaptive changes in charging demands caused by price fluctuations.\n","authors":["Haohao Qu","Haoxuan Kuang","Jun Li","Linlin You"],"pdf_url":"https://arxiv.org/pdf/2309.05259v1.pdf","comment":"Preprint. This work has been submitted to the IEEE Transactions on\n ITS for possible publication. Copyright may be transferred without notice,\n after which this version may no longer be accessible"},{"id":"http://arxiv.org/abs/2309.05256v1","updated":"2023-09-11T06:26:57Z","published":"2023-09-11T06:26:57Z","title":"Examining the Effect of Pre-training on Time Series Classification","summary":" Although the pre-training followed by fine-tuning paradigm is used\nextensively in many fields, there is still some controversy surrounding the\nimpact of pre-training on the fine-tuning process. Currently, experimental\nfindings based on text and image data lack consensus. To delve deeper into the\nunsupervised pre-training followed by fine-tuning paradigm, we have extended\nprevious research to a new modality: time series. In this study, we conducted a\nthorough examination of 150 classification datasets derived from the Univariate\nTime Series (UTS) and Multivariate Time Series (MTS) benchmarks. Our analysis\nreveals several key conclusions. (i) Pre-training can only help improve the\noptimization process for models that fit the data poorly, rather than those\nthat fit the data well. (ii) Pre-training does not exhibit the effect of\nregularization when given sufficient training time. (iii) Pre-training can only\nspeed up convergence if the model has sufficient ability to fit the data. (iv)\nAdding more pre-training data does not improve generalization, but it can\nstrengthen the advantage of pre-training on the original data volume, such as\nfaster convergence. 
(v) While both the pre-training task and the model\nstructure determine the effectiveness of the paradigm on a given dataset, the\nmodel structure plays a more significant role.\n","authors":["Jiashu Pu","Shiwei Zhao","Ling Cheng","Yongzhu Chang","Runze Wu","Tangjie Lv","Rongsheng Zhang"],"pdf_url":"https://arxiv.org/pdf/2309.05256v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.05253v1","updated":"2023-09-11T06:06:31Z","published":"2023-09-11T06:06:31Z","title":"A quantum tug of war between randomness and symmetries on homogeneous\n spaces","summary":" We explore the interplay between symmetry and randomness in quantum\ninformation. Adopting a geometric approach, we consider states as\n$H$-equivalent if related by a symmetry transformation characterized by the\ngroup $H$. We then introduce the Haar measure on the homogeneous space\n$\\mathbb{U}/H$, characterizing true randomness for $H$-equivalent systems.\nWhile this mathematical machinery is well-studied by mathematicians, it has\nseen limited application in quantum information: we believe our work to be the\nfirst instance of utilizing homogeneous spaces to characterize symmetry in\nquantum information. This is followed by a discussion of approximations of true\nrandomness, commencing with $t$-wise independent approximations and defining\n$t$-designs on $\\mathbb{U}/H$ and $H$-equivalent states. Transitioning further,\nwe explore pseudorandomness, defining pseudorandom unitaries and states within\nhomogeneous spaces. Finally, as a practical demonstration of our findings, we\nstudy the expressibility of quantum machine learning ansatze in homogeneous\nspaces. Our work provides a fresh perspective on the relationship between\nrandomness and symmetry in the quantum world.\n","authors":["Rahul Arvind","Kishor Bharti","Jun Yong Khoo","Dax Enshan Koh","Jian Feng Kong"],"pdf_url":"https://arxiv.org/pdf/2309.05253v1.pdf","comment":"9 + 1 pages, 3 figures"},{"id":"http://arxiv.org/abs/2308.09262v2","updated":"2023-09-11T06:04:58Z","published":"2023-08-18T02:36:21Z","title":"Multi-Task Pseudo-Label Learning for Non-Intrusive Speech Quality\n Assessment Model","summary":" This study proposes a multi-task pseudo-label learning (MPL)-based\nnon-intrusive speech quality assessment model called MTQ-Net. MPL consists of\ntwo stages: obtaining pseudo-label scores from a pretrained model and\nperforming multi-task learning. The 3QUEST metrics, namely Speech-MOS (S-MOS),\nNoise-MOS (N-MOS), and General-MOS (G-MOS), are the assessment targets. The\npretrained MOSA-Net model is utilized to estimate three pseudo labels:\nperceptual evaluation of speech quality (PESQ), short-time objective\nintelligibility (STOI), and speech distortion index (SDI). Multi-task learning\nis then employed to train MTQ-Net by combining a supervised loss (derived from\nthe difference between the estimated score and the ground-truth label) and a\nsemi-supervised loss (derived from the difference between the estimated score\nand the pseudo label), where the Huber loss is employed as the loss function.\nExperimental results first demonstrate the advantages of MPL compared to\ntraining a model from scratch and using a direct knowledge transfer mechanism.\nSecond, the benefit of the Huber loss for improving the predictive ability of\nMTQ-Net is verified. Finally, the MTQ-Net with the MPL approach exhibits higher\noverall predictive power compared to other SSL-based speech assessment models.\n","authors":["Ryandhimas E. 
Zezario","Bo-Ren Brian Bai","Chiou-Shann Fuh","Hsin-Min Wang","Yu Tsao"],"pdf_url":"https://arxiv.org/pdf/2308.09262v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.13032v2","updated":"2023-09-11T05:39:05Z","published":"2023-08-24T18:58:10Z","title":"Financial News Analytics Using Fine-Tuned Llama 2 GPT Model","summary":" The paper considers the possibility to fine-tune Llama 2 GPT large language\nmodel (LLM) for the multitask analysis of financial news. For fine-tuning, the\nPEFT/LoRA based approach was used. In the study, the model was fine-tuned for\nthe following tasks: analysing a text from financial market perspectives,\nhighlighting main points of a text, summarizing a text and extracting named\nentities with appropriate sentiments. The obtained results show that the\nfine-tuned Llama 2 model can perform a multitask financial news analysis with a\nspecified structure of response, part of response can be a structured text and\nanother part of data can have JSON format for further processing. Extracted\nsentiments for named entities can be considered as predictive features in\nsupervised machine learning models with quantitative target variables.\n","authors":["Bohdan M. Pavlyshenko"],"pdf_url":"https://arxiv.org/pdf/2308.13032v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.14129v2","updated":"2023-09-11T05:30:49Z","published":"2023-08-27T15:11:44Z","title":"SPEED: Streaming Partition and Parallel Acceleration for Temporal\n Interaction Graph Embedding","summary":" Temporal Interaction Graphs (TIGs) are widely employed to model intricate\nreal-world systems such as financial systems and social networks. To capture\nthe dynamism and interdependencies of nodes, existing TIG embedding models need\nto process edges sequentially and chronologically. However, this requirement\nprevents it from being processed in parallel and struggle to accommodate\nburgeoning data volumes to GPU. Consequently, many large-scale temporal\ninteraction graphs are confined to CPU processing. Furthermore, a generalized\nGPU scaling and acceleration approach remains unavailable. To facilitate\nlarge-scale TIGs' implementation on GPUs for acceleration, we introduce a novel\ntraining approach namely Streaming Edge Partitioning and Parallel Acceleration\nfor Temporal Interaction Graph Embedding (SPEED). The SPEED is comprised of a\nStreaming Edge Partitioning Component (SEP) which addresses space overhead\nissue by assigning fewer nodes to each GPU, and a Parallel Acceleration\nComponent (PAC) which enables simultaneous training of different sub-graphs,\naddressing time overhead issue. Our method can achieve a good balance in\ncomputing resources, computing time, and downstream task performance. Empirical\nvalidation across 7 real-world datasets demonstrates the potential to expedite\ntraining speeds by a factor of up to 19.29x. Simultaneously, resource\nconsumption of a single-GPU can be diminished by up to 69%, thus enabling the\nmultiple GPU-based training and acceleration encompassing millions of nodes and\nbillions of edges. 
Furthermore, our approach also maintains its competitiveness\nin downstream tasks.\n","authors":["Xi Chen","Yongxiang Liao","Yun Xiong","Yao Zhang","Siwei Zhang","Jiawei Zhang","Yiheng Sun"],"pdf_url":"https://arxiv.org/pdf/2308.14129v2.pdf","comment":"13 pages, 8 figures"},{"id":"http://arxiv.org/abs/2205.08187v2","updated":"2023-09-11T05:07:11Z","published":"2022-05-17T09:14:32Z","title":"Deep neural networks with dependent weights: Gaussian Process mixture\n limit, heavy tails, sparsity and compressibility","summary":" This article studies the infinite-width limit of deep feedforward neural\nnetworks whose weights are dependent, and modelled via a mixture of Gaussian\ndistributions. Each hidden node of the network is assigned a nonnegative random\nvariable that controls the variance of the outgoing weights of that node. We\nmake minimal assumptions on these per-node random variables: they are iid and\ntheir sum, in each layer, converges to some finite random variable in the\ninfinite-width limit. Under this model, we show that each layer of the\ninfinite-width neural network can be characterised by two simple quantities: a\nnon-negative scalar parameter and a L\\'evy measure on the positive reals. If\nthe scalar parameters are strictly positive and the L\\'evy measures are trivial\nat all hidden layers, then one recovers the classical Gaussian process (GP)\nlimit, obtained with iid Gaussian weights. More interestingly, if the L\\'evy\nmeasure of at least one layer is non-trivial, we obtain a mixture of Gaussian\nprocesses (MoGP) in the large-width limit. The behaviour of the neural network\nin this regime is very different from the GP regime. One obtains correlated\noutputs, with non-Gaussian distributions, possibly with heavy tails.\nAdditionally, we show that, in this regime, the weights are compressible, and\nsome nodes have asymptotically non-negligible contributions, therefore\nrepresenting important hidden features. Many sparsity-promoting neural network\nmodels can be recast as special cases of our approach, and we discuss their\ninfinite-width limits; we also present an asymptotic analysis of the pruning\nerror. We illustrate some of the benefits of the MoGP regime over the GP regime\nin terms of representation learning and compressibility on simulated, MNIST and\nFashion MNIST datasets.\n","authors":["Hoil Lee","Fadhel Ayed","Paul Jung","Juho Lee","Hongseok Yang","François Caron"],"pdf_url":"https://arxiv.org/pdf/2205.08187v2.pdf","comment":"96 pages, 15 figures, 9 tables"},{"id":"http://arxiv.org/abs/2308.00994v2","updated":"2023-09-11T05:06:38Z","published":"2023-08-02T07:59:25Z","title":"SYNAuG: Exploiting Synthetic Data for Data Imbalance Problems","summary":" We live in an era of data floods, and deep neural networks play a pivotal\nrole in this moment. Natural data inherently exhibits several challenges such\nas long-tailed distribution and model fairness, where data imbalance is at the\ncenter of fundamental issues. This imbalance poses a risk of deep neural\nnetworks producing biased predictions, leading to potentially severe ethical\nand social problems. To address these problems, we leverage the recent\ngenerative models advanced in generating high-quality images. In this work, we\npropose SYNAuG, which utilizes synthetic data to uniformize the given imbalance\ndistribution followed by a simple post-calibration step considering the domain\ngap between real and synthetic data. 
This straightforward approach yields\nimpressive performance on datasets for distinctive data imbalance problems such\nas CIFAR100-LT, ImageNet100-LT, UTKFace, and Waterbirds, surpassing the\nperformance of existing task-specific methods. While we do not claim that our\napproach serves as a complete solution to the problem of data imbalance, we\nargue that supplementing the existing data with synthetic data proves to be an\neffective and crucial step in addressing data imbalance concerns.\n","authors":["Moon Ye-Bin","Nam Hyeon-Woo","Wonseok Choi","Nayeong Kim","Suha Kwak","Tae-Hyun Oh"],"pdf_url":"https://arxiv.org/pdf/2308.00994v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2210.05918v2","updated":"2023-09-11T04:22:30Z","published":"2022-10-12T04:37:54Z","title":"Finite time analysis of temporal difference learning with linear\n function approximation: Tail averaging and regularisation","summary":" We study the finite-time behaviour of the popular temporal difference (TD)\nlearning algorithm when combined with tail-averaging. We derive finite time\nbounds on the parameter error of the tail-averaged TD iterate under a step-size\nchoice that does not require information about the eigenvalues of the matrix\nunderlying the projected TD fixed point. Our analysis shows that tail-averaged\nTD converges at the optimal $O\\left(1/t\\right)$ rate, both in expectation and\nwith high probability. In addition, our bounds exhibit a sharper rate of decay\nfor the initial error (bias), which is an improvement over averaging all\niterates. We also propose and analyse a variant of TD that incorporates\nregularisation. From analysis, we conclude that the regularised version of TD\nis useful for problems with ill-conditioned features.\n","authors":["Gandharv Patil","Prashanth L. A.","Dheeraj Nagaraj","Doina Precup"],"pdf_url":"https://arxiv.org/pdf/2210.05918v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.13570v5","updated":"2023-09-11T04:10:09Z","published":"2023-08-25T05:52:41Z","title":"Stochastic Configuration Machines for Industrial Artificial Intelligence","summary":" Real-time predictive modelling with desired accuracy is highly expected in\nindustrial artificial intelligence (IAI), where neural networks play a key\nrole. Neural networks in IAI require powerful, high-performance computing\ndevices to operate a large number of floating point data. Based on stochastic\nconfiguration networks (SCNs), this paper proposes a new randomized learner\nmodel, termed stochastic configuration machines (SCMs), to stress effective\nmodelling and data size saving that are useful and valuable for industrial\napplications. Compared to SCNs and random vector functional-link (RVFL) nets\nwith binarized implementation, the model storage of SCMs can be significantly\ncompressed while retaining favourable prediction performance. Besides the\narchitecture of the SCM learner model and its learning algorithm, as an\nimportant part of this contribution, we also provide a theoretical basis on the\nlearning capacity of SCMs by analysing the model's complexity. Experimental\nstudies are carried out over some benchmark datasets and three industrial\napplications. The results demonstrate that SCM has great potential for dealing\nwith industrial data analytics.\n","authors":["Dianhui Wang","Matthew J. 
Felicetti"],"pdf_url":"https://arxiv.org/pdf/2308.13570v5.pdf","comment":"23 pages, 7 figures, 12 tables"},{"id":"http://arxiv.org/abs/2309.05224v1","updated":"2023-09-11T04:03:43Z","published":"2023-09-11T04:03:43Z","title":"SparseSwin: Swin Transformer with Sparse Transformer Block","summary":" Advancements in computer vision research have put transformer architecture as\nthe state of the art in computer vision tasks. One of the known drawbacks of\nthe transformer architecture is the high number of parameters, this can lead to\na more complex and inefficient algorithm. This paper aims to reduce the number\nof parameters and in turn, made the transformer more efficient. We present\nSparse Transformer (SparTa) Block, a modified transformer block with an\naddition of a sparse token converter that reduces the number of tokens used. We\nuse the SparTa Block inside the Swin T architecture (SparseSwin) to leverage\nSwin capability to downsample its input and reduce the number of initial tokens\nto be calculated. The proposed SparseSwin model outperforms other state of the\nart models in image classification with an accuracy of 86.96%, 97.43%, and\n85.35% on the ImageNet100, CIFAR10, and CIFAR100 datasets respectively. Despite\nits fewer parameters, the result highlights the potential of a transformer\narchitecture using a sparse token converter with a limited number of tokens to\noptimize the use of the transformer and improve its performance.\n","authors":["Krisna Pinasthika","Blessius Sheldo Putra Laksono","Riyandi Banovbi Putera Irsal","Syifa Hukma Shabiyya","Novanto Yudistira"],"pdf_url":"https://arxiv.org/pdf/2309.05224v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.05213v1","updated":"2023-09-11T03:17:45Z","published":"2023-09-11T03:17:45Z","title":"Towards Federated Learning Under Resource Constraints via Layer-wise\n Training and Depth Dropout","summary":" Large machine learning models trained on diverse data have recently seen\nunprecedented success. Federated learning enables training on private data that\nmay otherwise be inaccessible, such as domain-specific datasets decentralized\nacross many clients. However, federated learning can be difficult to scale to\nlarge models when clients have limited resources. This challenge often results\nin a trade-off between model size and access to diverse data. To mitigate this\nissue and facilitate training of large models on edge devices, we introduce a\nsimple yet effective strategy, Federated Layer-wise Learning, to simultaneously\nreduce per-client memory, computation, and communication costs. Clients train\njust a single layer each round, reducing resource costs considerably with\nminimal performance degradation. We also introduce Federated Depth Dropout, a\ncomplementary technique that randomly drops frozen layers during training, to\nfurther reduce resource usage. Coupling these two techniques enables us to\neffectively train significantly larger models on edge devices. Specifically, we\nreduce training memory usage by 5x or more in federated self-supervised\nrepresentation learning and demonstrate that performance in downstream tasks is\ncomparable to conventional federated self-supervised learning.\n","authors":["Pengfei Guo","Warren Richard Morningstar","Raviteja Vemulapalli","Karan Singhal","Vishal M. 
Patel","Philip Andrew Mansfield"],"pdf_url":"https://arxiv.org/pdf/2309.05213v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.05202v1","updated":"2023-09-11T02:35:22Z","published":"2023-09-11T02:35:22Z","title":"Graph Contextual Contrasting for Multivariate Time Series Classification","summary":" Contrastive learning, as a self-supervised learning paradigm, becomes popular\nfor Multivariate Time-Series (MTS) classification. It ensures the consistency\nacross different views of unlabeled samples and then learns effective\nrepresentations for these samples. Existing contrastive learning methods mainly\nfocus on achieving temporal consistency with temporal augmentation and\ncontrasting techniques, aiming to preserve temporal patterns against\nperturbations for MTS data. However, they overlook spatial consistency that\nrequires the stability of individual sensors and their correlations. As MTS\ndata typically originate from multiple sensors, ensuring spatial consistency\nbecomes essential for the overall performance of contrastive learning on MTS\ndata. Thus, we propose Graph Contextual Contrasting (GCC) for spatial\nconsistency across MTS data. Specifically, we propose graph augmentations\nincluding node and edge augmentations to preserve the stability of sensors and\ntheir correlations, followed by graph contrasting with both node- and\ngraph-level contrasting to extract robust sensor- and global-level features. We\nfurther introduce multi-window temporal contrasting to ensure temporal\nconsistency in the data for each sensor. Extensive experiments demonstrate that\nour proposed GCC achieves state-of-the-art performance on various MTS\nclassification tasks.\n","authors":["Yucheng Wang","Yuecong Xu","Jianfei Yang","Min Wu","Xiaoli Li","Lihua Xie","Zhenghua Chen"],"pdf_url":"https://arxiv.org/pdf/2309.05202v1.pdf","comment":"9 pages, 5 figures"},{"id":"http://arxiv.org/abs/2309.05200v1","updated":"2023-09-11T02:30:06Z","published":"2023-09-11T02:30:06Z","title":"CARE: Confidence-rich Autonomous Robot Exploration using Bayesian Kernel\n Inference and Optimization","summary":" In this paper, we consider improving the efficiency of information-based\nautonomous robot exploration in unknown and complex environments. We first\nutilize Gaussian process (GP) regression to learn a surrogate model to infer\nthe confidence-rich mutual information (CRMI) of querying control actions, then\nadopt an objective function consisting of predicted CRMI values and prediction\nuncertainties to conduct Bayesian optimization (BO), i.e., GP-based BO (GPBO).\nThe trade-off between the best action with the highest CRMI value\n(exploitation) and the action with high prediction variance (exploration) can\nbe realized. To further improve the efficiency of GPBO, we propose a novel\nlightweight information gain inference method based on Bayesian kernel\ninference and optimization (BKIO), achieving an approximate logarithmic\ncomplexity without the need for training. BKIO can also infer the CRMI and\ngenerate the best action using BO with bounded cumulative regret, which ensures\nits comparable accuracy to GPBO with much higher efficiency. Extensive\nnumerical and real-world experiments show the desired efficiency of our\nproposed methods without losing exploration performance in different\nunstructured, cluttered environments. 
We also provide our open-source\nimplementation code at https://github.com/Shepherd-Gregory/BKIO-Exploration.\n","authors":["Yang Xu","Ronghao Zheng","Senlin Zhang","Meiqin Liu","Shoudong Huang"],"pdf_url":"https://arxiv.org/pdf/2309.05200v1.pdf","comment":"Full version for the paper accepted by IEEE Robotics and Automation\n Letters (RA-L) 2023. arXiv admin note: text overlap with arXiv:2301.00523"},{"id":"http://arxiv.org/abs/2309.05196v1","updated":"2023-09-11T02:16:47Z","published":"2023-09-11T02:16:47Z","title":"Does Writing with Language Models Reduce Content Diversity?","summary":" Large language models (LLMs) have led to a surge in collaborative writing\nwith model assistance. As different users incorporate suggestions from the same\nmodel, there is a risk of decreased diversity in the produced content,\npotentially limiting diverse perspectives in public discourse. In this work, we\nmeasure the impact of co-writing on diversity via a controlled experiment,\nwhere users write argumentative essays in three setups -- using a base LLM\n(GPT3), a feedback-tuned LLM (InstructGPT), and writing without model help. We\ndevelop a set of diversity metrics and find that writing with InstructGPT (but\nnot the GPT3) results in a statistically significant reduction in diversity.\nSpecifically, it increases the similarity between the writings of different\nauthors and reduces the overall lexical and content diversity. We additionally\nfind that this effect is mainly attributable to InstructGPT contributing less\ndiverse text to co-written essays. In contrast, the user-contributed text\nremains unaffected by model collaboration. This suggests that the recent\nimprovement in generation quality from adapting models to human feedback might\ncome at the cost of more homogeneous and less diverse content.\n","authors":["Vishakh Padmakumar","He He"],"pdf_url":"https://arxiv.org/pdf/2309.05196v1.pdf","comment":"Preprint"},{"id":"http://arxiv.org/abs/2107.08195v5","updated":"2023-09-11T01:54:39Z","published":"2021-07-17T06:55:28Z","title":"Complexity-Optimized Sparse Bayesian Learning for Scalable\n Classification Tasks","summary":" Sparse Bayesian Learning (SBL) constructs an extremely sparse probabilistic\nmodel with very competitive generalization. However, SBL needs to invert a big\ncovariance matrix with complexity $O(M^3)$ (M: feature size) for updating the\nregularization priors, making it difficult for problems with high dimensional\nfeature space or large data size. As it may easily suffer from the memory\noverflow issue in such problems. This paper addresses this issue with a newly\nproposed diagonal Quasi-Newton (DQN) method for SBL called DQN-SBL where the\ninversion of big covariance matrix is ignored so that the complexity is reduced\nto $O(M)$. The DQN-SBL is thoroughly evaluated for non linear and linear\nclassifications with various benchmarks of different sizes. 
Experimental\nresults verify that DQN-SBL receives competitive generalization with a very\nsparse model and scales well to large-scale problems.\n","authors":["Jiahua Luo","Chi-Man Wong","Chi-Man Vong"],"pdf_url":"https://arxiv.org/pdf/2107.08195v5.pdf","comment":"12 pages,5 figures"},{"id":"http://arxiv.org/abs/2306.08853v2","updated":"2023-09-11T01:27:14Z","published":"2023-06-15T04:42:25Z","title":"In Search of netUnicorn: A Data-Collection Platform to Develop\n Generalizable ML Models for Network Security Problems","summary":" The remarkable success of the use of machine learning-based solutions for\nnetwork security problems has been impeded by the developed ML models'\ninability to maintain efficacy when used in different network environments\nexhibiting different network behaviors. This issue is commonly referred to as\nthe generalizability problem of ML models. The community has recognized the\ncritical role that training datasets play in this context and has developed\nvarious techniques to improve dataset curation to overcome this problem.\nUnfortunately, these methods are generally ill-suited or even counterproductive\nin the network security domain, where they often result in unrealistic or\npoor-quality datasets.\n To address this issue, we propose an augmented ML pipeline that leverages\nexplainable ML tools to guide the network data collection in an iterative\nfashion. To ensure the data's realism and quality, we require that the new\ndatasets should be endogenously collected in this iterative process, thus\nadvocating for a gradual removal of data-related problems to improve model\ngeneralizability. To realize this capability, we develop a data-collection\nplatform, netUnicorn, that takes inspiration from the classic \"hourglass\" model\nand is implemented as its \"thin waist\" to simplify data collection for\ndifferent learning problems from diverse network environments. The proposed\nsystem decouples data-collection intents from the deployment mechanisms and\ndisaggregates these high-level intents into smaller reusable, self-contained\ntasks.\n We demonstrate how netUnicorn simplifies collecting data for different\nlearning problems from multiple network environments and how the proposed\niterative data collection improves a model's generalizability.\n","authors":["Roman Beltiukov","Wenbo Guo","Arpit Gupta","Walter Willinger"],"pdf_url":"https://arxiv.org/pdf/2306.08853v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.05183v1","updated":"2023-09-11T01:00:10Z","published":"2023-09-11T01:00:10Z","title":"Data Summarization beyond Monotonicity: Non-monotone Two-Stage\n Submodular Maximization","summary":" The objective of a two-stage submodular maximization problem is to reduce the\nground set using provided training functions that are submodular, with the aim\nof ensuring that optimizing new objective functions over the reduced ground set\nyields results comparable to those obtained over the original ground set. This\nproblem has applications in various domains including data summarization.\nExisting studies often assume the monotonicity of the objective function,\nwhereas our work pioneers the extension of this research to accommodate\nnon-monotone submodular functions. 
We have introduced the first constant-factor\napproximation algorithms for this more general case.\n","authors":["Shaojie Tang"],"pdf_url":"https://arxiv.org/pdf/2309.05183v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.13451v2","updated":"2023-09-11T01:00:03Z","published":"2023-08-25T15:53:30Z","title":"Gotta match 'em all: Solution diversification in graph matching matched\n filters","summary":" We present a novel approach for finding multiple noisily embedded template\ngraphs in a very large background graph. Our method builds upon the\ngraph-matching-matched-filter technique proposed in Sussman et al., with the\ndiscovery of multiple diverse matchings being achieved by iteratively\npenalizing a suitable node-pair similarity matrix in the matched filter\nalgorithm. In addition, we propose algorithmic speed-ups that greatly enhance\nthe scalability of our matched-filter approach. We present theoretical\njustification of our methodology in the setting of correlated Erdos-Renyi\ngraphs, showing its ability to sequentially discover multiple templates under\nmild model conditions. We additionally demonstrate our method's utility via\nextensive experiments both using simulated models and real-world dataset,\ninclude human brain connectomes and a large transactional knowledge base.\n","authors":["Zhirui Li","Ben Johnson","Daniel L. Sussman","Carey E. Priebe","Vince Lyzinski"],"pdf_url":"https://arxiv.org/pdf/2308.13451v2.pdf","comment":"36 pages, 12 figures, 1 table"},{"id":"http://arxiv.org/abs/2309.05173v1","updated":"2023-09-11T00:02:05Z","published":"2023-09-11T00:02:05Z","title":"DePT: Decomposed Prompt Tuning for Parameter-Efficient Fine-tuning","summary":" Prompt tuning (PT), where a small amount of trainable soft (continuous)\nprompt vectors is affixed to the input of language models (LM), has shown\npromising results across various tasks and models for parameter-efficient\nfine-tuning (PEFT). PT stands out from other PEFT approaches because it\nmaintains competitive performance with fewer trainable parameters and does not\ndrastically scale up its parameters as the model size expands. However, PT\nintroduces additional soft prompt tokens, leading to longer input sequences,\nwhich significantly impacts training and inference time and memory usage due to\nthe Transformer's quadratic complexity. Particularly concerning for Large\nLanguage Models (LLMs) that face heavy daily querying. To address this issue,\nwe propose Decomposed Prompt Tuning (DePT), which decomposes the soft prompt\ninto a shorter soft prompt and a pair of low-rank matrices that are then\noptimised with two different learning rates. This allows DePT to achieve better\nperformance while saving over 20% memory and time costs compared to vanilla PT\nand its variants, without changing trainable parameter sizes. Through extensive\nexperiments on 23 natural language processing (NLP) and vision-language (VL)\ntasks, we demonstrate that DePT outperforms state-of-the-art PEFT approaches,\nincluding the full fine-tuning baseline in some scenarios. 
Additionally, we\nempirically show that DEPT grows more efficient as the model size increases.\nOur further study reveals that DePT integrates seamlessly with\nparameter-efficient transfer learning in the few-shot learning setting and\nhighlights its adaptability to various model architectures and sizes.\n","authors":["Zhengxiang Shi","Aldo Lipani"],"pdf_url":"https://arxiv.org/pdf/2309.05173v1.pdf","comment":"Code is available at https://github.com/ZhengxiangShi/DePT"},{"id":"http://arxiv.org/abs/2304.01395v2","updated":"2023-09-11T00:00:06Z","published":"2023-04-03T22:06:49Z","title":"Learning Personalized Models with Clustered System Identification","summary":" We address the problem of learning linear system models from observing\nmultiple trajectories from different system dynamics. This framework\nencompasses a collaborative scenario where several systems seeking to estimate\ntheir dynamics are partitioned into clusters according to their system\nsimilarity. Thus, the systems within the same cluster can benefit from the\nobservations made by the others. Considering this framework, we present an\nalgorithm where each system alternately estimates its cluster identity and\nperforms an estimation of its dynamics. This is then aggregated to update the\nmodel of each cluster. We show that under mild assumptions, our algorithm\ncorrectly estimates the cluster identities and achieves an approximate sample\ncomplexity that scales inversely with the number of systems in the cluster,\nthus facilitating a more efficient and personalized system identification\nprocess.\n","authors":["Leonardo F. Toso","Han Wang","James Anderson"],"pdf_url":"https://arxiv.org/pdf/2304.01395v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.05878v1","updated":"2023-09-11T23:59:18Z","published":"2023-09-11T23:59:18Z","title":"Reaction coordinate flows for model reduction of molecular kinetics","summary":" In this work, we introduce a flow based machine learning approach, called\nreaction coordinate (RC) flow, for discovery of low-dimensional kinetic models\nof molecular systems. The RC flow utilizes a normalizing flow to design the\ncoordinate transformation and a Brownian dynamics model to approximate the\nkinetics of RC, where all model parameters can be estimated in a data-driven\nmanner. In contrast to existing model reduction methods for molecular kinetics,\nRC flow offers a trainable and tractable model of reduced kinetics in\ncontinuous time and space due to the invertibility of the normalizing flow.\nFurthermore, the Brownian dynamics-based reduced kinetic model investigated in\nthis work yields a readily discernible representation of metastable states\nwithin the phase space of the molecular system. Numerical experiments\ndemonstrate how effectively the proposed method discovers interpretable and\naccurate low-dimensional representations of given full-state kinetics from\nsimulations.\n","authors":["Hao Wu","Frank Noé"],"pdf_url":"https://arxiv.org/pdf/2309.05878v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2209.00905v3","updated":"2023-09-11T23:45:41Z","published":"2022-09-02T09:27:37Z","title":"From latent dynamics to meaningful representations","summary":" While representation learning has been central to the rise of machine\nlearning and artificial intelligence, a key problem remains in making the\nlearnt representations meaningful. 
For this the typical approach is to\nregularize the learned representation through prior probability distributions.\nHowever such priors are usually unavailable or ad hoc. To deal with this, we\npropose a dynamics-constrained representation learning framework. Instead of\nusing predefined probabilities, we restrict the latent representation to follow\nspecific dynamics, which is a more natural constraint for representation\nlearning in dynamical systems. Our belief stems from a fundamental observation\nin physics that though different systems can have different marginalized\nprobability distributions, they typically obey the same dynamics, such as\nNewton's and Schrodinger's equations. We validate our framework for different\nsystems including a real-world fluorescent DNA movie dataset. We show that our\nalgorithm can uniquely identify an uncorrelated, isometric and meaningful\nlatent representation.\n","authors":["Dedi Wang","Yihang Wang","Luke Evans","Pratyush Tiwary"],"pdf_url":"https://arxiv.org/pdf/2209.00905v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2305.13409v3","updated":"2023-09-11T23:24:51Z","published":"2023-05-22T18:49:52Z","title":"Efficient Learning of Quantum States Prepared With Few Non-Clifford\n Gates","summary":" We give an algorithm that efficiently learns a quantum state prepared by\nClifford gates and $O(\\log(n))$ non-Clifford gates. Specifically, for an\n$n$-qubit state $\\lvert \\psi \\rangle$ prepared with at most $t$ non-Clifford\ngates, we show that $\\mathsf{poly}(n,2^t,1/\\epsilon)$ time and copies of\n$\\lvert \\psi \\rangle$ suffice to learn $\\lvert \\psi \\rangle$ to trace distance\nat most $\\epsilon$. This result follows as a special case of an algorithm for\nlearning states with large stabilizer dimension, where a quantum state has\nstabilizer dimension $k$ if it is stabilized by an abelian group of $2^k$ Pauli\noperators. We also develop an efficient property testing algorithm for\nstabilizer dimension, which may be of independent interest.\n","authors":["Sabee Grewal","Vishnu Iyer","William Kretschmer","Daniel Liang"],"pdf_url":"https://arxiv.org/pdf/2305.13409v3.pdf","comment":"25 pages. V3: Fixed typos"},{"id":"http://arxiv.org/abs/2309.05865v1","updated":"2023-09-11T23:08:03Z","published":"2023-09-11T23:08:03Z","title":"Force-directed graph embedding with hops distance","summary":" Graph embedding has become an increasingly important technique for analyzing\ngraph-structured data. By representing nodes in a graph as vectors in a\nlow-dimensional space, graph embedding enables efficient graph processing and\nanalysis tasks like node classification, link prediction, and visualization. In\nthis paper, we propose a novel force-directed graph embedding method that\nutilizes the steady acceleration kinetic formula to embed nodes in a way that\npreserves graph topology and structural features. Our method simulates a set of\ncustomized attractive and repulsive forces between all node pairs with respect\nto their hop distance. These forces are then used in Newton's second law to\nobtain the acceleration of each node. The method is intuitive, parallelizable,\nand highly scalable. 
We evaluate our method on several graph analysis tasks and\nshow that it achieves competitive performance compared to state-of-the-art\nunsupervised embedding techniques.\n","authors":["Hamidreza Lotfalizadeh","Mohammad Al Hasan"],"pdf_url":"https://arxiv.org/pdf/2309.05865v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.05863v1","updated":"2023-09-11T23:02:56Z","published":"2023-09-11T23:02:56Z","title":"The bionic neural network for external simulation of human locomotor\n system","summary":" Muscle forces and joint kinematics estimated with musculoskeletal (MSK)\nmodeling techniques offer useful metrics describing movement quality.\nModel-based computational MSK models can interpret the dynamic interaction\nbetween the neural drive to muscles, muscle dynamics, body and joint\nkinematics, and kinetics. Still, such a set of solutions suffers from high\ncomputational time and muscle recruitment problems, especially in complex\nmodeling. In recent years, data-driven methods have emerged as a promising\nalternative due to the benefits of flexibility and adaptability. However, a\nlarge amount of labeled training data is not easy to be acquired. This paper\nproposes a physics-informed deep learning method based on MSK modeling to\npredict joint motion and muscle forces. The MSK model is embedded into the\nneural network as an ordinary differential equation (ODE) loss function with\nphysiological parameters of muscle activation dynamics and muscle contraction\ndynamics to be identified. These parameters are automatically estimated during\nthe training process which guides the prediction of muscle forces combined with\nthe MSK forward dynamics model. Experimental validations on two groups of data,\nincluding one benchmark dataset and one self-collected dataset from six healthy\nsubjects, are performed. The results demonstrate that the proposed deep\nlearning method can effectively identify subject-specific MSK physiological\nparameters and the trained physics-informed forward-dynamics surrogate yields\naccurate motion and muscle forces predictions.\n","authors":["Yue Shi","Shuhao Ma","Yihui Zhao"],"pdf_url":"https://arxiv.org/pdf/2309.05863v1.pdf","comment":"10"},{"id":"http://arxiv.org/abs/2308.09437v2","updated":"2023-09-11T22:49:08Z","published":"2023-08-18T10:07:46Z","title":"From Hope to Safety: Unlearning Biases of Deep Models by Enforcing the\n Right Reasons in Latent Space","summary":" Deep Neural Networks are prone to learning spurious correlations embedded in\nthe training data, leading to potentially biased predictions. This poses risks\nwhen deploying these models for high-stake decision-making, such as in medical\napplications. Current methods for post-hoc model correction either require\ninput-level annotations, which are only possible for spatially localized\nbiases, or augment the latent feature space, thereby hoping to enforce the\nright reasons. We present a novel method ensuring the right reasons on the\nconcept level by reducing the model's sensitivity towards biases through the\ngradient. When modeling biases via Concept Activation Vectors, we highlight the\nimportance of choosing robust directions, as traditional regression-based\napproaches such as Support Vector Machines tend to result in diverging\ndirections. We effectively mitigate biases in controlled and real-world\nsettings on the ISIC, Bone Age, ImageNet and CelebA datasets using VGG, ResNet\nand EfficientNet architectures.\n","authors":["Maximilian Dreyer","Frederik Pahde","Christopher J. 
Anders","Wojciech Samek","Sebastian Lapuschkin"],"pdf_url":"https://arxiv.org/pdf/2308.09437v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.05858v1","updated":"2023-09-11T22:42:50Z","published":"2023-09-11T22:42:50Z","title":"Uncovering mesa-optimization algorithms in Transformers","summary":" Transformers have become the dominant model in deep learning, but the reason\nfor their superior performance is poorly understood. Here, we hypothesize that\nthe strong performance of Transformers stems from an architectural bias towards\nmesa-optimization, a learned process running within the forward pass of a model\nconsisting of the following two steps: (i) the construction of an internal\nlearning objective, and (ii) its corresponding solution found through\noptimization. To test this hypothesis, we reverse-engineer a series of\nautoregressive Transformers trained on simple sequence modeling tasks,\nuncovering underlying gradient-based mesa-optimization algorithms driving the\ngeneration of predictions. Moreover, we show that the learned forward-pass\noptimization algorithm can be immediately repurposed to solve supervised\nfew-shot tasks, suggesting that mesa-optimization might underlie the in-context\nlearning capabilities of large language models. Finally, we propose a novel\nself-attention layer, the mesa-layer, that explicitly and efficiently solves\noptimization problems specified in context. We find that this layer can lead to\nimproved performance in synthetic and preliminary language modeling\nexperiments, adding weight to our hypothesis that mesa-optimization is an\nimportant operation hidden within the weights of trained Transformers.\n","authors":["Johannes von Oswald","Eyvind Niklasson","Maximilian Schlegel","Seijin Kobayashi","Nicolas Zucchet","Nino Scherrer","Nolan Miller","Mark Sandler","Blaise Agüera y Arcas","Max Vladymyrov","Razvan Pascanu","João Sacramento"],"pdf_url":"https://arxiv.org/pdf/2309.05858v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.05855v1","updated":"2023-09-11T22:34:06Z","published":"2023-09-11T22:34:06Z","title":"Energy Preservation and Stability of Random Filterbanks","summary":" What makes waveform-based deep learning so hard? Despite numerous attempts at\ntraining convolutional neural networks (convnets) for filterbank design, they\noften fail to outperform hand-crafted baselines. This is all the more\nsurprising because these baselines are linear time-invariant systems: as such,\ntheir transfer functions could be accurately represented by a convnet with a\nlarge receptive field. In this article, we elaborate on the statistical\nproperties of simple convnets from the mathematical perspective of random\nconvolutional operators. 
We find that FIR filterbanks with random Gaussian\nweights are ill-conditioned for large filters and locally periodic input\nsignals, which both are typical in audio signal processing applications.\nFurthermore, we observe that expected energy preservation of a random\nfilterbank is not sufficient for numerical stability and derive theoretical\nbounds for its expected frame bounds.\n","authors":["Daniel Haider","Vincent Lostanlen","Martin Ehler","Peter Balazs"],"pdf_url":"https://arxiv.org/pdf/2309.05855v1.pdf","comment":"4 pages, 5 figures, 1 page appendix"},{"id":"http://arxiv.org/abs/2309.05853v1","updated":"2023-09-11T22:28:36Z","published":"2023-09-11T22:28:36Z","title":"ChemSpaceAL: An Efficient Active Learning Methodology Applied to\n Protein-Specific Molecular Generation","summary":" The incredible capabilities of generative artificial intelligence models have\ninevitably led to their application in the domain of drug discovery. It is\ntherefore of tremendous interest to develop methodologies that enhance the\nabilities and applicability of these powerful tools. In this work, we present a\nnovel and efficient semi-supervised active learning methodology that allows for\nthe fine-tuning of a generative model with respect to an objective function by\nstrategically operating within a constructed representation of the sample\nspace. In the context of targeted molecular generation, we demonstrate the\nability to fine-tune a GPT-based molecular generator with respect to an\nattractive interaction-based scoring function by strategically operating within\na chemical space proxy, thereby maximizing attractive interactions between the\ngenerated molecules and a protein target. Importantly, our approach does not\nrequire the individual evaluation of all data points that are used for\nfine-tuning, enabling the incorporation of computationally expensive metrics.\nWe are hopeful that the inherent generality of this methodology ensures that it\nwill remain applicable as this exciting field evolves. To facilitate\nimplementation and reproducibility, we have made all of our software available\nthrough the open-source ChemSpaceAL Python package.\n","authors":["Gregory W. Kyro","Anton Morgunov","Rafael I. Brent","Victor S. Batista"],"pdf_url":"https://arxiv.org/pdf/2309.05853v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.05845v1","updated":"2023-09-11T22:08:09Z","published":"2023-09-11T22:08:09Z","title":"Effective Abnormal Activity Detection on Multivariate Time Series\n Healthcare Data","summary":" Multivariate time series (MTS) data collected from multiple sensors provide\nthe potential for accurate abnormal activity detection in smart healthcare\nscenarios. However, anomalies exhibit diverse patterns and become unnoticeable\nin MTS data. Consequently, achieving accurate anomaly detection is challenging\nsince we have to capture both temporal dependencies of time series and\ninter-relationships among variables. To address this problem, we propose a\nResidual-based Anomaly Detection approach, Rs-AD, for effective representation\nlearning and abnormal activity detection. 
We evaluate our scheme on a\nreal-world gait dataset and the experimental results demonstrate an F1 score of\n0.839.\n","authors":["Mengjia Niu","Yuchen Zhao","Hamed Haddadi"],"pdf_url":"https://arxiv.org/pdf/2309.05845v1.pdf","comment":"Poster accepted by the 29th Annual International Conference On Mobile\n Computing And Networking (ACM MobiCom 2023)"},{"id":"http://arxiv.org/abs/2309.05843v1","updated":"2023-09-11T22:03:34Z","published":"2023-09-11T22:03:34Z","title":"Optimizing Audio Augmentations for Contrastive Learning of\n Health-Related Acoustic Signals","summary":" Health-related acoustic signals, such as cough and breathing sounds, are\nrelevant for medical diagnosis and continuous health monitoring. Most existing\nmachine learning approaches for health acoustics are trained and evaluated on\nspecific tasks, limiting their generalizability across various healthcare\napplications. In this paper, we leverage a self-supervised learning framework,\nSimCLR with a Slowfast NFNet backbone, for contrastive learning of health\nacoustics. A crucial aspect of optimizing Slowfast NFNet for this application\nlies in identifying effective audio augmentations. We conduct an in-depth\nanalysis of various audio augmentation strategies and demonstrate that an\nappropriate augmentation strategy enhances the performance of the Slowfast\nNFNet audio encoder across a diverse set of health acoustic tasks. Our findings\nreveal that when augmentations are combined, they can produce synergistic\neffects that exceed the benefits seen when each is applied individually.\n","authors":["Louis Blankemeier","Sebastien Baur","Wei-Hung Weng","Jake Garrison","Yossi Matias","Shruthi Prabhakara","Diego Ardila","Zaid Nabulsi"],"pdf_url":"https://arxiv.org/pdf/2309.05843v1.pdf","comment":"7 pages, 2 pages appendix, 2 figures, 5 appendix tables"},{"id":"http://arxiv.org/abs/2309.05837v1","updated":"2023-09-11T21:34:16Z","published":"2023-09-11T21:34:16Z","title":"The Safety Filter: A Unified View of Safety-Critical Control in\n Autonomous Systems","summary":" Recent years have seen significant progress in the realm of robot autonomy,\naccompanied by the expanding reach of robotic technologies. However, the\nemergence of new deployment domains brings unprecedented challenges in ensuring\nsafe operation of these systems, which remains as crucial as ever. While\ntraditional model-based safe control methods struggle with generalizability and\nscalability, emerging data-driven approaches tend to lack well-understood\nguarantees, which can result in unpredictable catastrophic failures. Successful\ndeployment of the next generation of autonomous robots will require integrating\nthe strengths of both paradigms. This article provides a review of safety\nfilter approaches, highlighting important connections between existing\ntechniques and proposing a unified technical framework to understand, compare,\nand combine them. 
The new unified view exposes a shared modular structure\nacross a range of seemingly disparate safety filter classes and naturally\nsuggests directions for future progress towards more scalable synthesis, robust\nmonitoring, and efficient intervention.\n","authors":["Kai-Chieh Hsu","Haimin Hu","Jaime Fernández Fisac"],"pdf_url":"https://arxiv.org/pdf/2309.05837v1.pdf","comment":"Accepted for publication in Annual Review of Control, Robotics, and\n Autonomous Systems"},{"id":"http://arxiv.org/abs/2309.05833v1","updated":"2023-09-11T21:24:00Z","published":"2023-09-11T21:24:00Z","title":"PACE: Prompting and Augmentation for Calibrated Confidence Estimation\n with GPT-4 in Cloud Incident Root Cause Analysis","summary":" In recent years, the transition to cloud-based platforms in the IT sector has\nemphasized the significance of cloud incident root cause analysis to ensure\nservice reliability and maintain customer trust. Central to this process is the\nefficient determination of root causes, a task made challenging due to the\ncomplex nature of contemporary cloud infrastructures. Despite the proliferation\nof AI-driven tools for root cause identification, their applicability remains\nlimited by the inconsistent quality of their outputs. This paper introduces a\nmethod for enhancing confidence estimation in root cause analysis tools by\nprompting retrieval-augmented large language models (LLMs). This approach\noperates in two phases. Initially, the model evaluates its confidence based on\nhistorical incident data, considering its assessment of the evidence strength.\nSubsequently, the model reviews the root cause generated by the predictor. An\noptimization step then combines these evaluations to determine the final\nconfidence assignment. Experimental results illustrate that our method enables\nthe model to articulate its confidence effectively, providing a more calibrated\nscore. We address research questions evaluating the ability of our method to\nproduce calibrated confidence scores using LLMs, the impact of domain-specific\nretrieved examples on confidence estimates, and its potential generalizability\nacross various root cause analysis models. Through this, we aim to bridge the\nconfidence estimation gap, aiding on-call engineers in decision-making and\nbolstering the efficiency of cloud incident management.\n","authors":["Dylan Zhang","Xuchao Zhang","Chetan Bansal","Pedro Las-Casas","Rodrigo Fonseca","Saravan Rajmohan"],"pdf_url":"https://arxiv.org/pdf/2309.05833v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.05832v1","updated":"2023-09-11T21:18:15Z","published":"2023-09-11T21:18:15Z","title":"Instance-Agnostic Geometry and Contact Dynamics Learning","summary":" This work presents an instance-agnostic learning framework that fuses vision\nwith dynamics to simultaneously learn shape, pose trajectories and physical\nproperties via the use of geometry as a shared representation. Unlike many\ncontact learning approaches that assume motion capture input and a known shape\nprior for the collision model, our proposed framework learns an object's\ngeometric and dynamic properties from RGBD video, without requiring either\ncategory-level or instance-level shape priors. We integrate a vision system,\nBundleSDF, with a dynamics system, ContactNets and propose a cyclic training\npipeline to use the output from the dynamics module to refine the poses and the\ngeometry from the vision module, using perspective reprojection. 
Experiments\ndemonstrate our framework's ability to learn the geometry and dynamics of rigid\nand convex objects and improve upon the current tracking framework.\n","authors":["Mengti Sun","Bowen Jiang","Bibit Bianchini","Camillo Jose Taylor","Michael Posa"],"pdf_url":"https://arxiv.org/pdf/2309.05832v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.05831v1","updated":"2023-09-11T21:17:10Z","published":"2023-09-11T21:17:10Z","title":"Studying Accuracy of Machine Learning Models Trained on Lab Lifting Data\n in Solving Real-World Problems Using Wearable Sensors for Workplace Safety","summary":" Porting ML models trained on lab data to real-world situations has long been\na challenge. This paper discusses porting a lab-trained lifting identification\nmodel to the real-world. With performance much lower than on training data, we\nexplored causes of the failure and proposed four potential solutions to\nincrease model performance\n","authors":["Joseph Bertrand","Nick Griffey","Ming-Lun Lu","Rashmi Jha"],"pdf_url":"https://arxiv.org/pdf/2309.05831v1.pdf","comment":"7 pages, 7 figures"},{"id":"http://arxiv.org/abs/2309.05828v1","updated":"2023-09-11T21:14:55Z","published":"2023-09-11T21:14:55Z","title":"Exploring Geometric Deep Learning For Precipitation Nowcasting","summary":" Precipitation nowcasting (up to a few hours) remains a challenge due to the\nhighly complex local interactions that need to be captured accurately.\nConvolutional Neural Networks rely on convolutional kernels convolving with\ngrid data and the extracted features are trapped by limited receptive field,\ntypically expressed in excessively smooth output compared to ground truth. Thus\nthey lack the capacity to model complex spatial relationships among the grids.\nGeometric deep learning aims to generalize neural network models to\nnon-Euclidean domains. Such models are more flexible in defining nodes and\nedges and can effectively capture dynamic spatial relationship among\ngeographical grids. Motivated by this, we explore a geometric deep\nlearning-based temporal Graph Convolutional Network (GCN) for precipitation\nnowcasting. The adjacency matrix that simulates the interactions among grid\ncells is learned automatically by minimizing the L1 loss between prediction and\nground truth pixel value during the training procedure. Then, the spatial\nrelationship is refined by GCN layers while the temporal information is\nextracted by 1D convolution with various kernel lengths. The neighboring\ninformation is fed as auxiliary input layers to improve the final result. We\ntest the model on sequences of radar reflectivity maps over the Trento/Italy\narea. The results show that GCNs improves the effectiveness of modeling the\nlocal details of the cloud profile as well as the prediction accuracy by\nachieving decreased error measures.\n","authors":["Shan Zhao","Sudipan Saha","Zhitong Xiong","Niklas Boers","Xiao Xiang Zhu"],"pdf_url":"https://arxiv.org/pdf/2309.05828v1.pdf","comment":"submitted and accepted in IGARSS2023"},{"id":"http://arxiv.org/abs/2309.05826v1","updated":"2023-09-11T21:11:48Z","published":"2023-09-11T21:11:48Z","title":"KD-FixMatch: Knowledge Distillation Siamese Neural Networks","summary":" Semi-supervised learning (SSL) has become a crucial approach in deep learning\nas a way to address the challenge of limited labeled data. The success of deep\nneural networks heavily relies on the availability of large-scale high-quality\nlabeled data. 
However, the process of data labeling is time-consuming and\nunscalable, leading to shortages in labeled data. SSL aims to tackle this\nproblem by leveraging additional unlabeled data in the training process. One of\nthe popular SSL algorithms, FixMatch, trains identical weight-sharing teacher\nand student networks simultaneously using a siamese neural network (SNN).\nHowever, it is prone to performance degradation when the pseudo labels are\nheavily noisy in the early training stage. We present KD-FixMatch, a novel SSL\nalgorithm that addresses the limitations of FixMatch by incorporating knowledge\ndistillation. The algorithm utilizes a combination of sequential and\nsimultaneous training of SNNs to enhance performance and reduce performance\ndegradation. Firstly, an outer SNN is trained using labeled and unlabeled data.\nAfter that, the network of the well-trained outer SNN generates pseudo labels\nfor the unlabeled data, from which a subset of unlabeled data with trusted\npseudo labels is then carefully created through high-confidence sampling and\ndeep embedding clustering. Finally, an inner SNN is trained with the labeled\ndata, the unlabeled data, and the subset of unlabeled data with trusted pseudo\nlabels. Experiments on four public data sets demonstrate that KD-FixMatch\noutperforms FixMatch in all cases. Our results indicate that KD-FixMatch has a\nbetter training starting point that leads to improved model performance\ncompared to FixMatch.\n","authors":["Chien-Chih Wang","Shaoyuan Xu","Jinmiao Fu","Yang Liu","Bryan Wang"],"pdf_url":"https://arxiv.org/pdf/2309.05826v1.pdf","comment":"5 pages, 1 figure, 5 tables. To be published in ICIP 2023"},{"id":"http://arxiv.org/abs/2307.13704v3","updated":"2023-09-11T21:02:04Z","published":"2023-07-21T18:06:43Z","title":"eXplainable Artificial Intelligence (XAI) in aging clock models","summary":" eXplainable Artificial Intelligence (XAI) is a rapidly progressing field of\nmachine learning, aiming to unravel the predictions of complex models. XAI is\nespecially required in sensitive applications, e.g. in health care, when\ndiagnosis, recommendations and treatment choices might rely on the decisions\nmade by artificial intelligence systems. AI approaches have become widely used\nin aging research as well, in particular, in developing biological clock models\nand identifying biomarkers of aging and age-related diseases. However, the\npotential of XAI here awaits to be fully appreciated. We discuss the\napplication of XAI for developing the \"aging clocks\" and present a\ncomprehensive analysis of the literature categorized by the focus on particular\nphysiological systems.\n","authors":["Alena Kalyakulina","Igor Yusipov","Alexey Moskalev","Claudio Franceschi","Mikhail Ivanchenko"],"pdf_url":"https://arxiv.org/pdf/2307.13704v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.05823v1","updated":"2023-09-11T21:01:11Z","published":"2023-09-11T21:01:11Z","title":"Ensemble-based modeling abstractions for modern self-optimizing systems","summary":" In this paper, we extend our ensemble-based component model DEECo with the\ncapability to use machine-learning and optimization heuristics in establishing\nand reconfiguration of autonomic component ensembles. We show how to capture\nthese concepts on the model level and give an example of how such a model can\nbe beneficially used for modeling access-control related problem in the\nIndustry 4.0 settings. 
We argue that incorporating machine-learning and\noptimization heuristics is a key feature for modern smart systems which are to\nlearn over the time and optimize their behavior at runtime to deal with\nuncertainty in their environment.\n","authors":["Michal Töpfer","Milad Abdullah","Tomáš Bureš","Petr Hnětynka","Martin Kruliš"],"pdf_url":"https://arxiv.org/pdf/2309.05823v1.pdf","comment":"This is the authors' version of the paper - M. T\\\"opfer, M. Abdullah,\n T. Bure\\v{s}, P. Hn\\v{e}tynka, M. Kruli\\v{s}: Ensemble-Based Modeling\n Abstractions for Modern Self-optimizing Systems, in Proceedings of ISOLA\n 2022, Rhodes, Greece, pp. 318-334, 2022. The final authenticated publication\n is available online at https://doi.org/10.1007/978-3-031-19759-8_20"},{"id":"http://arxiv.org/abs/2006.05630v7","updated":"2023-09-11T20:54:49Z","published":"2020-06-10T03:11:40Z","title":"Distributionally Robust Batch Contextual Bandits","summary":" Policy learning using historical observational data is an important problem\nthat has found widespread applications. Examples include selecting offers,\nprices, advertisements to send to customers, as well as selecting which\nmedication to prescribe to a patient. However, existing literature rests on the\ncrucial assumption that the future environment where the learned policy will be\ndeployed is the same as the past environment that has generated the data -- an\nassumption that is often false or too coarse an approximation. In this paper,\nwe lift this assumption and aim to learn a distributionally robust policy with\nincomplete observational data. We first present a policy evaluation procedure\nthat allows us to assess how well the policy does under the worst-case\nenvironment shift. We then establish a central limit theorem type guarantee for\nthis proposed policy evaluation scheme. Leveraging this evaluation scheme, we\nfurther propose a novel learning algorithm that is able to learn a policy that\nis robust to adversarial perturbations and unknown covariate shifts with a\nperformance guarantee based on the theory of uniform convergence. Finally, we\nempirically test the effectiveness of our proposed algorithm in synthetic\ndatasets and demonstrate that it provides the robustness that is missing using\nstandard policy learning algorithms. We conclude the paper by providing a\ncomprehensive application of our methods in the context of a real-world voting\ndataset.\n","authors":["Nian Si","Fan Zhang","Zhengyuan Zhou","Jose Blanchet"],"pdf_url":"https://arxiv.org/pdf/2006.05630v7.pdf","comment":"The short version has been accepted in ICML 2020"},{"id":"http://arxiv.org/abs/2309.05812v1","updated":"2023-09-11T20:29:38Z","published":"2023-09-11T20:29:38Z","title":"Interpretable learning of effective dynamics for multiscale systems","summary":" The modeling and simulation of high-dimensional multiscale systems is a\ncritical challenge across all areas of science and engineering. It is broadly\nbelieved that even with today's computer advances resolving all spatiotemporal\nscales described by the governing equations remains a remote target. This\nrealization has prompted intense efforts to develop model order reduction\ntechniques. In recent years, techniques based on deep recurrent neural networks\nhave produced promising results for the modeling and simulation of complex\nspatiotemporal systems and offer large flexibility in model development as they\ncan incorporate experimental and computational data. 
However, neural networks\nlack interpretability, which limits their utility and generalizability across\ncomplex systems. Here we propose a novel framework of Interpretable Learning\nEffective Dynamics (iLED) that offers comparable accuracy to state-of-the-art\nrecurrent neural network-based approaches while providing the added benefit of\ninterpretability. The iLED framework is motivated by Mori-Zwanzig and Koopman\noperator theory, which justifies the choice of the specific architecture. We\ndemonstrate the effectiveness of the proposed framework in simulations of three\nbenchmark multiscale systems. Our results show that the iLED framework can\ngenerate accurate predictions and obtain interpretable dynamics, making it a\npromising approach for solving high-dimensional multiscale systems.\n","authors":["Emmanuel Menier","Sebastian Kaltenbach","Mouadh Yagoubi","Marc Schoenauer","Petros Koumoutsakos"],"pdf_url":"https://arxiv.org/pdf/2309.05812v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.05811v1","updated":"2023-09-11T20:28:43Z","published":"2023-09-11T20:28:43Z","title":"Predicting the Radiation Field of Molecular Clouds using Denoising\n Diffusion Probabilistic Models","summary":" Accurately quantifying the impact of radiation feedback in star formation is\nchallenging. To address this complex problem, we employ deep learning\ntechniques, denoising diffusion probabilistic models (DDPMs), to predict the\ninterstellar radiation field (ISRF) strength based on three-band dust emission\nat 4.5 \\um, 24 \\um, and 250 \\um. We adopt magnetohydrodynamic simulations from\nthe STARFORGE (STAR FORmation in Gaseous Environments) project that model star\nformation and giant molecular cloud (GMC) evolution. We generate synthetic dust\nemission maps matching observed spectral energy distributions in the Monoceros\nR2 (MonR2) GMC. We train DDPMs to estimate the ISRF using synthetic three-band\ndust emission. The dispersion between the predictions and true values is within\na factor of 0.1 for the test set. We extended our assessment of the diffusion\nmodel to include new simulations with varying physical parameters. While there\nis a consistent offset observed in these out-of-distribution simulations, the\nmodel effectively constrains the relative intensity to within a factor of 2.\nMeanwhile, our analysis reveals weak correlation between the ISRF solely\nderived from dust temperature and the actual ISRF. We apply our trained model\nto predict the ISRF in MonR2, revealing a correspondence between intense ISRF,\nbright sources, and high dust emission, confirming the model's ability to\ncapture ISRF variations. Our model robustly predicts radiation feedback\ndistribution, even in complex, poorly constrained ISRF environments like those\ninfluenced by nearby star clusters. However, precise ISRF predictions require\nan accurate training dataset mirroring the target molecular cloud's unique\nphysical conditions.\n","authors":["Duo Xu","Stella Offner","Robert Gutermuth","Michael Grudic","David Guszejnov","Philip Hopkins"],"pdf_url":"https://arxiv.org/pdf/2309.05811v1.pdf","comment":"Revised submission to ApJ following referee's comments"},{"id":"http://arxiv.org/abs/2309.05810v1","updated":"2023-09-11T20:28:18Z","published":"2023-09-11T20:28:18Z","title":"SHIFT3D: Synthesizing Hard Inputs For Tricking 3D Detectors","summary":" We present SHIFT3D, a differentiable pipeline for generating 3D shapes that\nare structurally plausible yet challenging to 3D object detectors. 
In\nsafety-critical applications like autonomous driving, discovering such novel\nchallenging objects can offer insight into unknown vulnerabilities of 3D\ndetectors. By representing objects with a signed distanced function (SDF), we\nshow that gradient error signals allow us to smoothly deform the shape or pose\nof a 3D object in order to confuse a downstream 3D detector. Importantly, the\nobjects generated by SHIFT3D physically differ from the baseline object yet\nretain a semantically recognizable shape. Our approach provides interpretable\nfailure modes for modern 3D object detectors, and can aid in preemptive\ndiscovery of potential safety risks within 3D perception systems before these\nrisks become critical failures.\n","authors":["Hongge Chen","Zhao Chen","Gregory P. Meyer","Dennis Park","Carl Vondrick","Ashish Shrivastava","Yuning Chai"],"pdf_url":"https://arxiv.org/pdf/2309.05810v1.pdf","comment":"Accepted by ICCV 2023"},{"id":"http://arxiv.org/abs/2309.05809v1","updated":"2023-09-11T20:26:40Z","published":"2023-09-11T20:26:40Z","title":"Divergences in Color Perception between Deep Neural Networks and Humans","summary":" Deep neural networks (DNNs) are increasingly proposed as models of human\nvision, bolstered by their impressive performance on image classification and\nobject recognition tasks. Yet, the extent to which DNNs capture fundamental\naspects of human vision such as color perception remains unclear. Here, we\ndevelop novel experiments for evaluating the perceptual coherence of color\nembeddings in DNNs, and we assess how well these algorithms predict human color\nsimilarity judgments collected via an online survey. We find that\nstate-of-the-art DNN architectures $-$ including convolutional neural networks\nand vision transformers $-$ provide color similarity judgments that strikingly\ndiverge from human color judgments of (i) images with controlled color\nproperties, (ii) images generated from online searches, and (iii) real-world\nimages from the canonical CIFAR-10 dataset. We compare DNN performance against\nan interpretable and cognitively plausible model of color perception based on\nwavelet decomposition, inspired by foundational theories in computational\nneuroscience. While one deep learning model $-$ a convolutional DNN trained on\na style transfer task $-$ captures some aspects of human color perception, our\nwavelet algorithm provides more coherent color embeddings that better predict\nhuman color judgments compared to all DNNs we examine. These results hold when\naltering the high-level visual task used to train similar DNN architectures\n(e.g., image classification versus image segmentation), as well as when\nexamining the color embeddings of different layers in a given DNN architecture.\nThese findings break new ground in the effort to analyze the perceptual\nrepresentations of machine learning algorithms and to improve their ability to\nserve as cognitively plausible models of human vision. Implications for machine\nlearning, human perception, and embodied cognition are discussed.\n","authors":["Ethan O. 
Nadler","Elise Darragh-Ford","Bhargav Srinivasa Desikan","Christian Conaway","Mark Chu","Tasker Hull","Douglas Guilbeault"],"pdf_url":"https://arxiv.org/pdf/2309.05809v1.pdf","comment":"22 pages, 8 figures + SI Appendix; to appear in Cognition"},{"id":"http://arxiv.org/abs/2309.03905v2","updated":"2023-09-11T20:25:16Z","published":"2023-09-07T17:59:45Z","title":"ImageBind-LLM: Multi-modality Instruction Tuning","summary":" We present ImageBind-LLM, a multi-modality instruction tuning method of large\nlanguage models (LLMs) via ImageBind. Existing works mainly focus on language\nand image instruction tuning, different from which, our ImageBind-LLM can\nrespond to multi-modality conditions, including audio, 3D point clouds, video,\nand their embedding-space arithmetic by only image-text alignment training.\nDuring training, we adopt a learnable bind network to align the embedding space\nbetween LLaMA and ImageBind's image encoder. Then, the image features\ntransformed by the bind network are added to word tokens of all layers in\nLLaMA, which progressively injects visual instructions via an attention-free\nand zero-initialized gating mechanism. Aided by the joint embedding of\nImageBind, the simple image-text training enables our model to exhibit superior\nmulti-modality instruction-following capabilities. During inference, the\nmulti-modality inputs are fed into the corresponding ImageBind encoders, and\nprocessed by a proposed visual cache model for further cross-modal embedding\nenhancement. The training-free cache model retrieves from three million image\nfeatures extracted by ImageBind, which effectively mitigates the\ntraining-inference modality discrepancy. Notably, with our approach,\nImageBind-LLM can respond to instructions of diverse modalities and demonstrate\nsignificant language generation quality. Code is released at\nhttps://github.com/OpenGVLab/LLaMA-Adapter.\n","authors":["Jiaming Han","Renrui Zhang","Wenqi Shao","Peng Gao","Peng Xu","Han Xiao","Kaipeng Zhang","Chris Liu","Song Wen","Ziyu Guo","Xudong Lu","Shuai Ren","Yafei Wen","Xiaoxin Chen","Xiangyu Yue","Hongsheng Li","Yu Qiao"],"pdf_url":"https://arxiv.org/pdf/2309.03905v2.pdf","comment":"Code is available at https://github.com/OpenGVLab/LLaMA-Adapter"},{"id":"http://arxiv.org/abs/2309.05805v1","updated":"2023-09-11T20:17:11Z","published":"2023-09-11T20:17:11Z","title":"Online ML Self-adaptation in Face of Traps","summary":" Online machine learning (ML) is often used in self-adaptive systems to\nstrengthen the adaptation mechanism and improve the system utility. Despite\nsuch benefits, applying online ML for self-adaptation can be challenging, and\nnot many papers report its limitations. Recently, we experimented with applying\nonline ML for self-adaptation of a smart farming scenario and we had faced\nseveral unexpected difficulties -- traps -- that, to our knowledge, are not\ndiscussed enough in the community. In this paper, we report our experience with\nthese traps. Specifically, we discuss several traps that relate to the\nspecification and online training of the ML-based estimators, their impact on\nself-adaptation, and the approach used to evaluate the estimators. 
Our overview\nof these traps provides a list of lessons learned, which can serve as guidance\nfor other researchers and practitioners when applying online ML for\nself-adaptation.\n","authors":["Michal Töpfer","František Plášil","Tomáš Bureš","Petr Hnětynka","Martin Kruliš","Danny Weyns"],"pdf_url":"https://arxiv.org/pdf/2309.05805v1.pdf","comment":"This is the authors' version of the paper M. T\\\"opfer, F.\n Pl\\'a\\v{s}il, T. Bure\\v{s}, P. Hn\\v{e}tynka, M. Kruli\\v{s}, D. Weyns: Online\n ML Self-adaptation in Face of Traps, accepted for publication in Proceedings\n of ACSOS 2023, Toronto, Canada"},{"id":"http://arxiv.org/abs/2010.12995v2","updated":"2023-09-11T20:13:57Z","published":"2020-10-24T21:41:21Z","title":"Out-of-distribution detection for regression tasks: parameter versus\n predictor entropy","summary":" It is crucial to detect when an instance lies downright too far from the\ntraining samples for the machine learning model to be trusted, a challenge\nknown as out-of-distribution (OOD) detection. For neural networks, one approach\nto this task consists of learning a diversity of predictors that all can\nexplain the training data. This information can be used to estimate the\nepistemic uncertainty at a given newly observed instance in terms of a measure\nof the disagreement of the predictions. Evaluation and certification of the\nability of a method to detect OOD require specifying instances which are likely\nto occur in deployment yet on which no prediction is available. Focusing on\nregression tasks, we choose a simple yet insightful model for this OOD\ndistribution and conduct an empirical evaluation of the ability of various\nmethods to discriminate OOD samples from the data. Moreover, we exhibit\nevidence that a diversity of parameters may fail to translate to a diversity of\npredictors. Based on the choice of an OOD distribution, we propose a new way of\nestimating the entropy of a distribution on predictors based on nearest\nneighbors in function space. This leads to a variational objective which,\ncombined with the family of distributions given by a generative neural network,\nsystematically produces a diversity of predictors that provides a robust way to\ndetect OOD samples.\n","authors":["Yann Pequignot","Mathieu Alain","Patrick Dallaire","Alireza Yeganehparast","Pascal Germain","Josée Desharnais","François Laviolette"],"pdf_url":"https://arxiv.org/pdf/2010.12995v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.05803v1","updated":"2023-09-11T20:13:47Z","published":"2023-09-11T20:13:47Z","title":"Revisiting Energy Based Models as Policies: Ranking Noise Contrastive\n Estimation and Interpolating Energy Models","summary":" A crucial design decision for any robot learning pipeline is the choice of\npolicy representation: what type of model should be used to generate the next\nset of robot actions? Owing to the inherent multi-modal nature of many robotic\ntasks, combined with the recent successes in generative modeling, researchers\nhave turned to state-of-the-art probabilistic models such as diffusion models\nfor policy representation. In this work, we revisit the choice of energy-based\nmodels (EBM) as a policy class. We show that the prevailing folklore -- that\nenergy models in high dimensional continuous spaces are impractical to train --\nis false. 
We develop a practical training objective and algorithm for energy\nmodels which combines several key ingredients: (i) ranking noise contrastive\nestimation (R-NCE), (ii) learnable negative samplers, and (iii) non-adversarial\njoint training. We prove that our proposed objective function is asymptotically\nconsistent and quantify its limiting variance. On the other hand, we show that\nthe Implicit Behavior Cloning (IBC) objective is actually biased even at the\npopulation level, providing a mathematical explanation for the poor performance\nof IBC trained energy policies in several independent follow-up works. We\nfurther extend our algorithm to learn a continuous stochastic process that\nbridges noise and data, modeling this process with a family of EBMs indexed by\nscale variable. In doing so, we demonstrate that the core idea behind recent\nprogress in generative modeling is actually compatible with EBMs. Altogether,\nour proposed training algorithms enable us to train energy-based models as\npolicies which compete with -- and even outperform -- diffusion models and\nother state-of-the-art approaches in several challenging multi-modal\nbenchmarks: obstacle avoidance path planning and contact-rich block pushing.\n","authors":["Sumeet Singh","Stephen Tu","Vikas Sindhwani"],"pdf_url":"https://arxiv.org/pdf/2309.05803v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.05798v1","updated":"2023-09-11T20:06:00Z","published":"2023-09-11T20:06:00Z","title":"Enhancing Hyperedge Prediction with Context-Aware Self-Supervised\n Learning","summary":" Hypergraphs can naturally model group-wise relations (e.g., a group of users\nwho co-purchase an item) as hyperedges. Hyperedge prediction is to predict\nfuture or unobserved hyperedges, which is a fundamental task in many real-world\napplications (e.g., group recommendation). Despite the recent breakthrough of\nhyperedge prediction methods, the following challenges have been rarely\nstudied: (C1) How to aggregate the nodes in each hyperedge candidate for\naccurate hyperedge prediction? and (C2) How to mitigate the inherent data\nsparsity problem in hyperedge prediction? To tackle both challenges together,\nin this paper, we propose a novel hyperedge prediction framework (CASH) that\nemploys (1) context-aware node aggregation to precisely capture complex\nrelations among nodes in each hyperedge for (C1) and (2) self-supervised\ncontrastive learning in the context of hyperedge prediction to enhance\nhypergraph representations for (C2). Furthermore, as for (C2), we propose a\nhyperedge-aware augmentation method to fully exploit the latent semantics\nbehind the original hypergraph and consider both node-level and group-level\ncontrasts (i.e., dual contrasts) for better node and hyperedge representations.\nExtensive experiments on six real-world hypergraphs reveal that CASH\nconsistently outperforms all competing methods in terms of the accuracy in\nhyperedge prediction and each of the proposed strategies is effective in\nimproving the model accuracy of CASH. 
For the detailed information of CASH, we\nprovide the code and datasets at: https://github.com/yy-ko/cash.\n","authors":["Yunyong Ko","Hanghang Tong","Sang-Wook Kim"],"pdf_url":"https://arxiv.org/pdf/2309.05798v1.pdf","comment":"12 pages, 11 figures"},{"id":"http://arxiv.org/abs/2306.15907v4","updated":"2023-09-11T20:04:34Z","published":"2023-06-28T04:15:01Z","title":"Deep Learning Models for Flood Predictions in South Florida","summary":" Simulating and predicting water levels in river systems is essential for\nflood warnings, hydraulic operations, and flood mitigations. In the engineering\nfield, tools such as HEC-RAS, MIKE, and SWMM are used to build detailed\nphysics-based hydrological and hydraulic computational models to simulate the\nentire watershed, thereby predicting the water stage at any point in the\nsystem. However, these physics-based models are computationally intensive,\nespecially for large watersheds and for longer simulations. To overcome this\nproblem, we train several deep learning (DL) models for use as surrogate models\nto rapidly predict the water stage. The downstream stage of the Miami River in\nSouth Florida is chosen as a case study for this paper. The dataset is from\nJanuary 1, 2010, to December 31, 2020, downloaded from the DBHYDRO database of\nthe South Florida Water Management District (SFWMD). Extensive experiments show\nthat the performance of the DL models is comparable to that of the\nphysics-based models, even during extreme precipitation conditions (i.e.,\ntropical storms). Furthermore, we study the decline in prediction accuracy of\nthe DL models with an increase in prediction lengths. In order to predict the\nwater stage in the future, our DL models use measured variables of the river\nsystem from the recent past as well as covariates that can be reliably\npredicted in the near future. In summary, the deep learning models achieve\ncomparable or better error rates with at least 1000x speedup in comparison to\nthe physics-based models.\n","authors":["Jimeng Shi","Zeda Yin","Rukmangadh Myana","Khandker Ishtiaq","Anupama John","Jayantha Obeysekera","Arturo Leon","Giri Narasimhan"],"pdf_url":"https://arxiv.org/pdf/2306.15907v4.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.05795v1","updated":"2023-09-11T20:03:25Z","published":"2023-09-11T20:03:25Z","title":"On the Fine-Grained Hardness of Inverting Generative Models","summary":" The objective of generative model inversion is to identify a size-$n$ latent\nvector that produces a generative model output that closely matches a given\ntarget. This operation is a core computational primitive in numerous modern\napplications involving computer vision and NLP. However, the problem is known\nto be computationally challenging and NP-hard in the worst case. This paper\naims to provide a fine-grained view of the landscape of computational hardness\nfor this problem. We establish several new hardness lower bounds for both exact\nand approximate model inversion. In exact inversion, the goal is to determine\nwhether a target is contained within the range of a given generative model.\nUnder the strong exponential time hypothesis (SETH), we demonstrate that the\ncomputational complexity of exact inversion is lower bounded by $\\Omega(2^n)$\nvia a reduction from $k$-SAT; this is a strengthening of known results. For the\nmore practically relevant problem of approximate inversion, the goal is to\ndetermine whether a point in the model range is close to a given target with\nrespect to the $\\ell_p$-norm. 
When $p$ is a positive odd integer, under SETH,\nwe provide an $\\Omega(2^n)$ complexity lower bound via a reduction from the\nclosest vectors problem (CVP). Finally, when $p$ is even, under the exponential\ntime hypothesis (ETH), we provide a lower bound of $2^{\\Omega (n)}$ via a\nreduction from Half-Clique and Vertex-Cover.\n","authors":["Feyza Duman Keles","Chinmay Hegde"],"pdf_url":"https://arxiv.org/pdf/2309.05795v1.pdf","comment":"19 pages"},{"id":"http://arxiv.org/abs/2309.05787v1","updated":"2023-09-11T19:35:12Z","published":"2023-09-11T19:35:12Z","title":"Adaptive User-centered Neuro-symbolic Learning for Multimodal\n Interaction with Autonomous Systems","summary":" Recent advances in machine learning, particularly deep learning, have enabled\nautonomous systems to perceive and comprehend objects and their environments in\na perceptual subsymbolic manner. These systems can now perform object\ndetection, sensor data fusion, and language understanding tasks. However, there\nis a growing need to enhance these systems to understand objects and their\nenvironments more conceptually and symbolically. It is essential to consider\nboth the explicit teaching provided by humans (e.g., describing a situation or\nexplaining how to act) and the implicit teaching obtained by observing human\nbehavior (e.g., through the system's sensors) to achieve this level of powerful\nartificial intelligence. Thus, the system must be designed with multimodal\ninput and output capabilities to support implicit and explicit interaction\nmodels. In this position paper, we argue for considering both types of inputs,\nas well as human-in-the-loop and incremental learning techniques, for advancing\nthe field of artificial intelligence and enabling autonomous systems to learn\nlike humans. We propose several hypotheses and design guidelines and highlight\na use case from related work to achieve this goal.\n","authors":["Amr Gomaa","Michael Feld"],"pdf_url":"https://arxiv.org/pdf/2309.05787v1.pdf","comment":"AI&HCI Workshop accepted paper at ICML2023 and accepted at ICMI2023\n Blue Sky Papers. arXiv admin note: text overlap with arXiv:2211.03539"},{"id":"http://arxiv.org/abs/2309.04001v2","updated":"2023-09-11T19:34:43Z","published":"2023-09-07T20:07:57Z","title":"Multimodal Transformer for Material Segmentation","summary":" Leveraging information across diverse modalities is known to enhance\nperformance on multimodal segmentation tasks. However, effectively fusing\ninformation from different modalities remains challenging due to the unique\ncharacteristics of each modality. In this paper, we propose a novel fusion\nstrategy that can effectively fuse information from different combinations of\nfour different modalities: RGB, Angle of Linear Polarization (AoLP), Degree of\nLinear Polarization (DoLP) and Near-Infrared (NIR). We also propose a new model\nnamed Multi-Modal Segmentation Transformer (MMSFormer) that incorporates the\nproposed fusion strategy to perform multimodal material segmentation. MMSFormer\nachieves 52.05% mIoU outperforming the current state-of-the-art on Multimodal\nMaterial Segmentation (MCubeS) dataset. For instance, our method provides\nsignificant improvement in detecting gravel (+10.4%) and human (+9.1%) classes.\nAblation studies show that different modules in the fusion block are crucial\nfor overall model performance. Furthermore, our ablation studies also highlight\nthe capacity of different input modalities to improve performance in the\nidentification of different types of materials. 
The code and pretrained models\nwill be made available at https://github.com/csiplab/MMSFormer.\n","authors":["Md Kaykobad Reza","Ashley Prater-Bennette","M. Salman Asif"],"pdf_url":"https://arxiv.org/pdf/2309.04001v2.pdf","comment":"9 pages, 3 figures"},{"id":"http://arxiv.org/abs/2109.03890v4","updated":"2023-09-11T19:33:45Z","published":"2021-09-08T19:33:52Z","title":"Axiomatic Aggregations of Abductive Explanations","summary":" The recent criticisms of the robustness of post hoc model approximation\nexplanation methods (like LIME and SHAP) have led to the rise of model-precise\nabductive explanations. For each data point, abductive explanations provide a\nminimal subset of features that are sufficient to generate the outcome. While\ntheoretically sound and rigorous, abductive explanations suffer from a major\nissue -- there can be several valid abductive explanations for the same data\npoint. In such cases, providing a single abductive explanation can be\ninsufficient; on the other hand, providing all valid abductive explanations can\nbe incomprehensible due to their size. In this work, we solve this issue by\naggregating the many possible abductive explanations into feature importance\nscores. We propose three aggregation methods: two based on power indices from\ncooperative game theory and a third based on a well-known measure of causal\nstrength. We characterize these three methods axiomatically, showing that each\nof them uniquely satisfies a set of desirable properties. We also evaluate them\non multiple datasets and show that these explanations are robust to the attacks\nthat fool SHAP and LIME.\n","authors":["Gagan Biradar","Yacine Izza","Elita Lobo","Vignesh Viswanathan","Yair Zick"],"pdf_url":"https://arxiv.org/pdf/2109.03890v4.pdf","comment":null}],"Multimedia":[{"id":"http://arxiv.org/abs/2309.05658v1","updated":"2023-09-11T17:53:14Z","published":"2023-09-11T17:53:14Z","title":"From Capture to Display: A Survey on Volumetric Video","summary":" Volumetric video, which offers immersive viewing experiences, is gaining\nincreasing prominence. With its six degrees of freedom, it provides viewers\nwith greater immersion and interactivity compared to traditional videos.\nDespite their potential, volumetric video services poses significant\nchallenges. This survey conducts a comprehensive review of the existing\nliterature on volumetric video. We firstly provide a general framework of\nvolumetric video services, followed by a discussion on prerequisites for\nvolumetric video, encompassing representations, open datasets, and quality\nassessment metrics. Then we delve into the current methodologies for each stage\nof the volumetric video service pipeline, detailing capturing, compression,\ntransmission, rendering, and display techniques. Lastly, we explore various\napplications enabled by this pioneering technology and we present an array of\nresearch challenges and opportunities in the domain of volumetric video\nservices. 
This survey aspires to provide a holistic understanding of this\nburgeoning field and shed light on potential future research trajectories,\naiming to bring the vision of volumetric video to fruition.\n","authors":["Yili Jin","Kaiyuan Hu","Junhua Liu","Fangxin Wang","Xue Liu"],"pdf_url":"https://arxiv.org/pdf/2309.05658v1.pdf","comment":"Submitted"},{"id":"http://arxiv.org/abs/2309.05590v1","updated":"2023-09-11T16:17:50Z","published":"2023-09-11T16:17:50Z","title":"Temporal Action Localization with Enhanced Instant Discriminability","summary":" Temporal action detection (TAD) aims to detect all action boundaries and\ntheir corresponding categories in an untrimmed video. The unclear boundaries of\nactions in videos often result in imprecise predictions of action boundaries by\nexisting methods. To resolve this issue, we propose a one-stage framework named\nTriDet. First, we propose a Trident-head to model the action boundary via an\nestimated relative probability distribution around the boundary. Then, we\nanalyze the rank-loss problem (i.e. instant discriminability deterioration) in\ntransformer-based methods and propose an efficient scalable-granularity\nperception (SGP) layer to mitigate this issue. To further push the limit of\ninstant discriminability in the video backbone, we leverage the strong\nrepresentation capability of pretrained large models and investigate their\nperformance on TAD. Last, considering the adequate spatial-temporal context for\nclassification, we design a decoupled feature pyramid network with separate\nfeature pyramids to incorporate rich spatial context from the large model for\nlocalization. Experimental results demonstrate the robustness of TriDet and its\nstate-of-the-art performance on multiple TAD datasets, including hierarchical\n(multilabel) TAD datasets.\n","authors":["Dingfeng Shi","Qiong Cao","Yujie Zhong","Shan An","Jian Cheng","Haogang Zhu","Dacheng Tao"],"pdf_url":"https://arxiv.org/pdf/2309.05590v1.pdf","comment":"An extended version of the CVPR paper arXiv:2303.07347, submitted to\n IJCV"},{"id":"http://arxiv.org/abs/2309.05451v1","updated":"2023-09-11T13:44:46Z","published":"2023-09-11T13:44:46Z","title":"Dual-view Curricular Optimal Transport for Cross-lingual Cross-modal\n Retrieval","summary":" Current research on cross-modal retrieval is mostly English-oriented, as the\navailability of a large number of English-oriented human-labeled\nvision-language corpora. In order to break the limit of non-English labeled\ndata, cross-lingual cross-modal retrieval (CCR) has attracted increasing\nattention. Most CCR methods construct pseudo-parallel vision-language corpora\nvia Machine Translation (MT) to achieve cross-lingual transfer. However, the\ntranslated sentences from MT are generally imperfect in describing the\ncorresponding visual contents. Improperly assuming the pseudo-parallel data are\ncorrectly correlated will make the networks overfit to the noisy\ncorrespondence. Therefore, we propose Dual-view Curricular Optimal Transport\n(DCOT) to learn with noisy correspondence in CCR. In particular, we quantify\nthe confidence of the sample pair correlation with optimal transport theory\nfrom both the cross-lingual and cross-modal views, and design dual-view\ncurriculum learning to dynamically model the transportation costs according to\nthe learning stage of the two views. 
Extensive experiments are conducted on two\nmultilingual image-text datasets and one video-text dataset, and the results\ndemonstrate the effectiveness and robustness of the proposed method. Besides,\nour proposed method also shows a good expansibility to cross-lingual image-text\nbaselines and a decent generalization on out-of-domain data.\n","authors":["Yabing Wang","Shuhui Wang","Hao Luo","Jianfeng Dong","Fan Wang","Meng Han","Xun Wang","Meng Wang"],"pdf_url":"https://arxiv.org/pdf/2309.05451v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.05382v1","updated":"2023-09-11T11:10:05Z","published":"2023-09-11T11:10:05Z","title":"CANF-VC++: Enhancing Conditional Augmented Normalizing Flows for Video\n Compression with Advanced Techniques","summary":" Video has become the predominant medium for information dissemination,\ndriving the need for efficient video codecs. Recent advancements in learned\nvideo compression have shown promising results, surpassing traditional codecs\nin terms of coding efficiency. However, challenges remain in integrating\nfragmented techniques and incorporating new tools into existing codecs. In this\npaper, we comprehensively review the state-of-the-art CANF-VC codec and propose\nCANF-VC++, an enhanced version that addresses these challenges. We\nsystematically explore architecture design, reference frame type, training\nprocedure, and entropy coding efficiency, leading to substantial coding\nimprovements. CANF-VC++ achieves significant Bj{\\o}ntegaard-Delta rate savings\non conventional datasets UVG, HEVC Class B and MCL-JCV, outperforming the\nbaseline CANF-VC and even the H.266 reference software VTM. Our work\ndemonstrates the potential of integrating advancements in video compression and\nserves as inspiration for future research in the field.\n","authors":["Peng-Yu Chen","Wen-Hsiao Peng"],"pdf_url":"https://arxiv.org/pdf/2309.05382v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.05281v1","updated":"2023-09-11T07:36:16Z","published":"2023-09-11T07:36:16Z","title":"Class-Incremental Grouping Network for Continual Audio-Visual Learning","summary":" Continual learning is a challenging problem in which models need to be\ntrained on non-stationary data across sequential tasks for class-incremental\nlearning. While previous methods have focused on using either regularization or\nrehearsal-based frameworks to alleviate catastrophic forgetting in image\nclassification, they are limited to a single modality and cannot learn compact\nclass-aware cross-modal representations for continual audio-visual learning. To\naddress this gap, we propose a novel class-incremental grouping network (CIGN)\nthat can learn category-wise semantic features to achieve continual\naudio-visual learning. Our CIGN leverages learnable audio-visual class tokens\nand audio-visual grouping to continually aggregate class-aware features.\nAdditionally, it utilizes class tokens distillation and continual grouping to\nprevent forgetting parameters learned from previous tasks, thereby improving\nthe model's ability to capture discriminative audio-visual categories. We\nconduct extensive experiments on VGGSound-Instruments, VGGSound-100, and\nVGG-Sound Sources benchmarks. Our experimental results demonstrate that the\nCIGN achieves state-of-the-art audio-visual class-incremental learning\nperformance. Code is available at https://github.com/stoneMo/CIGN.\n","authors":["Shentong Mo","Weiguo Pian","Yapeng Tian"],"pdf_url":"https://arxiv.org/pdf/2309.05281v1.pdf","comment":"ICCV 2023. 
arXiv admin note: text overlap with arXiv:2303.17056"},{"id":"http://arxiv.org/abs/2205.07611v3","updated":"2023-09-11T04:23:25Z","published":"2022-05-16T12:14:03Z","title":"Noise-Tolerant Learning for Audio-Visual Action Recognition","summary":" Recently, video recognition is emerging with the help of multi-modal\nlearning, which focuses on integrating distinct modalities to improve the\nperformance or robustness of the model. Although various multi-modal learning\nmethods have been proposed and offer remarkable recognition results, almost all\nof these methods rely on high-quality manual annotations and assume that\nmodalities among multi-modal data provide semantically relevant information.\nUnfortunately, the widely used video datasets are usually coarse-annotated or\ncollected from the Internet. Thus, it inevitably contains a portion of noisy\nlabels and noisy correspondence. To address this challenge, we use the\naudio-visual action recognition task as a proxy and propose a noise-tolerant\nlearning framework to find anti-interference model parameters against both\nnoisy labels and noisy correspondence. Specifically, our method consists of two\nphases that aim to rectify noise by the inherent correlation between\nmodalities. First, a noise-tolerant contrastive training phase is performed to\nmake the model immune to the possible noisy-labeled data. To alleviate the\ninfluence of noisy correspondence, we propose a cross-modal noise estimation\ncomponent to adjust the consistency between different modalities. As the noisy\ncorrespondence existed at the instance level, we further propose a\ncategory-level contrastive loss to reduce its interference. Second, in the\nhybrid-supervised training phase, we calculate the distance metric among\nfeatures to obtain corrected labels, which are used as complementary\nsupervision to guide the training. Extensive experiments on a wide range of\nnoisy levels demonstrate that our method significantly improves the robustness\nof the action recognition model and surpasses the baselines by a clear margin.\n","authors":["Haochen Han","Qinghua Zheng","Minnan Luo","Kaiyao Miao","Feng Tian","Yan Chen"],"pdf_url":"https://arxiv.org/pdf/2205.07611v3.pdf","comment":"This work has been submitted to the IEEE for possible publication.\n Copyright may be transferred without notice, after which this version may no\n longer be accessible"},{"id":"http://arxiv.org/abs/2309.03905v2","updated":"2023-09-11T20:25:16Z","published":"2023-09-07T17:59:45Z","title":"ImageBind-LLM: Multi-modality Instruction Tuning","summary":" We present ImageBind-LLM, a multi-modality instruction tuning method of large\nlanguage models (LLMs) via ImageBind. Existing works mainly focus on language\nand image instruction tuning, different from which, our ImageBind-LLM can\nrespond to multi-modality conditions, including audio, 3D point clouds, video,\nand their embedding-space arithmetic by only image-text alignment training.\nDuring training, we adopt a learnable bind network to align the embedding space\nbetween LLaMA and ImageBind's image encoder. Then, the image features\ntransformed by the bind network are added to word tokens of all layers in\nLLaMA, which progressively injects visual instructions via an attention-free\nand zero-initialized gating mechanism. Aided by the joint embedding of\nImageBind, the simple image-text training enables our model to exhibit superior\nmulti-modality instruction-following capabilities. 
During inference, the\nmulti-modality inputs are fed into the corresponding ImageBind encoders, and\nprocessed by a proposed visual cache model for further cross-modal embedding\nenhancement. The training-free cache model retrieves from three million image\nfeatures extracted by ImageBind, which effectively mitigates the\ntraining-inference modality discrepancy. Notably, with our approach,\nImageBind-LLM can respond to instructions of diverse modalities and demonstrate\nsignificant language generation quality. Code is released at\nhttps://github.com/OpenGVLab/LLaMA-Adapter.\n","authors":["Jiaming Han","Renrui Zhang","Wenqi Shao","Peng Gao","Peng Xu","Han Xiao","Kaipeng Zhang","Chris Liu","Song Wen","Ziyu Guo","Xudong Lu","Shuai Ren","Yafei Wen","Xiaoxin Chen","Xiangyu Yue","Hongsheng Li","Yu Qiao"],"pdf_url":"https://arxiv.org/pdf/2309.03905v2.pdf","comment":"Code is available at https://github.com/OpenGVLab/LLaMA-Adapter"}]},"2023-09-10T00:00:00Z":{"Computation and Language":[{"id":"http://arxiv.org/abs/2309.05162v1","updated":"2023-09-10T23:00:35Z","published":"2023-09-10T23:00:35Z","title":"Collecting Visually-Grounded Dialogue with A Game Of Sorts","summary":" An idealized, though simplistic, view of the referring expression production\nand grounding process in (situated) dialogue assumes that a speaker must merely\nappropriately specify their expression so that the target referent may be\nsuccessfully identified by the addressee. However, referring in conversation is\na collaborative process that cannot be aptly characterized as an exchange of\nminimally-specified referring expressions. Concerns have been raised regarding\nassumptions made by prior work on visually-grounded dialogue that reveal an\noversimplified view of conversation and the referential process. We address\nthese concerns by introducing a collaborative image ranking task, a grounded\nagreement game we call \"A Game Of Sorts\". In our game, players are tasked with\nreaching agreement on how to rank a set of images given some sorting criterion\nthrough a largely unrestricted, role-symmetric dialogue. By putting emphasis on\nthe argumentation in this mixed-initiative interaction, we collect discussions\nthat involve the collaborative referential process. We describe results of a\nsmall-scale data collection experiment with the proposed task. All discussed\nmaterials, which includes the collected data, the codebase, and a containerized\nversion of the application, are publicly available.\n","authors":["Bram Willemsen","Dmytro Kalpakchi","Gabriel Skantze"],"pdf_url":"https://arxiv.org/pdf/2309.05162v1.pdf","comment":"Published at LREC 2022"},{"id":"http://arxiv.org/abs/2309.05142v1","updated":"2023-09-10T21:23:09Z","published":"2023-09-10T21:23:09Z","title":"Large Language Models for Difficulty Estimation of Foreign Language\n Content with Application to Language Learning","summary":" We use large language models to aid learners enhance proficiency in a foreign\nlanguage. This is accomplished by identifying content on topics that the user\nis interested in, and that closely align with the learner's proficiency level\nin that foreign language. Our work centers on French content, but our approach\nis readily transferable to other languages. 
Our solution offers several\ndistinctive characteristics that differentiate it from existing\nlanguage-learning solutions, such as, a) the discovery of content across topics\nthat the learner cares about, thus increasing motivation, b) a more precise\nestimation of the linguistic difficulty of the content than traditional\nreadability measures, and c) the availability of both textual and video-based\ncontent. The linguistic complexity of video content is derived from the video\ncaptions. It is our aspiration that such technology will enable learners to\nremain engaged in the language-learning process by continuously adapting the\ntopics and the difficulty of the content to align with the learners' evolving\ninterests and learning objectives.\n","authors":["Michalis Vlachos","Mircea Lungu","Yash Raj Shrestha","Johannes-Rudolf David"],"pdf_url":"https://arxiv.org/pdf/2309.05142v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2211.04476v2","updated":"2023-09-10T20:04:30Z","published":"2022-11-08T19:00:00Z","title":"Discover, Explanation, Improvement: An Automatic Slice Detection\n Framework for Natural Language Processing","summary":" Pretrained natural language processing (NLP) models have achieved high\noverall performance, but they still make systematic errors. Instead of manual\nerror analysis, research on slice detection models (SDM), which automatically\nidentify underperforming groups of datapoints, has caught escalated attention\nin Computer Vision for both understanding model behaviors and providing\ninsights for future model training and designing. However, little research on\nSDM and quantitative evaluation of their effectiveness have been conducted on\nNLP tasks. Our paper fills the gap by proposing a benchmark named \"Discover,\nExplain, Improve (DEIM)\" for classification NLP tasks along with a new SDM\nEdisa. Edisa discovers coherent and underperforming groups of datapoints; DEIM\nthen unites them under human-understandable concepts and provides comprehensive\nevaluation tasks and corresponding quantitative metrics. The evaluation in DEIM\nshows that Edisa can accurately select error-prone datapoints with informative\nsemantic features that summarize error patterns. Detecting difficult datapoints\ndirectly boosts model performance without tuning any original model parameters,\nshowing that discovered slices are actionable for users.\n","authors":["Wenyue Hua","Lifeng Jin","Linfeng Song","Haitao Mi","Yongfeng Zhang","Dong Yu"],"pdf_url":"https://arxiv.org/pdf/2211.04476v2.pdf","comment":"15 pages, 5 figures, accepted by Transactions of the Association for\n Computational Linguistics"},{"id":"http://arxiv.org/abs/2309.05103v1","updated":"2023-09-10T18:13:11Z","published":"2023-09-10T18:13:11Z","title":"AGent: A Novel Pipeline for Automatically Creating Unanswerable\n Questions","summary":" The development of large high-quality datasets and high-performing models\nhave led to significant advancements in the domain of Extractive Question\nAnswering (EQA). This progress has sparked considerable interest in exploring\nunanswerable questions within the EQA domain. Training EQA models with\nunanswerable questions helps them avoid extracting misleading or incorrect\nanswers for queries that lack valid responses. However, manually annotating\nunanswerable questions is labor-intensive. 
To address this, we propose AGent, a\nnovel pipeline that automatically creates new unanswerable questions by\nre-matching a question with a context that lacks the necessary information for\na correct answer. In this paper, we demonstrate the usefulness of this AGent\npipeline by creating two sets of unanswerable questions from answerable\nquestions in SQuAD and HotpotQA. These created question sets exhibit low error\nrates. Additionally, models fine-tuned on these questions show comparable\nperformance with those fine-tuned on the SQuAD 2.0 dataset on multiple EQA\nbenchmarks.\n","authors":["Son Quoc Tran","Gia-Huy Do","Phong Nguyen-Thuan Do","Matt Kretchmar","Xinya Du"],"pdf_url":"https://arxiv.org/pdf/2309.05103v1.pdf","comment":"16 pages, 10 tables, 3 figures"},{"id":"http://arxiv.org/abs/2308.13916v3","updated":"2023-09-10T17:42:37Z","published":"2023-08-26T16:51:17Z","title":"Exploring Large Language Models for Knowledge Graph Completion","summary":" Knowledge graphs play a vital role in numerous artificial intelligence tasks,\nyet they frequently face the issue of incompleteness. In this study, we explore\nutilizing Large Language Models (LLM) for knowledge graph completion. We\nconsider triples in knowledge graphs as text sequences and introduce an\ninnovative framework called Knowledge Graph LLM (KG-LLM) to model these\ntriples. Our technique employs entity and relation descriptions of a triple as\nprompts and utilizes the response for predictions. Experiments on various\nbenchmark knowledge graphs demonstrate that our method attains state-of-the-art\nperformance in tasks such as triple classification and relation prediction. We\nalso find that fine-tuning relatively smaller models (e.g., LLaMA-7B,\nChatGLM-6B) outperforms recent ChatGPT and GPT-4.\n","authors":["Liang Yao","Jiazhen Peng","Chengsheng Mao","Yuan Luo"],"pdf_url":"https://arxiv.org/pdf/2308.13916v3.pdf","comment":"Work in progress"},{"id":"http://arxiv.org/abs/2309.05086v1","updated":"2023-09-10T17:13:25Z","published":"2023-09-10T17:13:25Z","title":"Neural-Hidden-CRF: A Robust Weakly-Supervised Sequence Labeler","summary":" We propose a neuralized undirected graphical model called Neural-Hidden-CRF\nto solve the weakly-supervised sequence labeling problem. Under the umbrella of\nprobabilistic undirected graph theory, the proposed Neural-Hidden-CRF embedded\nwith a hidden CRF layer models the variables of word sequence, latent ground\ntruth sequence, and weak label sequence with the global perspective that\nundirected graphical models particularly enjoy. In Neural-Hidden-CRF, we can\ncapitalize on the powerful language model BERT or other deep models to provide\nrich contextual semantic knowledge to the latent ground truth sequence, and use\nthe hidden CRF layer to capture the internal label dependencies.\nNeural-Hidden-CRF is conceptually simple and empirically powerful. 
It obtains\nnew state-of-the-art results on one crowdsourcing benchmark and three\nweak-supervision benchmarks, including outperforming the recent advanced model\nCHMM by 2.80 F1 points and 2.23 F1 points in average generalization and\ninference performance, respectively.\n","authors":["Zhijun Chen","Hailong Sun","Wanhao Zhang","Chunyi Xu","Qianren Mao","Pengpeng Chen"],"pdf_url":"https://arxiv.org/pdf/2309.05086v1.pdf","comment":"13 pages, 4 figures, accepted by SIGKDD-2023"},{"id":"http://arxiv.org/abs/2308.11257v2","updated":"2023-09-10T17:05:36Z","published":"2023-08-22T08:00:50Z","title":"HopPG: Self-Iterative Program Generation for Multi-Hop Question\n Answering over Heterogeneous Knowledge","summary":" The semantic parsing-based method is an important research branch for\nknowledge-based question answering. It usually generates executable programs\nbased on the question and then executes them to derive answers over a knowledge\nbase. Benefiting from this inherent mechanism, it has advantages in both\nperformance and interpretability. However, traditional semantic parsing\nmethods usually generate a complete program before executing it, which\nstruggles with multi-hop question answering over heterogeneous knowledge. On\nthe one hand, generating a complete multi-hop program relies on multiple\nheterogeneous supporting facts, and it is difficult for generators to\nunderstand these facts simultaneously. On the other hand, this approach ignores the\nsemantic information of the intermediate answers at each hop, which is\nbeneficial for subsequent generation. To alleviate these challenges, we propose\na self-iterative framework for multi-hop program generation (HopPG) over\nheterogeneous knowledge, which leverages the previous execution results to\nretrieve supporting facts and generate subsequent programs hop by hop. We\nevaluate our model on MMQA-T^2, and the experimental results show that HopPG\noutperforms existing semantic-parsing-based baselines, especially on the\nmulti-hop questions.\n","authors":["Yingyao Wang","Yongwei Zhou","Chaoqun Duan","Junwei Bao","Tiejun Zhao"],"pdf_url":"https://arxiv.org/pdf/2308.11257v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.05076v1","updated":"2023-09-10T16:55:49Z","published":"2023-09-10T16:55:49Z","title":"An Appraisal-Based Chain-Of-Emotion Architecture for Affective Language\n Model Game Agents","summary":" The development of believable, natural, and interactive digital artificial\nagents is a field of growing interest. Theoretical uncertainties and technical\nbarriers present considerable challenges to the field, particularly with\nregard to developing agents that effectively simulate human emotions. Large\nlanguage models (LLMs) might address these issues by tapping common patterns in\nsituational appraisal. In three empirical experiments, this study tests the\ncapabilities of LLMs to solve emotional intelligence tasks and to simulate\nemotions. It presents and evaluates a new chain-of-emotion architecture for\nemotion simulation within video games, based on psychological appraisal\nresearch. Results show that it outperforms standard LLM architectures on a\nrange of user experience and content analysis metrics.
This study therefore\nprovides early evidence of how to construct and test affective agents based on\ncognitive processes represented in language models.\n","authors":["Maximilian Croissant","Madeleine Frister","Guy Schofield","Cade McCall"],"pdf_url":"https://arxiv.org/pdf/2309.05076v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2305.18365v2","updated":"2023-09-10T16:37:36Z","published":"2023-05-27T14:17:33Z","title":"What can Large Language Models do in chemistry? A comprehensive\n benchmark on eight tasks","summary":" Large Language Models (LLMs) with strong abilities in natural language\nprocessing tasks have emerged and have been applied in various kinds of areas\nsuch as science, finance and software engineering. However, the capability of\nLLMs to advance the field of chemistry remains unclear. In this paper, rather\nthan pursuing state-of-the-art performance, we aim to evaluate capabilities of\nLLMs in a wide range of tasks across the chemistry domain. We identify three\nkey chemistry-related capabilities including understanding, reasoning and\nexplaining to explore in LLMs and establish a benchmark containing eight\nchemistry tasks. Our analysis draws on widely recognized datasets facilitating\na broad exploration of the capacities of LLMs within the context of practical\nchemistry. Five LLMs (GPT-4, GPT-3.5, Davinci-003, Llama and Galactica) are\nevaluated for each chemistry task in zero-shot and few-shot in-context learning\nsettings with carefully selected demonstration examples and specially crafted\nprompts. Our investigation found that GPT-4 outperformed other models and LLMs\nexhibit different competitive levels in eight chemistry tasks. In addition to\nthe key findings from the comprehensive benchmark analysis, our work provides\ninsights into the limitation of current LLMs and the impact of in-context\nlearning settings on LLMs' performance across various chemistry tasks. The code\nand datasets used in this study are available at\nhttps://github.com/ChemFoundationModels/ChemLLMBench.\n","authors":["Taicheng Guo","Kehan Guo","Bozhao Nan","Zhenwen Liang","Zhichun Guo","Nitesh V. Chawla","Olaf Wiest","Xiangliang Zhang"],"pdf_url":"https://arxiv.org/pdf/2305.18365v2.pdf","comment":"Add extra LLMs experiments; more baselines and more investigations on\n SELFIES, label interpretation, etc"},{"id":"http://arxiv.org/abs/2309.05044v1","updated":"2023-09-10T14:46:31Z","published":"2023-09-10T14:46:31Z","title":"The Effect of Alignment Objectives on Code-Switching Translation","summary":" One of the things that need to change when it comes to machine translation is\nthe models' ability to translate code-switching content, especially with the\nrise of social media and user-generated content. In this paper, we are\nproposing a way of training a single machine translation model that is able to\ntranslate monolingual sentences from one language to another, along with\ntranslating code-switched sentences to either language. This model can be\nconsidered a bilingual model in the human sense. For better use of parallel\ndata, we generated synthetic code-switched (CSW) data along with an alignment\nloss on the encoder to align representations across languages. 
Using the WMT14\nEnglish-French (En-Fr) dataset, the trained model strongly outperforms\nbidirectional baselines on code-switched translation while maintaining quality\nfor non-code-switched (monolingual) data.\n","authors":["Mohamed Anwar"],"pdf_url":"https://arxiv.org/pdf/2309.05044v1.pdf","comment":"This paper was originally submitted on 30/06/2022"},{"id":"http://arxiv.org/abs/2309.01940v3","updated":"2023-09-10T13:32:38Z","published":"2023-09-05T04:12:01Z","title":"CodeApex: A Bilingual Programming Evaluation Benchmark for Large\n Language Models","summary":" With the emergence of Large Language Models (LLMs), there has been a\nsignificant improvement in the programming capabilities of models, attracting\ngrowing attention from researchers. We propose CodeApex, a bilingual benchmark\ndataset focusing on the programming comprehension and code generation abilities\nof LLMs. CodeApex comprises three types of multiple-choice questions:\nconceptual understanding, commonsense reasoning, and multi-hop reasoning,\ndesigned to evaluate LLMs on programming comprehension tasks. Additionally,\nCodeApex utilizes algorithmic questions and corresponding test cases to assess\nthe code quality generated by LLMs. We evaluate 14 state-of-the-art LLMs,\nincluding both general-purpose and specialized models. GPT exhibits the best\nprogramming capabilities, achieving approximate accuracies of 50% and 56% on\nthe two tasks, respectively. There is still significant room for improvement in\nprogramming tasks. We hope that CodeApex can serve as a reference for\nevaluating the coding capabilities of LLMs, further promoting their development\nand growth. Datasets are released at https://github.com/APEXLAB/CodeApex.git.\nCodeApex submission website is https://apex.sjtu.edu.cn/codeapex/.\n","authors":["Lingyue Fu","Huacan Chai","Shuang Luo","Kounianhua Du","Weiming Zhang","Longteng Fan","Jiayi Lei","Renting Rui","Jianghao Lin","Yuchen Fang","Yifan Liu","Jingkuan Wang","Siyuan Qi","Kangning Zhang","Weinan Zhang","Yong Yu"],"pdf_url":"https://arxiv.org/pdf/2309.01940v3.pdf","comment":"21 pages"},{"id":"http://arxiv.org/abs/2212.09746v4","updated":"2023-09-10T13:31:08Z","published":"2022-12-19T18:59:45Z","title":"Evaluating Human-Language Model Interaction","summary":" Many real-world applications of language models (LMs), such as writing\nassistance and code autocomplete, involve human-LM interaction. However, most\nbenchmarks are non-interactive in that a model produces output without human\ninvolvement. To evaluate human-LM interaction, we develop a new framework,\nHuman-AI Language-based Interaction Evaluation (HALIE), that defines the\ncomponents of interactive systems and dimensions to consider when designing\nevaluation metrics. Compared to standard, non-interactive evaluation, HALIE\ncaptures (i) the interactive process, not only the final output; (ii) the\nfirst-person subjective experience, not just a third-party assessment; and\n(iii) notions of preference beyond quality (e.g., enjoyment and ownership). We\nthen design five tasks to cover different forms of interaction: social\ndialogue, question answering, crossword puzzles, summarization, and metaphor\ngeneration. With four state-of-the-art LMs (three variants of OpenAI's GPT-3\nand AI21 Labs' Jurassic-1), we find that better non-interactive performance\ndoes not always translate to better human-LM interaction. 
In particular, we\nhighlight three cases where the results from non-interactive and interactive\nmetrics diverge and underscore the importance of human-LM interaction for LM\nevaluation.\n","authors":["Mina Lee","Megha Srivastava","Amelia Hardy","John Thickstun","Esin Durmus","Ashwin Paranjape","Ines Gerard-Ursin","Xiang Lisa Li","Faisal Ladhak","Frieda Rong","Rose E. Wang","Minae Kwon","Joon Sung Park","Hancheng Cao","Tony Lee","Rishi Bommasani","Michael Bernstein","Percy Liang"],"pdf_url":"https://arxiv.org/pdf/2212.09746v4.pdf","comment":"Authored by the Center for Research on Foundation Models (CRFM) at\n the Stanford Institute for Human-Centered Artificial Intelligence (HAI)"},{"id":"http://arxiv.org/abs/2306.13047v2","updated":"2023-09-10T13:21:05Z","published":"2023-06-22T17:13:08Z","title":"CamChoice: A Corpus of Multiple Choice Questions and Candidate Response\n Distributions","summary":" Multiple choice exams are widely used to assess candidates across a diverse\nrange of domains and tasks. To moderate question quality, newly proposed\nquestions often pass through pre-test evaluation stages before being deployed\ninto real-world exams. Currently, this evaluation process is manually\nintensive, which can lead to time lags in the question development cycle.\nStreamlining this process via automation can significantly enhance efficiency,\nhowever, there's a current lack of datasets with adequate pre-test analysis\ninformation. In this paper we introduce CamChoice; a multiple-choice\ncomprehension dataset of questions at different target levels, with\ncorresponding candidate selection distributions. We introduce the task of\ncandidate distribution matching, propose several evaluation metrics for the\ntask, and demonstrate that automatic systems trained on RACE++ can be leveraged\nas baselines for our task. We further demonstrate that these automatic systems\ncan be used for practical pre-test evaluation tasks such as detecting\nunderperforming distractors, where our detection systems can automatically\nidentify poor distractors that few candidates select. We release the data\npublicly for future research.\n","authors":["Adian Liusie","Vatsal Raina","Andrew Mullooly","Kate Knill","Mark J. F. Gales"],"pdf_url":"https://arxiv.org/pdf/2306.13047v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.05021v1","updated":"2023-09-10T13:06:45Z","published":"2023-09-10T13:06:45Z","title":"Chat2Brain: A Method for Mapping Open-Ended Semantic Queries to Brain\n Activation Maps","summary":" Over decades, neuroscience has accumulated a wealth of research results in\nthe text modality that can be used to explore cognitive processes.\nMeta-analysis is a typical method that successfully establishes a link from\ntext queries to brain activation maps using these research results, but it\nstill relies on an ideal query environment. In practical applications, text\nqueries used for meta-analyses may encounter issues such as semantic redundancy\nand ambiguity, resulting in an inaccurate mapping to brain images. On the other\nhand, large language models (LLMs) like ChatGPT have shown great potential in\ntasks such as context understanding and reasoning, displaying a high degree of\nconsistency with human natural language. Hence, LLMs could improve the\nconnection between text modality and neuroscience, resolving existing\nchallenges of meta-analyses. 
In this study, we propose a method called\nChat2Brain that combines LLMs to basic text-2-image model, known as Text2Brain,\nto map open-ended semantic queries to brain activation maps in data-scarce and\ncomplex query environments. By utilizing the understanding and reasoning\ncapabilities of LLMs, the performance of the mapping model is optimized by\ntransferring text queries to semantic queries. We demonstrate that Chat2Brain\ncan synthesize anatomically plausible neural activation patterns for more\ncomplex tasks of text queries.\n","authors":["Yaonai Wei","Tuo Zhang","Han Zhang","Tianyang Zhong","Lin Zhao","Zhengliang Liu","Chong Ma","Songyao Zhang","Muheng Shang","Lei Du","Xiao Li","Tianming Liu","Junwei Han"],"pdf_url":"https://arxiv.org/pdf/2309.05021v1.pdf","comment":"8 pages, 4 figures"},{"id":"http://arxiv.org/abs/2309.05007v1","updated":"2023-09-10T11:58:29Z","published":"2023-09-10T11:58:29Z","title":"FOLLOWUPQG: Towards Information-Seeking Follow-up Question Generation","summary":" Humans ask follow-up questions driven by curiosity, which reflects a creative\nhuman cognitive process. We introduce the task of real-world\ninformation-seeking follow-up question generation (FQG), which aims to generate\nfollow-up questions seeking a more in-depth understanding of an initial\nquestion and answer. We construct FOLLOWUPQG, a dataset of over 3K real-world\n(initial question, answer, follow-up question) tuples collected from a Reddit\nforum providing layman-friendly explanations for open-ended questions. In\ncontrast to existing datasets, questions in FOLLOWUPQG use more diverse\npragmatic strategies to seek information, and they also show higher-order\ncognitive skills (such as applying and relating). We evaluate current question\ngeneration models on their efficacy for generating follow-up questions,\nexploring how to generate specific types of follow-up questions based on\nstep-by-step demonstrations. Our results validate FOLLOWUPQG as a challenging\nbenchmark, as model-generated questions are adequate but far from human-raised\nquestions in terms of informativeness and complexity.\n","authors":["Yan Meng","Liangming Pan","Yixin Cao","Min-Yen Kan"],"pdf_url":"https://arxiv.org/pdf/2309.05007v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.04992v1","updated":"2023-09-10T10:57:41Z","published":"2023-09-10T10:57:41Z","title":"Mitigating Word Bias in Zero-shot Prompt-based Classifiers","summary":" Prompt-based classifiers are an attractive approach for zero-shot\nclassification. However, the precise choice of the prompt template and label\nwords can largely influence performance, with semantically equivalent settings\noften showing notable performance difference. This discrepancy can be partly\nattributed to word biases, where the classifier may be biased towards classes.\nTo address this problem, it is possible to optimise classification thresholds\non a labelled data set, however, this mitigates some of the advantages of\nprompt-based classifiers. This paper instead approaches this problem by\nexamining the expected marginal probabilities of the classes. Here,\nprobabilities are reweighted to have a uniform prior over classes, in an\nunsupervised fashion. Further, we draw a theoretical connection between the\nclass priors and the language models' word prior, and offer the ability to set\na threshold in a zero-resource fashion. 
We show that matching class priors\ncorrelates strongly with the oracle upper bound performance and demonstrate\nlarge consistent performance gains for prompt settings over a range of NLP\ntasks.\n","authors":["Adian Liusie","Potsawee Manakul","Mark J. F. Gales"],"pdf_url":"https://arxiv.org/pdf/2309.04992v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.03929v2","updated":"2023-09-10T10:06:22Z","published":"2023-08-07T22:13:30Z","title":"Challenging the Machinery of Generative AI with Fact-Checking:\n Ontology-Driven Biological Graphs for Verifying Human Disease-Gene Links","summary":" Methods: we adopted a biological networks approach that enables the\nsystematic interrogation of ChatGPT's linked entities. In particular, we\ndesigned an ontology-driven fact-checking algorithm that compares biological\ngraphs constructed from approximately 200,000 PubMed abstracts with\ncounterparts constructed from a dataset generated using the ChatGPT-3.5 Turbo\nmodel. The nodes refer to biological entities (genes and diseases) that occur\nin the text. The edges represent the co-occurrence relationships of two\nentities mentioned in the same document, weighted by the proximity distance\nbetween these two entities. This research assumes a ``closed-world\nassumption'', meaning that fact-checking is performed only using the literature\ndataset as our ground truth. Results: in ten samples of 250 randomly selected\nrecords from the ChatGPT dataset of 1000 ``simulated'' articles , the\nfact-checking link accuracy ranged from 70% to 86%, while the remainder of the\nlinks remained unverified. Given the closed world assumption, the fact-checking\nprecision is significant. When measuring and comparing the proximity distances\nof the edges of literature graphs against ChatGPT graphs we found that the\nChatGPT distances were significantly shorter (ranging from 90 to 153) character\ndistance. In contrast, the proximity distance of biological entities identified\nin the literature ranged from 236 to 765 character distance. This pattern held\ntrue for all the relationships among biological entities in the ten samples.\nConclusion: this study demonstrated a reasonably high percentage accuracy of\naggregate fact-checking of disease-gene relationships found in\nChatGPT-generated texts. The strikingly consistent pattern of short proximity\ndistances across all samples offers an illuminating feedback to the biological\nknowledge we possess in the literature today.\n","authors":["Ahmed Abdeen Hamed","Byung Suk Lee","Alessandro Crimi","Magdalena M. Misiak"],"pdf_url":"https://arxiv.org/pdf/2308.03929v2.pdf","comment":"9 Pages, 3 algorithms, 5 tables, and 8 figures"},{"id":"http://arxiv.org/abs/2309.04979v1","updated":"2023-09-10T10:05:03Z","published":"2023-09-10T10:05:03Z","title":"Retrieval-Augmented Meta Learning for Low-Resource Text Classification","summary":" Meta learning have achieved promising performance in low-resource text\nclassification which aims to identify target classes with knowledge transferred\nfrom source classes with sets of small tasks named episodes. However, due to\nthe limited training data in the meta-learning scenario and the inherent\nproperties of parameterized neural networks, poor generalization performance\nhas become a pressing problem that needs to be addressed. To deal with this\nissue, we propose a meta-learning based method called Retrieval-Augmented Meta\nLearning(RAML). 
It not only uses parameterization for inference but also\nretrieves non-parametric knowledge from an external corpus to make inferences,\nwhich greatly alleviates the problem of poor generalization performance caused\nby the lack of diverse training data in meta-learning. This method differs from\nprevious models that solely rely on parameters, as it explicitly emphasizes the\nimportance of non-parametric knowledge, aiming to strike a balance between\nparameterized neural networks and non-parametric knowledge. The model is\nrequired to determine which knowledge to access and utilize during inference.\nAdditionally, our multi-view passages fusion network module can effectively and\nefficiently integrate the retrieved information into low-resource\nclassification task. The extensive experiments demonstrate that RAML\nsignificantly outperforms current SOTA low-resource text classification models.\n","authors":["Rongsheng Li","Yangning Li","Yinghui Li","Chaiyut Luoyiching","Hai-Tao Zheng","Nannan Zhou","Hanjing Su"],"pdf_url":"https://arxiv.org/pdf/2309.04979v1.pdf","comment":"Under Review"},{"id":"http://arxiv.org/abs/2309.04977v1","updated":"2023-09-10T09:46:38Z","published":"2023-09-10T09:46:38Z","title":"RGAT: A Deeper Look into Syntactic Dependency Information for\n Coreference Resolution","summary":" Although syntactic information is beneficial for many NLP tasks, combining it\nwith contextual information between words to solve the coreference resolution\nproblem needs to be further explored. In this paper, we propose an end-to-end\nparser that combines pre-trained BERT with a Syntactic Relation Graph Attention\nNetwork (RGAT) to take a deeper look into the role of syntactic dependency\ninformation for the coreference resolution task. In particular, the RGAT model\nis first proposed, then used to understand the syntactic dependency graph and\nlearn better task-specific syntactic embeddings. An integrated architecture\nincorporating BERT embeddings and syntactic embeddings is constructed to\ngenerate blending representations for the downstream task. Our experiments on a\npublic Gendered Ambiguous Pronouns (GAP) dataset show that with the supervision\nlearning of the syntactic dependency graph and without fine-tuning the entire\nBERT, we increased the F1-score of the previous best model (RGCN-with-BERT)\nfrom 80.3% to 82.5%, compared to the F1-score by single BERT embeddings from\n78.5% to 82.5%. Experimental results on another public dataset - OntoNotes 5.0\ndemonstrate that the performance of the model is also improved by incorporating\nsyntactic dependency information learned from RGAT.\n","authors":["Yuan Meng","Xuhao Pan","Jun Chang","Yue Wang"],"pdf_url":"https://arxiv.org/pdf/2309.04977v1.pdf","comment":"8 pages, 5 figures"},{"id":"http://arxiv.org/abs/2309.04971v1","updated":"2023-09-10T09:16:38Z","published":"2023-09-10T09:16:38Z","title":"Prompt Learning With Knowledge Memorizing Prototypes For Generalized\n Few-Shot Intent Detection","summary":" Generalized Few-Shot Intent Detection (GFSID) is challenging and realistic\nbecause it needs to categorize both seen and novel intents simultaneously.\nPrevious GFSID methods rely on the episodic learning paradigm, which makes it\nhard to extend to a generalized setup as they do not explicitly learn the\nclassification of seen categories and the knowledge of seen intents. To address\nthe dilemma, we propose to convert the GFSID task into the class incremental\nlearning paradigm. 
Specifically, we propose a two-stage learning framework,\nwhich sequentially learns the knowledge of different intents in various periods\nvia prompt learning. And then we exploit prototypes for categorizing both seen\nand novel intents. Furthermore, to achieve the transfer knowledge of intents in\ndifferent stages, for different scenarios we design two knowledge preservation\nmethods which close to realistic applications. Extensive experiments and\ndetailed analyses on two widely used datasets show that our framework based on\nthe class incremental learning paradigm achieves promising performance.\n","authors":["Chaiyut Luoyiching","Yangning Li","Yinghui Li","Rongsheng Li","Hai-Tao Zheng","Nannan Zhou","Hanjing Su"],"pdf_url":"https://arxiv.org/pdf/2309.04971v1.pdf","comment":"Under Review"},{"id":"http://arxiv.org/abs/2309.04965v1","updated":"2023-09-10T08:55:24Z","published":"2023-09-10T08:55:24Z","title":"Prefix-diffusion: A Lightweight Diffusion Model for Diverse Image\n Captioning","summary":" While impressive performance has been achieved in image captioning, the\nlimited diversity of the generated captions and the large parameter scale\nremain major barriers to the real-word application of these systems. In this\nwork, we propose a lightweight image captioning network in combination with\ncontinuous diffusion, called Prefix-diffusion. To achieve diversity, we design\nan efficient method that injects prefix image embeddings into the denoising\nprocess of the diffusion model. In order to reduce trainable parameters, we\nemploy a pre-trained model to extract image features and further design an\nextra mapping network. Prefix-diffusion is able to generate diverse captions\nwith relatively less parameters, while maintaining the fluency and relevance of\nthe captions benefiting from the generative capabilities of the diffusion\nmodel. Our work paves the way for scaling up diffusion models for image\ncaptioning, and achieves promising performance compared with recent approaches.\n","authors":["Guisheng Liu","Yi Li","Zhengcong Fei","Haiyan Fu","Xiangyang Luo","Yanqing Guo"],"pdf_url":"https://arxiv.org/pdf/2309.04965v1.pdf","comment":"11 pages,4 figures, 6 tables"},{"id":"http://arxiv.org/abs/2309.04951v1","updated":"2023-09-10T07:43:42Z","published":"2023-09-10T07:43:42Z","title":"Multi-document Summarization: A Comparative Evaluation","summary":" This paper is aimed at evaluating state-of-the-art models for Multi-document\nSummarization (MDS) on different types of datasets in various domains and\ninvestigating the limitations of existing models to determine future research\ndirections. To address this gap, we conducted an extensive literature review to\nidentify state-of-the-art models and datasets. We analyzed the performance of\nPRIMERA and PEGASUS models on BigSurvey-MDS and MS$^2$ datasets, which posed\nunique challenges due to their varied domains. Our findings show that the\nGeneral-Purpose Pre-trained Model LED outperforms PRIMERA and PEGASUS on the\nMS$^2$ dataset. We used the ROUGE score as a performance metric to evaluate the\nidentified models on different datasets. Our study provides valuable insights\ninto the models' strengths and weaknesses, as well as their applicability in\ndifferent domains. 
This work serves as a reference for future MDS research and\ncontributes to the development of accurate and robust models which can be\nutilized on demanding datasets with academically and/or scientifically complex\ndata as well as generalized, relatively simple datasets.\n","authors":["Kushan Hewapathirana","Nisansa de Silva","C. D. Athuraliya"],"pdf_url":"https://arxiv.org/pdf/2309.04951v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.04940v1","updated":"2023-09-10T06:10:03Z","published":"2023-09-10T06:10:03Z","title":"What's Hard in English RST Parsing? Predictive Models for Error Analysis","summary":" Despite recent advances in Natural Language Processing (NLP), hierarchical\ndiscourse parsing in the framework of Rhetorical Structure Theory remains\nchallenging, and our understanding of the reasons for this are as yet limited.\nIn this paper, we examine and model some of the factors associated with parsing\ndifficulties in previous work: the existence of implicit discourse relations,\nchallenges in identifying long-distance relations, out-of-vocabulary items, and\nmore. In order to assess the relative importance of these variables, we also\nrelease two annotated English test-sets with explicit correct and distracting\ndiscourse markers associated with gold standard RST relations. Our results show\nthat as in shallow discourse parsing, the explicit/implicit distinction plays a\nrole, but that long-distance dependencies are the main challenge, while lack of\nlexical overlap is less of a problem, at least for in-domain parsing. Our final\nmodel is able to predict where errors will occur with an accuracy of 76.3% for\nthe bottom-up parser and 76.6% for the top-down parser.\n","authors":["Yang Janet Liu","Tatsuya Aoyama","Amir Zeldes"],"pdf_url":"https://arxiv.org/pdf/2309.04940v1.pdf","comment":"SIGDIAL 2023 camera-ready; 12 pages"},{"id":"http://arxiv.org/abs/2309.04919v1","updated":"2023-09-10T02:55:12Z","published":"2023-09-10T02:55:12Z","title":"Unsupervised Chunking with Hierarchical RNN","summary":" In Natural Language Processing (NLP), predicting linguistic structures, such\nas parsing and chunking, has mostly relied on manual annotations of syntactic\nstructures. This paper introduces an unsupervised approach to chunking, a\nsyntactic task that involves grouping words in a non-hierarchical manner. We\npresent a two-layer Hierarchical Recurrent Neural Network (HRNN) designed to\nmodel word-to-chunk and chunk-to-sentence compositions. Our approach involves a\ntwo-stage training process: pretraining with an unsupervised parser and\nfinetuning on downstream NLP tasks. Experiments on the CoNLL-2000 dataset\nreveal a notable improvement over existing unsupervised methods, enhancing\nphrase F1 score by up to 6 percentage points. Further, finetuning with\ndownstream tasks results in an additional performance improvement.\nInterestingly, we observe that the emergence of the chunking structure is\ntransient during the neural model's downstream-task training. 
This study\ncontributes to the advancement of unsupervised syntactic structure discovery\nand opens avenues for further research in linguistic theory.\n","authors":["Zijun Wu","Anup Anand Deshmukh","Yongkang Wu","Jimmy Lin","Lili Mou"],"pdf_url":"https://arxiv.org/pdf/2309.04919v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2110.00269v4","updated":"2023-09-10T02:46:02Z","published":"2021-10-01T08:51:58Z","title":"A Survey of Knowledge Enhanced Pre-trained Models","summary":" Pre-trained language models learn informative word representations on a\nlarge-scale text corpus through self-supervised learning, which has achieved\npromising performance in fields of natural language processing (NLP) after\nfine-tuning. These models, however, suffer from poor robustness and lack of\ninterpretability. We refer to pre-trained language models with knowledge\ninjection as knowledge-enhanced pre-trained language models (KEPLMs). These\nmodels demonstrate deep understanding and logical reasoning and introduce\ninterpretability. In this survey, we provide a comprehensive overview of KEPLMs\nin NLP. We first discuss the advancements in pre-trained language models and\nknowledge representation learning. Then we systematically categorize existing\nKEPLMs from three different perspectives. Finally, we outline some potential\ndirections of KEPLMs for future research.\n","authors":["Jian Yang","Xinyu Hu","Gang Xiao","Yulong Shen"],"pdf_url":"https://arxiv.org/pdf/2110.00269v4.pdf","comment":"32 pages, 15 figures"}],"Computer Vision and Pattern Recognition":[{"id":"http://arxiv.org/abs/2203.13278v3","updated":"2023-09-10T23:16:47Z","published":"2022-03-24T18:11:31Z","title":"Practical Blind Image Denoising via Swin-Conv-UNet and Data Synthesis","summary":" While recent years have witnessed a dramatic upsurge of exploiting deep\nneural networks toward solving image denoising, existing methods mostly rely on\nsimple noise assumptions, such as additive white Gaussian noise (AWGN), JPEG\ncompression noise and camera sensor noise, and a general-purpose blind\ndenoising method for real images remains unsolved. In this paper, we attempt to\nsolve this problem from the perspective of network architecture design and\ntraining data synthesis. Specifically, for the network architecture design, we\npropose a swin-conv block to incorporate the local modeling ability of residual\nconvolutional layer and non-local modeling ability of swin transformer block,\nand then plug it as the main building block into the widely-used image-to-image\ntranslation UNet architecture. For the training data synthesis, we design a\npractical noise degradation model which takes into consideration different\nkinds of noise (including Gaussian, Poisson, speckle, JPEG compression, and\nprocessed camera sensor noises) and resizing, and also involves a random\nshuffle strategy and a double degradation strategy. Extensive experiments on\nAGWN removal and real image denoising demonstrate that the new network\narchitecture design achieves state-of-the-art performance and the new\ndegradation model can help to significantly improve the practicability. 
We\nbelieve our work can provide useful insights into current denoising research.\n","authors":["Kai Zhang","Yawei Li","Jingyun Liang","Jiezhang Cao","Yulun Zhang","Hao Tang","Deng-Ping Fan","Radu Timofte","Luc Van Gool"],"pdf_url":"https://arxiv.org/pdf/2203.13278v3.pdf","comment":"Codes: https://github.com/cszn/SCUNet"},{"id":"http://arxiv.org/abs/2309.05162v1","updated":"2023-09-10T23:00:35Z","published":"2023-09-10T23:00:35Z","title":"Collecting Visually-Grounded Dialogue with A Game Of Sorts","summary":" An idealized, though simplistic, view of the referring expression production\nand grounding process in (situated) dialogue assumes that a speaker must merely\nappropriately specify their expression so that the target referent may be\nsuccessfully identified by the addressee. However, referring in conversation is\na collaborative process that cannot be aptly characterized as an exchange of\nminimally-specified referring expressions. Concerns have been raised regarding\nassumptions made by prior work on visually-grounded dialogue that reveal an\noversimplified view of conversation and the referential process. We address\nthese concerns by introducing a collaborative image ranking task, a grounded\nagreement game we call \"A Game Of Sorts\". In our game, players are tasked with\nreaching agreement on how to rank a set of images given some sorting criterion\nthrough a largely unrestricted, role-symmetric dialogue. By putting emphasis on\nthe argumentation in this mixed-initiative interaction, we collect discussions\nthat involve the collaborative referential process. We describe results of a\nsmall-scale data collection experiment with the proposed task. All discussed\nmaterials, which includes the collected data, the codebase, and a containerized\nversion of the application, are publicly available.\n","authors":["Bram Willemsen","Dmytro Kalpakchi","Gabriel Skantze"],"pdf_url":"https://arxiv.org/pdf/2309.05162v1.pdf","comment":"Published at LREC 2022"},{"id":"http://arxiv.org/abs/2207.11581v2","updated":"2023-09-10T22:45:59Z","published":"2022-07-23T19:17:26Z","title":"Self-supervised contrastive learning of echocardiogram videos enables\n label-efficient cardiac disease diagnosis","summary":" Advances in self-supervised learning (SSL) have shown that self-supervised\npretraining on medical imaging data can provide a strong initialization for\ndownstream supervised classification and segmentation. Given the difficulty of\nobtaining expert labels for medical image recognition tasks, such an\n\"in-domain\" SSL initialization is often desirable due to its improved label\nefficiency over standard transfer learning. However, most efforts toward SSL of\nmedical imaging data are not adapted to video-based medical imaging modalities.\nWith this progress in mind, we developed a self-supervised contrastive learning\napproach, EchoCLR, catered to echocardiogram videos with the goal of learning\nstrong representations for efficient fine-tuning on downstream cardiac disease\ndiagnosis. EchoCLR leverages (i) distinct videos of the same patient as\npositive pairs for contrastive learning and (ii) a frame re-ordering pretext\ntask to enforce temporal coherence. When fine-tuned on small portions of\nlabeled data (as few as 51 exams), EchoCLR pretraining significantly improved\nclassification performance for left ventricular hypertrophy (LVH) and aortic\nstenosis (AS) over other transfer learning and SSL approaches across internal\nand external test sets. 
For example, when fine-tuning on 10% of available\ntraining data (519 studies), an EchoCLR-pretrained model achieved 0.72 AUROC\n(95% CI: [0.69, 0.75]) on LVH classification, compared to 0.61 AUROC (95% CI:\n[0.57, 0.64]) with a standard transfer learning approach. Similarly, using 1%\nof available training data (53 studies), EchoCLR pretraining achieved 0.82\nAUROC (95% CI: [0.79, 0.84]) on severe AS classification, compared to 0.61\nAUROC (95% CI: [0.58, 0.65]) with transfer learning. EchoCLR is unique in its\nability to learn representations of medical videos and demonstrates that SSL\ncan enable label-efficient disease classification from small, labeled datasets.\n","authors":["Gregory Holste","Evangelos K. Oikonomou","Bobak J. Mortazavi","Zhangyang Wang","Rohan Khera"],"pdf_url":"https://arxiv.org/pdf/2207.11581v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.05150v1","updated":"2023-09-10T21:54:03Z","published":"2023-09-10T21:54:03Z","title":"Faster, Lighter, More Accurate: A Deep Learning Ensemble for Content\n Moderation","summary":" To address the increasing need for efficient and accurate content moderation,\nwe propose an efficient and lightweight deep classification ensemble structure.\nOur approach is based on a combination of simple visual features, designed for\nhigh-accuracy classification of violent content with low false positives. Our\nensemble architecture utilizes a set of lightweight models with narrowed-down\ncolor features, and we apply it to both images and videos.\n We evaluated our approach using a large dataset of explosion and blast\ncontents and compared its performance to popular deep learning models such as\nResNet-50. Our evaluation results demonstrate significant improvements in\nprediction accuracy, while benefiting from 7.64x faster inference and lower\ncomputation cost.\n While our approach is tailored to explosion detection, it can be applied to\nother similar content moderation and violence detection use cases as well.\nBased on our experiments, we propose a \"think small, think many\" philosophy in\nclassification scenarios. We argue that transforming a single, large,\nmonolithic deep model into a verification-based step model ensemble of multiple\nsmall, simple, and lightweight models with narrowed-down visual features can\npossibly lead to predictions with higher accuracy.\n","authors":["Mohammad Hosseini","Mahmudul Hasan"],"pdf_url":"https://arxiv.org/pdf/2309.05150v1.pdf","comment":"6 pages, 22nd IEEE International Conference on Machine Learning and\n Applications (IEEE ICMLA'23), December 15-17, 2023, Jacksonville Riverfront,\n Florida, USA. arXiv admin note: substantial text overlap with\n arXiv:2103.10350"},{"id":"http://arxiv.org/abs/2309.05148v1","updated":"2023-09-10T21:52:47Z","published":"2023-09-10T21:52:47Z","title":"Beyond Skin Tone: A Multidimensional Measure of Apparent Skin Color","summary":" This paper strives to measure apparent skin color in computer vision, beyond\na unidimensional scale on skin tone. In their seminal paper Gender Shades,\nBuolamwini and Gebru have shown how gender classification systems can be biased\nagainst women with darker skin tones. Subsequently, fairness researchers and\npractitioners have adopted the Fitzpatrick skin type classification as a common\nmeasure to assess skin color bias in computer vision systems. 
While effective,\nthe Fitzpatrick scale only focuses on the skin tone ranging from light to dark.\nTowards a more comprehensive measure of skin color, we introduce the hue angle\nranging from red to yellow. When applied to images, the hue dimension reveals\nadditional biases related to skin color in both computer vision datasets and\nmodels. We then recommend multidimensional skin color scales, relying on both\nskin tone and hue, for fairness assessments.\n","authors":["William Thong","Przemyslaw Joniak","Alice Xiang"],"pdf_url":"https://arxiv.org/pdf/2309.05148v1.pdf","comment":"Accepted at the International Conference on Computer Vision (ICCV)\n 2023"},{"id":"http://arxiv.org/abs/2309.05139v1","updated":"2023-09-10T21:16:56Z","published":"2023-09-10T21:16:56Z","title":"A Skeleton-based Approach For Rock Crack Detection Towards A Climbing\n Robot Application","summary":" Conventional wheeled robots are unable to traverse scientifically\ninteresting, but dangerous, cave environments. Multi-limbed climbing robot\ndesigns, such as ReachBot, are able to grasp irregular surface features and\nexecute climbing motions to overcome obstacles, given suitable grasp locations.\nTo support grasp site identification, we present a method for detecting rock\ncracks and edges, the SKeleton Intersection Loss (SKIL). SKIL is a loss\ndesigned for thin object segmentation that leverages the skeleton of the label.\nA dataset of rock face images was collected, manually annotated, and augmented\nwith generated data. A new group of metrics, LineAcc, has been proposed for\nthin object segmentation such that the impact of the object width on the score\nis minimized. In addition, the metric is less sensitive to translation which\ncan often lead to a score of zero when computing classical metrics such as Dice\non thin objects. Our fine-tuned models outperform previous methods on similar\nthin object segmentation tasks such as blood vessel segmentation and show\npromise for integration onto a robotic system.\n","authors":["Josselin Somerville Roberts","Paul-Emile Giacomelli","Yoni Gozlan","Julia Di"],"pdf_url":"https://arxiv.org/pdf/2309.05139v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.05132v1","updated":"2023-09-10T20:39:53Z","published":"2023-09-10T20:39:53Z","title":"DAD++: Improved Data-free Test Time Adversarial Defense","summary":" With the increasing deployment of deep neural networks in safety-critical\napplications such as self-driving cars, medical imaging, anomaly detection,\netc., adversarial robustness has become a crucial concern in the reliability of\nthese networks in real-world scenarios. A plethora of works based on\nadversarial training and regularization-based techniques have been proposed to\nmake these deep networks robust against adversarial attacks. However, these\nmethods require either retraining models or training them from scratch, making\nthem infeasible to defend pre-trained models when access to training data is\nrestricted. To address this problem, we propose a test time Data-free\nAdversarial Defense (DAD) containing detection and correction frameworks.\nMoreover, to further improve the efficacy of the correction framework in cases\nwhen the detector is under-confident, we propose a soft-detection scheme\n(dubbed as \"DAD++\"). We conduct a wide range of experiments and ablations on\nseveral datasets and network architectures to show the efficacy of our proposed\napproach. 
Furthermore, we demonstrate the applicability of our approach in\nimparting adversarial defense at test time under data-free (or data-efficient)\napplications/setups, such as Data-free Knowledge Distillation and Source-free\nUnsupervised Domain Adaptation, as well as Semi-supervised classification\nframeworks. We observe that in all the experiments and applications, our DAD++\ngives an impressive performance against various adversarial attacks with a\nminimal drop in clean accuracy. The source code is available at:\nhttps://github.com/vcl-iisc/Improved-Data-free-Test-Time-Adversarial-Defense\n","authors":["Gaurav Kumar Nayak","Inder Khatri","Shubham Randive","Ruchit Rawal","Anirban Chakraborty"],"pdf_url":"https://arxiv.org/pdf/2309.05132v1.pdf","comment":"IJCV Journal (Under Review)"},{"id":"http://arxiv.org/abs/2304.13000v3","updated":"2023-09-10T19:42:02Z","published":"2023-04-25T17:14:36Z","title":"Segment anything, from space?","summary":" Recently, the first foundation model developed specifically for image\nsegmentation tasks was developed, termed the \"Segment Anything Model\" (SAM).\nSAM can segment objects in input imagery based on cheap input prompts, such as\none (or more) points, a bounding box, or a mask. The authors examined the\n\\textit{zero-shot} image segmentation accuracy of SAM on a large number of\nvision benchmark tasks and found that SAM usually achieved recognition accuracy\nsimilar to, or sometimes exceeding, vision models that had been trained on the\ntarget tasks. The impressive generalization of SAM for segmentation has major\nimplications for vision researchers working on natural imagery. In this work,\nwe examine whether SAM's performance extends to overhead imagery problems and\nhelp guide the community's response to its development. We examine SAM's\nperformance on a set of diverse and widely studied benchmark tasks. We find\nthat SAM does often generalize well to overhead imagery, although it fails in\nsome cases due to the unique characteristics of overhead imagery and its common\ntarget objects. We report on these unique systematic failure cases for remote\nsensing imagery that may comprise useful future research for the community.\n","authors":["Simiao Ren","Francesco Luzi","Saad Lahrichi","Kaleb Kassaw","Leslie M. Collins","Kyle Bradbury","Jordan M. Malof"],"pdf_url":"https://arxiv.org/pdf/2304.13000v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.05098v1","updated":"2023-09-10T17:59:48Z","published":"2023-09-10T17:59:48Z","title":"3D Implicit Transporter for Temporally Consistent Keypoint Discovery","summary":" Keypoint-based representation has proven advantageous in various visual and\nrobotic tasks. However, the existing 2D and 3D methods for detecting keypoints\nmainly rely on geometric consistency to achieve spatial alignment, neglecting\ntemporal consistency. To address this issue, the Transporter method was\nintroduced for 2D data, which reconstructs the target frame from the source\nframe to incorporate both spatial and temporal information. However, the direct\napplication of the Transporter to 3D point clouds is infeasible due to their\nstructural differences from 2D images. Thus, we propose the first 3D version of\nthe Transporter, which leverages hybrid 3D representation, cross attention, and\nimplicit reconstruction. We apply this new learning system on 3D articulated\nobjects and nonrigid animals (humans and rodents) and show that learned\nkeypoints are spatio-temporally consistent. 
Additionally, we propose a\nclosed-loop control strategy that utilizes the learned keypoints for 3D object\nmanipulation and demonstrate its superior performance. Codes are available at\nhttps://github.com/zhongcl-thu/3D-Implicit-Transporter.\n","authors":["Chengliang Zhong","Yuhang Zheng","Yupeng Zheng","Hao Zhao","Li Yi","Xiaodong Mu","Ling Wang","Pengfei Li","Guyue Zhou","Chao Yang","Xinliang Zhang","Jian Zhao"],"pdf_url":"https://arxiv.org/pdf/2309.05098v1.pdf","comment":"ICCV2023 oral paper"},{"id":"http://arxiv.org/abs/2309.05095v1","updated":"2023-09-10T17:41:46Z","published":"2023-09-10T17:41:46Z","title":"MaskRenderer: 3D-Infused Multi-Mask Realistic Face Reenactment","summary":" We present a novel end-to-end identity-agnostic face reenactment system,\nMaskRenderer, that can generate realistic, high fidelity frames in real-time.\nAlthough recent face reenactment works have shown promising results, there are\nstill significant challenges such as identity leakage and imitating mouth\nmovements, especially for large pose changes and occluded faces. MaskRenderer\ntackles these problems by using (i) a 3DMM to model 3D face structure to better\nhandle pose changes, occlusion, and mouth movements compared to 2D\nrepresentations; (ii) a triplet loss function to embed the cross-reenactment\nduring training for better identity preservation; and (iii) multi-scale\nocclusion, improving inpainting and restoring missing areas. Comprehensive\nquantitative and qualitative experiments conducted on the VoxCeleb1 test set,\ndemonstrate that MaskRenderer outperforms state-of-the-art models on unseen\nfaces, especially when the Source and Driving identities are very different.\n","authors":["Tina Behrouzi","Atefeh Shahroudnejad","Payam Mousavi"],"pdf_url":"https://arxiv.org/pdf/2309.05095v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.05090v1","updated":"2023-09-10T17:34:14Z","published":"2023-09-10T17:34:14Z","title":"Sculpting Efficiency: Pruning Medical Imaging Models for On-Device\n Inference","summary":" Applying ML advancements to healthcare can improve patient outcomes. However,\nthe sheer operational complexity of ML models, combined with legacy hardware\nand multi-modal gigapixel images, poses a severe deployment limitation for\nreal-time, on-device inference. We consider filter pruning as a solution,\nexploring segmentation models in cardiology and ophthalmology. Our preliminary\nresults show a compression rate of up to 1148x with minimal loss in quality,\nstressing the need to consider task complexity and architectural details when\nusing off-the-shelf models. At high compression rates, filter-pruned models\nexhibit faster inference on a CPU than the GPU baseline. We also demonstrate\nthat such models' robustness and generalisability characteristics exceed that\nof the baseline and weight-pruned counterparts. We uncover intriguing questions\nand take a step towards realising cost-effective disease diagnosis, monitoring,\nand preventive solutions.\n","authors":["Sudarshan Sreeram","Bernhard Kainz"],"pdf_url":"https://arxiv.org/pdf/2309.05090v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2303.17155v3","updated":"2023-09-10T17:33:30Z","published":"2023-03-30T05:25:20Z","title":"Discriminative Class Tokens for Text-to-Image Diffusion Models","summary":" Recent advances in text-to-image diffusion models have enabled the generation\nof diverse and high-quality images. 
While impressive, the images often fall\nshort of depicting subtle details and are susceptible to errors due to\nambiguity in the input text. One way of alleviating these issues is to train\ndiffusion models on class-labeled datasets. This approach has two\ndisadvantages: (i) supervised datasets are generally small compared to\nlarge-scale scraped text-image datasets on which text-to-image models are\ntrained, affecting the quality and diversity of the generated images, or (ii)\nthe input is a hard-coded label, as opposed to free-form text, limiting the\ncontrol over the generated images.\n In this work, we propose a non-invasive fine-tuning technique that\ncapitalizes on the expressive potential of free-form text while achieving high\naccuracy through discriminative signals from a pretrained classifier. This is\ndone by iteratively modifying the embedding of an added input token of a\ntext-to-image diffusion model, by steering generated images toward a given\ntarget class according to a classifier. Our method is fast compared to prior\nfine-tuning methods and does not require a collection of in-class images or\nretraining of a noise-tolerant classifier. We evaluate our method extensively,\nshowing that the generated images are: (i) more accurate and of higher quality\nthan standard diffusion models, (ii) can be used to augment training data in a\nlow-resource setting, and (iii) reveal information about the data used to train\nthe guiding classifier. The code is available at\n\\url{https://github.com/idansc/discriminative_class_tokens}.\n","authors":["Idan Schwartz","Vésteinn Snæbjarnarson","Hila Chefer","Ryan Cotterell","Serge Belongie","Lior Wolf","Sagie Benaim"],"pdf_url":"https://arxiv.org/pdf/2303.17155v3.pdf","comment":"ICCV 2023"},{"id":"http://arxiv.org/abs/2309.01007v2","updated":"2023-09-10T17:10:05Z","published":"2023-09-02T19:02:50Z","title":"Comparative Analysis of Deep Learning Architectures for Breast Cancer\n Diagnosis Using the BreaKHis Dataset","summary":" Cancer is an extremely difficult and dangerous health problem because it\nmanifests in so many different ways and affects so many different organs and\ntissues. The primary goal of this research was to evaluate deep learning\nmodels' ability to correctly identify breast cancer cases using the BreakHis\ndataset. The BreakHis dataset covers a wide range of breast cancer subtypes\nthrough its huge collection of histopathological pictures. In this study, we\nuse and compare the performance of five well-known deep learning models for\ncancer classification: VGG, ResNet, Xception, Inception, and InceptionResNet.\nThe results placed the Xception model at the top, with an F1 score of 0.9 and\nan accuracy of 89%. At the same time, the Inception and InceptionResNet models\nboth hit accuracy of 87% . However, the F1 score for the Inception model was\n87, while that for the InceptionResNet model was 86. These results demonstrate\nthe importance of deep learning methods in making correct breast cancer\ndiagnoses. This highlights the potential to provide improved diagnostic\nservices to patients. The findings of this study not only improve current\nmethods of cancer diagnosis, but also make significant contributions to the\ncreation of new and improved cancer treatment strategies. 
In a nutshell, the\nresults of this study represent a major advancement in the direction of\nachieving these vital healthcare goals.\n","authors":["İrem Sayın","Muhammed Ali Soydaş","Yunus Emre Mert","Arda Yarkataş","Berk Ergun","Selma Sözen Yeh","Hüseyin Üvet"],"pdf_url":"https://arxiv.org/pdf/2309.01007v2.pdf","comment":"7 pages, 1 figure, 2 tables"},{"id":"http://arxiv.org/abs/2307.01666v2","updated":"2023-09-10T17:09:47Z","published":"2023-07-04T11:57:23Z","title":"Sensors and Systems for Monitoring Mental Fatigue: A systematic review","summary":" Mental fatigue is a leading cause of motor vehicle accidents, medical errors,\nloss of workplace productivity, and student disengagements in e-learning\nenvironment. Development of sensors and systems that can reliably track mental\nfatigue can prevent accidents, reduce errors, and help increase workplace\nproductivity. This review provides a critical summary of theoretical models of\nmental fatigue, a description of key enabling sensor technologies, and a\nsystematic review of recent studies using biosensor-based systems for tracking\nmental fatigue in humans. We conducted a systematic search and review of recent\nliterature which focused on detection and tracking of mental fatigue in humans.\nThe search yielded 57 studies (N=1082), majority of which used\nelectroencephalography (EEG) based sensors for tracking mental fatigue. We\nfound that EEG-based sensors can provide a moderate to good sensitivity for\nfatigue detection. Notably, we found no incremental benefit of using\nhigh-density EEG sensors for application in mental fatigue detection. Given the\nfindings, we provide a critical discussion on the integration of wearable EEG\nand ambient sensors in the context of achieving real-world monitoring. Future\nwork required to advance and adapt the technologies toward widespread\ndeployment of wearable sensors and systems for fatigue monitoring in\nsemi-autonomous and autonomous industries is examined.\n","authors":["Prabin Sharma","Joanna C. Justus","Megha Thapa","Govinda R. Poudel"],"pdf_url":"https://arxiv.org/pdf/2307.01666v2.pdf","comment":"19 Pages, 3 Figures"},{"id":"http://arxiv.org/abs/2309.05073v1","updated":"2023-09-10T16:42:11Z","published":"2023-09-10T16:42:11Z","title":"FreeMan: Towards Benchmarking 3D Human Pose Estimation in the Wild","summary":" Estimating the 3D structure of the human body from natural scenes is a\nfundamental aspect of visual perception. This task carries great importance for\nfields like AIGC and human-robot interaction. In practice, 3D human pose\nestimation in real-world settings is a critical initial step in solving this\nproblem. However, the current datasets, often collected under controlled\nlaboratory conditions using complex motion capture equipment and unvarying\nbackgrounds, are insufficient. The absence of real-world datasets is stalling\nthe progress of this crucial task. To facilitate the development of 3D pose\nestimation, we present FreeMan, the first large-scale, real-world multi-view\ndataset. FreeMan was captured by synchronizing 8 smartphones across diverse\nscenarios. It comprises 11M frames from 8000 sequences, viewed from different\nperspectives. These sequences cover 40 subjects across 10 different scenarios,\neach with varying lighting conditions. 
We have also established an automated,\nprecise labeling pipeline that allows for large-scale processing efficiently.\nWe provide comprehensive evaluation baselines for a range of tasks, underlining\nthe significant challenges posed by FreeMan. Further evaluations of standard\nindoor/outdoor human sensing datasets reveal that FreeMan offers robust\nrepresentation transferability in real and complex scenes. FreeMan is now\npublicly available at https://wangjiongw.github.io/freeman.\n","authors":["Jiong Wang","Fengyu Yang","Wenbo Gou","Bingliang Li","Danqi Yan","Ailing Zeng","Yijun Gao","Junle Wang","Ruimao Zhang"],"pdf_url":"https://arxiv.org/pdf/2309.05073v1.pdf","comment":"18 pages, 9 figures. Project page:\n https://wangjiongw.github.io/freeman/;\n https://github.com/wangjiongw/FreeMan_API"},{"id":"http://arxiv.org/abs/2309.05071v1","updated":"2023-09-10T16:32:02Z","published":"2023-09-10T16:32:02Z","title":"Super-Resolution Surface Reconstruction from Few Low-Resolution Slices","summary":" In many imaging applications where segmented features (e.g. blood vessels)\nare further used for other numerical simulations (e.g. finite element\nanalysis), the obtained surfaces do not have fine resolutions suitable for the\ntask. Increasing the resolution of such surfaces becomes crucial. This paper\nproposes a new variational model for solving this problem, based on an\nEuler-Elastica-based regulariser. Further, we propose and implement two\nnumerical algorithms for solving the model, a projected gradient descent method\nand the alternating direction method of multipliers. Numerical experiments\nusing real-life examples (including two from outputs of another variational\nmodel) have been illustrated for effectiveness. The advantages of the new model\nare shown through quantitative comparisons by the standard deviation of\nGaussian curvatures and mean curvatures from the viewpoint of discrete\ngeometry.\n","authors":["Yiyao Zhang","Ke Chen","Shang-Hua Yang"],"pdf_url":"https://arxiv.org/pdf/2309.05071v1.pdf","comment":"33 pages, 25 figures"},{"id":"http://arxiv.org/abs/2309.05069v1","updated":"2023-09-10T16:27:54Z","published":"2023-09-10T16:27:54Z","title":"Exploiting CLIP for Zero-shot HOI Detection Requires Knowledge\n Distillation at Multiple Levels","summary":" In this paper, we investigate the task of zero-shot human-object interaction\n(HOI) detection, a novel paradigm for identifying HOIs without the need for\ntask-specific annotations. To address this challenging task, we employ CLIP, a\nlarge-scale pre-trained vision-language model (VLM), for knowledge distillation\non multiple levels. Specifically, we design a multi-branch neural network that\nleverages CLIP for learning HOI representations at various levels, including\nglobal images, local union regions encompassing human-object pairs, and\nindividual instances of humans or objects. To train our model, CLIP is utilized\nto generate HOI scores for both global images and local union regions that\nserve as supervision signals. 
The extensive experiments demonstrate the\neffectiveness of our novel multi-level CLIP knowledge integration strategy.\nNotably, the model achieves strong performance, which is even comparable with\nsome fully-supervised and weakly-supervised methods on the public HICO-DET\nbenchmark.\n","authors":["Bo Wan","Tinne Tuytelaars"],"pdf_url":"https://arxiv.org/pdf/2309.05069v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2303.11989v2","updated":"2023-09-10T15:18:03Z","published":"2023-03-21T16:21:02Z","title":"Text2Room: Extracting Textured 3D Meshes from 2D Text-to-Image Models","summary":" We present Text2Room, a method for generating room-scale textured 3D meshes\nfrom a given text prompt as input. To this end, we leverage pre-trained 2D\ntext-to-image models to synthesize a sequence of images from different poses.\nIn order to lift these outputs into a consistent 3D scene representation, we\ncombine monocular depth estimation with a text-conditioned inpainting model.\nThe core idea of our approach is a tailored viewpoint selection such that the\ncontent of each image can be fused into a seamless, textured 3D mesh. More\nspecifically, we propose a continuous alignment strategy that iteratively fuses\nscene frames with the existing geometry to create a seamless mesh. Unlike\nexisting works that focus on generating single objects or zoom-out trajectories\nfrom text, our method generates complete 3D scenes with multiple objects and\nexplicit 3D geometry. We evaluate our approach using qualitative and\nquantitative metrics, demonstrating it as the first method to generate\nroom-scale 3D geometry with compelling textures from only text as input.\n","authors":["Lukas Höllein","Ang Cao","Andrew Owens","Justin Johnson","Matthias Nießner"],"pdf_url":"https://arxiv.org/pdf/2303.11989v2.pdf","comment":"Accepted to ICCV 2023 (Oral) video: https://youtu.be/fjRnFL91EZc\n project page: https://lukashoel.github.io/text-to-room/ code:\n https://github.com/lukasHoel/text2room"},{"id":"http://arxiv.org/abs/2309.05049v1","updated":"2023-09-10T14:54:44Z","published":"2023-09-10T14:54:44Z","title":"Multi-view Self-supervised Disentanglement for General Image Denoising","summary":" With its significant performance improvements, the deep learning paradigm has\nbecome a standard tool for modern image denoisers. While promising performance\nhas been shown on seen noise distributions, existing approaches often suffer\nfrom generalisation to unseen noise types or general and real noise. It is\nunderstandable as the model is designed to learn paired mapping (e.g. from a\nnoisy image to its clean version). In this paper, we instead propose to learn\nto disentangle the noisy image, under the intuitive assumption that different\ncorrupted versions of the same clean image share a common latent space. A\nself-supervised learning framework is proposed to achieve the goal, without\nlooking at the latent clean image. By taking two different corrupted versions\nof the same image as input, the proposed Multi-view Self-supervised\nDisentanglement (MeD) approach learns to disentangle the latent clean features\nfrom the corruptions and recover the clean image consequently. Extensive\nexperimental analysis on both synthetic and real noise shows the superiority of\nthe proposed method over prior self-supervised approaches, especially on unseen\nnovel noise types. 
On real noise, the proposed method even outperforms its\nsupervised counterparts by over 3 dB.\n","authors":["Hao Chen","Chenyuan Qu","Yu Zhang","Chen Chen","Jianbo Jiao"],"pdf_url":"https://arxiv.org/pdf/2309.05049v1.pdf","comment":"International Conference on Computer Vision 2023 (ICCV 2023)"},{"id":"http://arxiv.org/abs/2308.03135v2","updated":"2023-09-10T14:19:49Z","published":"2023-08-06T15:05:42Z","title":"E-CLIP: Towards Label-efficient Event-based Open-world Understanding by\n CLIP","summary":" Contrastive Language-Image Pre-training (CLIP) has recently shown promising\nopen-world and few-shot performance on 2D image-based recognition tasks.\nHowever, the transferred capability of CLIP to the novel event camera data\nstill remains under-explored. In particular, due to the modality gap with the\nimage-text data and the lack of large-scale datasets, achieving this goal is\nnon-trivial and thus requires significant research innovation. In this paper,\nwe propose E-CLIP, a novel and effective framework that unleashes the potential\nof CLIP for event-based recognition to compensate for the lack of large-scale\nevent-based datasets. Our work addresses two crucial challenges: 1) how to\ngeneralize CLIP's visual encoder to event data while fully leveraging events'\nunique properties, e.g., sparsity and high temporal resolution; 2) how to\neffectively align the multi-modal embeddings, i.e., image, text, and events. To\nthis end, we first introduce a novel event encoder that subtly models the\ntemporal information from events and meanwhile generates event prompts to\npromote the modality bridging. We then design a text encoder that generates\ncontent prompts and utilizes hybrid text prompts to enhance the E-CLIP's\ngeneralization ability across diverse datasets. With the proposed event\nencoder, text encoder, and original image encoder, a novel Hierarchical Triple\nContrastive Alignment (HTCA) module is introduced to jointly optimize the\ncorrelation and enable efficient knowledge transfer among the three modalities.\nWe conduct extensive experiments on two recognition benchmarks, and the results\ndemonstrate that our E-CLIP outperforms existing methods by a large margin of\n+3.94% and +4.62% on the N-Caltech dataset, respectively, in both fine-tuning\nand few-shot settings. Moreover, our E-CLIP can be flexibly extended to the\nevent retrieval task using either text or image queries, showing plausible\nperformance.\n","authors":["Jiazhou Zhou","Xu Zheng","Yuanhuiyi Lyu","Lin Wang"],"pdf_url":"https://arxiv.org/pdf/2308.03135v2.pdf","comment":"Journal version with supplementary material"},{"id":"http://arxiv.org/abs/2309.05036v1","updated":"2023-09-10T14:15:01Z","published":"2023-09-10T14:15:01Z","title":"What Is Near?: Room Locality Learning for Enhanced Robot\n Vision-Language-Navigation in Indoor Living Environments","summary":" Humans use their knowledge of common house layouts obtained from previous\nexperiences to predict nearby rooms while navigating in new environments. This\ngreatly helps them navigate previously unseen environments and locate their\ntarget room. To provide layout prior knowledge to navigational agents based on\ncommon human living spaces, we propose WIN (\\textit{W}hat \\textit{I}s\n\\textit{N}ear), a commonsense learning model for Vision Language Navigation\n(VLN) tasks. VLN requires an agent to traverse indoor environments based on\ndescriptive navigational instructions. 
Unlike existing layout learning works,\nWIN predicts the local neighborhood map based on prior knowledge of living\nspaces and current observation, operating on an imagined global map of the\nentire environment. The model infers neighborhood regions based on visual cues\nof current observations, navigational history, and layout common sense. We show\nthat local-global planning based on locality knowledge and predicting the\nindoor layout allows the agent to efficiently select the appropriate action.\nSpecifically, we devised a cross-modal transformer that utilizes this locality\nprior for decision-making in addition to visual inputs and instructions.\nExperimental results show that locality learning using WIN provides better\ngeneralizability compared to classical VLN agents in unseen environments. Our\nmodel performs favorably on standard VLN metrics, with Success Rate 68\\% and\nSuccess weighted by Path Length 63\\% in unseen environments.\n","authors":["Muraleekrishna Gopinathan","Jumana Abu-Khalaf","David Suter","Sidike Paheding","Nathir A. Rawashdeh"],"pdf_url":"https://arxiv.org/pdf/2309.05036v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.05032v1","updated":"2023-09-10T14:10:56Z","published":"2023-09-10T14:10:56Z","title":"Unified Contrastive Fusion Transformer for Multimodal Human Action\n Recognition","summary":" Various types of sensors have been considered to develop human action\nrecognition (HAR) models. Robust HAR performance can be achieved by fusing\nmultimodal data acquired by different sensors. In this paper, we introduce a\nnew multimodal fusion architecture, referred to as Unified Contrastive Fusion\nTransformer (UCFFormer) designed to integrate data with diverse distributions\nto enhance HAR performance. Based on the embedding features extracted from each\nmodality, UCFFormer employs the Unified Transformer to capture the\ninter-dependency among embeddings in both time and modality domains. We present\nthe Factorized Time-Modality Attention to perform self-attention efficiently\nfor the Unified Transformer. UCFFormer also incorporates contrastive learning\nto reduce the discrepancy in feature distributions across various modalities,\nthus generating semantically aligned features for information fusion.\nPerformance evaluation conducted on two popular datasets, UTD-MHAD and NTU\nRGB+D, demonstrates that UCFFormer achieves state-of-the-art performance,\noutperforming competing methods by considerable margins.\n","authors":["Kyoung Ok Yang","Junho Koh","Jun Won Choi"],"pdf_url":"https://arxiv.org/pdf/2309.05032v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2303.09248v2","updated":"2023-09-10T13:57:49Z","published":"2023-03-16T11:53:29Z","title":"Cross-Dimensional Refined Learning for Real-Time 3D Visual Perception\n from Monocular Video","summary":" We present a novel real-time capable learning method that jointly perceives a\n3D scene's geometry structure and semantic labels. Recent approaches to\nreal-time 3D scene reconstruction mostly adopt a volumetric scheme, where a\nTruncated Signed Distance Function (TSDF) is directly regressed. However, these\nvolumetric approaches tend to focus on the global coherence of their\nreconstructions, which leads to a lack of local geometric detail. To overcome\nthis issue, we propose to leverage the latent geometric prior knowledge in 2D\nimage features by explicit depth prediction and anchored feature generation, to\nrefine the occupancy learning in TSDF volume. 
Besides, we find that this\ncross-dimensional feature refinement methodology can also be adopted for the\nsemantic segmentation task by utilizing semantic priors. Hence, we proposed an\nend-to-end cross-dimensional refinement neural network (CDRNet) to extract both\n3D mesh and 3D semantic labeling in real time. The experiment results show that\nthis method achieves a state-of-the-art 3D perception efficiency on multiple\ndatasets, which indicates the great potential of our method for industrial\napplications.\n","authors":["Ziyang Hong","C. Patrick Yue"],"pdf_url":"https://arxiv.org/pdf/2303.09248v2.pdf","comment":"Accpeted to ICCV 2023 Workshops. Project page:\n https://hafred.github.io/cdrnet/"},{"id":"http://arxiv.org/abs/2309.05028v1","updated":"2023-09-10T13:55:41Z","published":"2023-09-10T13:55:41Z","title":"SC-NeRF: Self-Correcting Neural Radiance Field with Sparse Views","summary":" In recent studies, the generalization of neural radiance fields for novel\nview synthesis task has been widely explored. However, existing methods are\nlimited to objects and indoor scenes. In this work, we extend the\ngeneralization task to outdoor scenes, trained only on object-level datasets.\nThis approach presents two challenges. Firstly, the significant distributional\nshift between training and testing scenes leads to black artifacts in rendering\nresults. Secondly, viewpoint changes in outdoor scenes cause ghosting or\nmissing regions in rendered images. To address these challenges, we propose a\ngeometric correction module and an appearance correction module based on\nmulti-head attention mechanisms. We normalize rendered depth and combine it\nwith light direction as query in the attention mechanism. Our network\neffectively corrects varying scene structures and geometric features in outdoor\nscenes, generalizing well from object-level to unseen outdoor scenes.\nAdditionally, we use appearance correction module to correct appearance\nfeatures, preventing rendering artifacts like blank borders and ghosting due to\nviewpoint changes. By combining these modules, our approach successfully\ntackles the challenges of outdoor scene generalization, producing high-quality\nrendering results. When evaluated on four datasets (Blender, DTU, LLFF,\nSpaces), our network outperforms previous methods. Notably, compared to\nMVSNeRF, our network improves average PSNR from 19.369 to 25.989, SSIM from\n0.838 to 0.889, and reduces LPIPS from 0.265 to 0.224 on Spaces outdoor scenes.\n","authors":["Liang Song","Guangming Wang","Jiuming Liu","Zhenyang Fu","Yanzi Miao"," Hesheng"],"pdf_url":"https://arxiv.org/pdf/2309.05028v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.05015v1","updated":"2023-09-10T12:26:17Z","published":"2023-09-10T12:26:17Z","title":"DeViT: Decomposing Vision Transformers for Collaborative Inference in\n Edge Devices","summary":" Recent years have witnessed the great success of vision transformer (ViT),\nwhich has achieved state-of-the-art performance on multiple computer vision\nbenchmarks. However, ViT models suffer from vast amounts of parameters and high\ncomputation cost, leading to difficult deployment on resource-constrained edge\ndevices. Existing solutions mostly compress ViT models to a compact model but\nstill cannot achieve real-time inference. To tackle this issue, we propose to\nexplore the divisibility of transformer structure, and decompose the large ViT\ninto multiple small models for collaborative inference at edge devices. 
Our\nobjective is to achieve fast and energy-efficient collaborative inference while\nmaintaining comparable accuracy compared with large ViTs. To this end, we first\npropose a collaborative inference framework termed DeViT to facilitate edge\ndeployment by decomposing large ViTs. Subsequently, we design a\ndecomposition-and-ensemble algorithm based on knowledge distillation, termed\nDEKD, to fuse multiple small decomposed models while dramatically reducing\ncommunication overheads, and handle heterogeneous models by developing a\nfeature matching module to promote the imitations of decomposed models from the\nlarge ViT. Extensive experiments for three representative ViT backbones on four\nwidely-used datasets demonstrate our method achieves efficient collaborative\ninference for ViTs and outperforms existing lightweight ViTs, striking a good\ntrade-off between efficiency and accuracy. For example, our DeViTs improves\nend-to-end latency by 2.89$\\times$ with only 1.65% accuracy sacrifice using\nCIFAR-100 compared to the large ViT, ViT-L/16, on the GPU server. DeDeiTs\nsurpasses the recent efficient ViT, MobileViT-S, by 3.54% in accuracy on\nImageNet-1K, while running 1.72$\\times$ faster and requiring 55.28% lower\nenergy consumption on the edge device.\n","authors":["Guanyu Xu","Zhiwei Hao","Yong Luo","Han Hu","Jianping An","Shiwen Mao"],"pdf_url":"https://arxiv.org/pdf/2309.05015v1.pdf","comment":"Accepted by IEEE Transactions on Mobile Computing"},{"id":"http://arxiv.org/abs/2309.05013v1","updated":"2023-09-10T12:21:42Z","published":"2023-09-10T12:21:42Z","title":"Geometrically Consistent Partial Shape Matching","summary":" Finding correspondences between 3D shapes is a crucial problem in computer\nvision and graphics, which is for example relevant for tasks like shape\ninterpolation, pose transfer, or texture transfer. An often neglected but\nessential property of matchings is geometric consistency, which means that\nneighboring triangles in one shape are consistently matched to neighboring\ntriangles in the other shape. Moreover, while in practice one often has only\naccess to partial observations of a 3D shape (e.g. due to occlusion, or\nscanning artifacts), there do not exist any methods that directly address\ngeometrically consistent partial shape matching. In this work we fill this gap\nby proposing to integrate state-of-the-art deep shape features into a novel\ninteger linear programming partial shape matching formulation. Our optimization\nyields a globally optimal solution on low resolution shapes, which we then\nrefine using a coarse-to-fine scheme. We show that our method can find more\nreliable results on partial shapes in comparison to existing geometrically\nconsistent algorithms (for which one first has to fill missing parts with a\ndummy geometry). Moreover, our matchings are substantially smoother than\nlearning-based state-of-the-art shape matching methods.\n","authors":["Viktoria Ehm","Paul Roetzer","Marvin Eisenberger","Maolin Gao","Florian Bernard","Daniel Cremers"],"pdf_url":"https://arxiv.org/pdf/2309.05013v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2201.13013v5","updated":"2023-09-10T11:46:01Z","published":"2022-01-31T06:11:28Z","title":"A Simple And Effective Filtering Scheme For Improving Neural Fields","summary":" Recently, neural fields, also known as coordinate-based MLPs, have achieved\nimpressive results in representing low-dimensional data. 
Unlike CNN, MLPs are\nglobally connected and lack local control; adjusting a local region leads to\nglobal changes. Therefore, improving local neural fields usually leads to a\ndilemma: filtering out local artifacts can simultaneously smooth away desired\ndetails. Our solution is a new filtering technique that consists of two\ncounteractive operators: a smoothing operator that provides global smoothing\nfor better generalization, and conversely a recovering operator that provides\nbetter controllability for local adjustments. We have found that using either\noperator alone can lead to an increase in noisy artifacts or oversmoothed\nregions. By combining the two operators, smoothing and sharpening can be\nadjusted to first smooth the entire region and then recover fine-grained\ndetails in regions overly smoothed. In this way, our filter helps neural fields\nremove much noise while enhancing details. We demonstrate the benefits of our\nfilter on various tasks and show significant improvements over state-of-the-art\nmethods. Moreover, our filter also provides better performance in terms of\nconvergence speed and network stability.\n","authors":["Yixin Zhuang"],"pdf_url":"https://arxiv.org/pdf/2201.13013v5.pdf","comment":"Accepted to Computational Visual Media"},{"id":"http://arxiv.org/abs/2308.08197v4","updated":"2023-09-10T10:53:48Z","published":"2023-08-16T07:57:35Z","title":"Self-Reference Deep Adaptive Curve Estimation for Low-Light Image\n Enhancement","summary":" In this paper, we propose a 2-stage low-light image enhancement method called\nSelf-Reference Deep Adaptive Curve Estimation (Self-DACE). In the first stage,\nwe present an intuitive, lightweight, fast, and unsupervised luminance\nenhancement algorithm. The algorithm is based on a novel low-light enhancement\ncurve that can be used to locally boost image brightness. We also propose a new\nloss function with a simplified physical model designed to preserve natural\nimages' color, structure, and fidelity. We use a vanilla CNN to map each pixel\nthrough deep Adaptive Adjustment Curves (AAC) while preserving the local image\nstructure. Secondly, we introduce the corresponding denoising scheme to remove\nthe latent noise in the darkness. We approximately model the noise in the dark\nand deploy a Denoising-Net to estimate and remove the noise after the first\nstage. Exhaustive qualitative and quantitative analysis shows that our method\noutperforms existing state-of-the-art algorithms on multiple real-world\ndatasets.\n","authors":["Jianyu Wen","Chenhao Wu","Tong Zhang","Yixuan Yu","Piotr Swierczynski"],"pdf_url":"https://arxiv.org/pdf/2308.08197v4.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.04967v1","updated":"2023-09-10T09:00:28Z","published":"2023-09-10T09:00:28Z","title":"Towards Fully Decoupled End-to-End Person Search","summary":" End-to-end person search aims to jointly detect and re-identify a target\nperson in raw scene images with a unified model. The detection task unifies all\npersons while the re-id task discriminates different identities, resulting in\nconflict optimal objectives. Existing works proposed to decouple end-to-end\nperson search to alleviate such conflict. Yet these methods are still\nsub-optimal on one or two of the sub-tasks due to their partially decoupled\nmodels, which limits the overall person search performance. In this paper, we\npropose to fully decouple person search towards optimal person search. 
A\ntask-incremental person search network is proposed to incrementally construct\nan end-to-end model for the detection and re-id sub-task, which decouples the\nmodel architecture for the two sub-tasks. The proposed task-incremental network\nallows task-incremental training for the two conflicting tasks. This enables\nindependent learning for different objectives, thus fully decoupling the model\nfor person search. Comprehensive experimental evaluations demonstrate the\neffectiveness of the proposed fully decoupled models for end-to-end person\nsearch.\n","authors":["Pengcheng Zhang","Xiao Bai","Jin Zheng","Xin Ning"],"pdf_url":"https://arxiv.org/pdf/2309.04967v1.pdf","comment":"DICTA 2023"},{"id":"http://arxiv.org/abs/2302.09187v2","updated":"2023-09-10T08:59:35Z","published":"2023-02-17T23:39:34Z","title":"Video Action Recognition Collaborative Learning with Dynamics via\n PSO-ConvNet Transformer","summary":" Recognizing human actions in video sequences, known as Human Action\nRecognition (HAR), is a challenging task in pattern recognition. While\nConvolutional Neural Networks (ConvNets) have shown remarkable success in image\nrecognition, they are not always directly applicable to HAR, as temporal\nfeatures are critical for accurate classification. In this paper, we propose a\nnovel dynamic PSO-ConvNet model for learning actions in videos, building on our\nrecent work in image recognition. Our approach leverages a framework where the\nweight vector of each neural network represents the position of a particle in\nphase space, and particles share their current weight vectors and gradient\nestimates of the Loss function. To extend our approach to video, we integrate\nConvNets with state-of-the-art temporal methods such as Transformer and\nRecurrent Neural Networks. Our experimental results on the UCF-101 dataset\ndemonstrate substantial improvements of up to 9% in accuracy, which confirms\nthe effectiveness of our proposed method. In addition, we conducted experiments\non larger and more varied datasets including Kinetics-400 and HMDB-51 and\nobtained a preference for Collaborative Learning in comparison with\nNon-Collaborative Learning (Individual Learning). Overall, our dynamic\nPSO-ConvNet model provides a promising direction for improving HAR by better\ncapturing the spatio-temporal dynamics of human actions in videos. The code is\navailable at\nhttps://github.com/leonlha/Video-Action-Recognition-Collaborative-Learning-with-Dynamics-via-PSO-ConvNet-Transformer.\n","authors":["Nguyen Huu Phong","Bernardete Ribeiro"],"pdf_url":"https://arxiv.org/pdf/2302.09187v2.pdf","comment":"18 pages"},{"id":"http://arxiv.org/abs/2309.04965v1","updated":"2023-09-10T08:55:24Z","published":"2023-09-10T08:55:24Z","title":"Prefix-diffusion: A Lightweight Diffusion Model for Diverse Image\n Captioning","summary":" While impressive performance has been achieved in image captioning, the\nlimited diversity of the generated captions and the large parameter scale\nremain major barriers to the real-world application of these systems. In this\nwork, we propose a lightweight image captioning network in combination with\ncontinuous diffusion, called Prefix-diffusion. To achieve diversity, we design\nan efficient method that injects prefix image embeddings into the denoising\nprocess of the diffusion model. In order to reduce trainable parameters, we\nemploy a pre-trained model to extract image features and further design an\nextra mapping network. 
Prefix-diffusion is able to generate diverse captions\nwith relatively less parameters, while maintaining the fluency and relevance of\nthe captions benefiting from the generative capabilities of the diffusion\nmodel. Our work paves the way for scaling up diffusion models for image\ncaptioning, and achieves promising performance compared with recent approaches.\n","authors":["Guisheng Liu","Yi Li","Zhengcong Fei","Haiyan Fu","Xiangyang Luo","Yanqing Guo"],"pdf_url":"https://arxiv.org/pdf/2309.04965v1.pdf","comment":"11 pages,4 figures, 6 tables"},{"id":"http://arxiv.org/abs/2309.04961v1","updated":"2023-09-10T08:23:52Z","published":"2023-09-10T08:23:52Z","title":"Multi-modal Extreme Classification","summary":" This paper develops the MUFIN technique for extreme classification (XC) tasks\nwith millions of labels where datapoints and labels are endowed with visual and\ntextual descriptors. Applications of MUFIN to product-to-product recommendation\nand bid query prediction over several millions of products are presented.\nContemporary multi-modal methods frequently rely on purely embedding-based\nmethods. On the other hand, XC methods utilize classifier architectures to\noffer superior accuracies than embedding only methods but mostly focus on\ntext-based categorization tasks. MUFIN bridges this gap by reformulating\nmulti-modal categorization as an XC problem with several millions of labels.\nThis presents the twin challenges of developing multi-modal architectures that\ncan offer embeddings sufficiently expressive to allow accurate categorization\nover millions of labels; and training and inference routines that scale\nlogarithmically in the number of labels. MUFIN develops an architecture based\non cross-modal attention and trains it in a modular fashion using pre-training\nand positive and negative mining. A novel product-to-product recommendation\ndataset MM-AmazonTitles-300K containing over 300K products was curated from\npublicly available amazon.com listings with each product endowed with a title\nand multiple images. On the all datasets MUFIN offered at least 3% higher\naccuracy than leading text-based, image-based and multi-modal techniques. Code\nfor MUFIN is available at https://github.com/Extreme-classification/MUFIN\n","authors":["Anshul Mittal","Kunal Dahiya","Shreya Malani","Janani Ramaswamy","Seba Kuruvilla","Jitendra Ajmera","Keng-hao Chang","Sumeet Agarwal","Purushottam Kar","Manik Varma"],"pdf_url":"https://arxiv.org/pdf/2309.04961v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.04960v1","updated":"2023-09-10T08:16:02Z","published":"2023-09-10T08:16:02Z","title":"SdCT-GAN: Reconstructing CT from Biplanar X-Rays with Self-driven\n Generative Adversarial Networks","summary":" Computed Tomography (CT) is a medical imaging modality that can generate more\ninformative 3D images than 2D X-rays. However, this advantage comes at the\nexpense of more radiation exposure, higher costs, and longer acquisition time.\nHence, the reconstruction of 3D CT images using a limited number of 2D X-rays\nhas gained significant importance as an economical alternative. Nevertheless,\nexisting methods primarily prioritize minimizing pixel/voxel-level intensity\ndiscrepancies, often neglecting the preservation of textural details in the\nsynthesized images. This oversight directly impacts the quality of the\nreconstructed images and thus affects the clinical diagnosis. 
To address the\ndeficits, this paper presents a new self-driven generative adversarial network\nmodel (SdCT-GAN), which is motivated to pay more attention to image details by\nintroducing a novel auto-encoder structure in the discriminator. In addition, a\nSobel Gradient Guider (SGG) idea is applied throughout the model, where the\nedge information from the 2D X-ray image at the input can be integrated.\nMoreover, LPIPS (Learned Perceptual Image Patch Similarity) evaluation metric\nis adopted that can quantitatively evaluate the fine contours and textures of\nreconstructed images better than the existing ones. Finally, the qualitative\nand quantitative results of the empirical studies justify the power of the\nproposed model compared to mainstream state-of-the-art baselines.\n","authors":["Shuangqin Cheng","Qingliang Chen","Qiyi Zhang","Ming Li","Yamuhanmode Alike","Kaile Su","Pengcheng Wen"],"pdf_url":"https://arxiv.org/pdf/2309.04960v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.04958v1","updated":"2023-09-10T08:14:41Z","published":"2023-09-10T08:14:41Z","title":"Semi-Supervised learning for Face Anti-Spoofing using Apex frame","summary":" Conventional feature extraction techniques in the face anti-spoofing domain\neither analyze the entire video sequence or focus on a specific segment to\nimprove model performance. However, identifying the optimal frames that provide\nthe most valuable input for the face anti-spoofing remains a challenging task.\nIn this paper, we address this challenge by employing Gaussian weighting to\ncreate apex frames for videos. Specifically, an apex frame is derived from a\nvideo by computing a weighted sum of its frames, where the weights are\ndetermined using a Gaussian distribution centered around the video's central\nframe. Furthermore, we explore various temporal lengths to produce multiple\nunlabeled apex frames using a Gaussian function, without the need for\nconvolution. By doing so, we leverage the benefits of semi-supervised learning,\nwhich considers both labeled and unlabeled apex frames to effectively\ndiscriminate between live and spoof classes. Our key contribution emphasizes\nthe apex frame's capacity to represent the most significant moments in the\nvideo, while unlabeled apex frames facilitate efficient semi-supervised\nlearning, as they enable the model to learn from videos of varying temporal\nlengths. Experimental results using four face anti-spoofing databases: CASIA,\nREPLAY-ATTACK, OULU-NPU, and MSU-MFSD demonstrate the apex frame's efficacy in\nadvancing face anti-spoofing techniques.\n","authors":["Usman Muhammad","Mourad Oussalah","Jorma Laaksonen"],"pdf_url":"https://arxiv.org/pdf/2309.04958v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.04956v1","updated":"2023-09-10T08:07:58Z","published":"2023-09-10T08:07:58Z","title":"Anatomy Completor: A Multi-class Completion Framework for 3D Anatomy\n Reconstruction","summary":" In this paper, we introduce a completion framework to reconstruct the\ngeometric shapes of various anatomies, including organs, vessels and muscles.\nOur work targets a scenario where one or multiple anatomies are missing in the\nimaging data due to surgical, pathological or traumatic factors, or simply\nbecause these anatomies are not covered by image acquisition. Automatic\nreconstruction of the missing anatomies benefits many applications, such as\norgan 3D bio-printing, whole-body segmentation, animation realism,\npaleoradiology and forensic imaging. 
We propose two paradigms based on a 3D\ndenoising auto-encoder (DAE) to solve the anatomy reconstruction problem: (i)\nthe DAE learns a many-to-one mapping between incomplete and complete instances;\n(ii) the DAE learns directly a one-to-one residual mapping between the\nincomplete instances and the target anatomies. We apply a loss aggregation\nscheme that enables the DAE to learn the many-to-one mapping more effectively\nand further enhances the learning of the residual mapping. On top of this, we\nextend the DAE to a multiclass completor by assigning a unique label to each\nanatomy involved. We evaluate our method using a CT dataset with whole-body\nsegmentations. Results show that our method produces reasonable anatomy\nreconstructions given instances with different levels of incompleteness (i.e.,\none or multiple random anatomies are missing). Codes and pretrained models are\npublicly available at https://github.com/Jianningli/medshapenet-feedback/\ntree/main/anatomy-completor\n","authors":["Jianning Li","Antonio Pepe","Gijs Luijten","Christina Schwarz-Gsaxner","Jens Kleesiek","Jan Egger"],"pdf_url":"https://arxiv.org/pdf/2309.04956v1.pdf","comment":"15 pages"},{"id":"http://arxiv.org/abs/2211.06897v2","updated":"2023-09-10T07:12:27Z","published":"2022-11-13T13:08:59Z","title":"Batch-based Model Registration for Fast 3D Sherd Reconstruction","summary":" 3D reconstruction techniques have widely been used for digital documentation\nof archaeological fragments. However, efficient digital capture of fragments\nremains as a challenge. In this work, we aim to develop a portable,\nhigh-throughput, and accurate reconstruction system for efficient digitization\nof fragments excavated in archaeological sites. To realize high-throughput\ndigitization of large numbers of objects, an effective strategy is to perform\nscanning and reconstruction in batches. However, effective batch-based scanning\nand reconstruction face two key challenges: 1) how to correlate partial scans\nof the same object from multiple batch scans, and 2) how to register and\nreconstruct complete models from partial scans that exhibit only small\noverlaps. To tackle these two challenges, we develop a new batch-based matching\nalgorithm that pairs the front and back sides of the fragments, and a new\nBilateral Boundary ICP algorithm that can register partial scans sharing very\nnarrow overlapping regions. Extensive validation in labs and testing in\nexcavation sites demonstrate that these designs enable efficient batch-based\nscanning for fragments. We show that such a batch-based scanning and\nreconstruction pipeline can have immediate applications on digitizing sherds in\narchaeological excavations. Our project page:\nhttps://jiepengwang.github.io/FIRES/.\n","authors":["Jiepeng Wang","Congyi Zhang","Peng Wang","Xin Li","Peter J. Cobb","Christian Theobalt","Wenping Wang"],"pdf_url":"https://arxiv.org/pdf/2211.06897v2.pdf","comment":"Project page: https://jiepengwang.github.io/FIRES/"},{"id":"http://arxiv.org/abs/2309.04946v1","updated":"2023-09-10T06:33:17Z","published":"2023-09-10T06:33:17Z","title":"Efficient Emotional Adaptation for Audio-Driven Talking-Head Generation","summary":" Audio-driven talking-head synthesis is a popular research topic for virtual\nhuman-related applications. However, the inflexibility and inefficiency of\nexisting methods, which necessitate expensive end-to-end training to transfer\nemotions from guidance videos to talking-head predictions, are significant\nlimitations. 
In this work, we propose the Emotional Adaptation for Audio-driven\nTalking-head (EAT) method, which transforms emotion-agnostic talking-head\nmodels into emotion-controllable ones in a cost-effective and efficient manner\nthrough parameter-efficient adaptations. Our approach utilizes a pretrained\nemotion-agnostic talking-head transformer and introduces three lightweight\nadaptations (the Deep Emotional Prompts, Emotional Deformation Network, and\nEmotional Adaptation Module) from different perspectives to enable precise and\nrealistic emotion controls. Our experiments demonstrate that our approach\nachieves state-of-the-art performance on widely-used benchmarks, including LRW\nand MEAD. Additionally, our parameter-efficient adaptations exhibit remarkable\ngeneralization ability, even in scenarios where emotional training videos are\nscarce or nonexistent. Project website: https://yuangan.github.io/eat/\n","authors":["Yuan Gan","Zongxin Yang","Xihang Yue","Lingyun Sun","Yi Yang"],"pdf_url":"https://arxiv.org/pdf/2309.04946v1.pdf","comment":"Accepted to ICCV 2023. Project page: https://yuangan.github.io/eat/"},{"id":"http://arxiv.org/abs/2211.06797v2","updated":"2023-09-10T05:28:09Z","published":"2022-11-13T03:16:36Z","title":"Perceptual Video Coding for Machines via Satisfied Machine Ratio\n Modeling","summary":" Video Coding for Machines (VCM) aims to compress visual signals for machine\nanalysis. However, existing methods only consider a few machines, neglecting\nthe majority. Moreover, the machine perceptual characteristics are not\neffectively leveraged, leading to suboptimal compression efficiency. In this\npaper, we introduce Satisfied Machine Ratio (SMR) to address these issues. SMR\nstatistically measures the quality of compressed images and videos for machines\nby aggregating satisfaction scores from them. Each score is calculated based on\nthe difference in machine perceptions between original and compressed images.\nTargeting image classification and object detection tasks, we build two\nrepresentative machine libraries for SMR annotation and construct a large-scale\nSMR dataset to facilitate SMR studies. We then propose an SMR prediction model\nbased on the correlation between deep features differences and SMR.\nFurthermore, we introduce an auxiliary task to increase the prediction accuracy\nby predicting the SMR difference between two images in different quality\nlevels. Extensive experiments demonstrate that using the SMR models\nsignificantly improves compression performance for VCM, and the SMR models\ngeneralize well to unseen machines, traditional and neural codecs, and\ndatasets. In summary, SMR enables perceptual coding for machines and advances\nVCM from specificity to generality. Code is available at\n\\url{https://github.com/ywwynm/SMR}.\n","authors":["Qi Zhang","Shanshe Wang","Xinfeng Zhang","Chuanmin Jia","Zhao Wang","Siwei Ma","Wen Gao"],"pdf_url":"https://arxiv.org/pdf/2211.06797v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2206.04231v3","updated":"2023-09-10T05:16:07Z","published":"2022-06-09T02:47:29Z","title":"JNMR: Joint Non-linear Motion Regression for Video Frame Interpolation","summary":" Video frame interpolation (VFI) aims to generate predictive frames by warping\nlearnable motions from the bidirectional historical references. Most existing\nworks utilize spatio-temporal semantic information extractor to realize motion\nestimation and interpolation modeling. 
However, they insufficiently consider\nthe real mechanistic rationality of generated middle motions. In this paper, we\nreformulate VFI as a Joint Non-linear Motion Regression (JNMR) strategy to\nmodel the complicated motions of inter-frame. Specifically, the motion\ntrajectory between the target frame and the multiple reference frames is\nregressed by a temporal concatenation of multi-stage quadratic models. ConvLSTM\nis adopted to construct this joint distribution of complete motions in temporal\ndimension. Moreover, the feature learning network is designed to optimize for\nthe joint regression modeling. A coarse-to-fine synthesis enhancement module is\nalso conducted to learn visual dynamics at different resolutions through\nrepetitive regression and interpolation. Experimental results on VFI show that\nthe effectiveness and significant improvement of joint motion regression\ncompared with the state-of-the-art methods. The code is available at\nhttps://github.com/ruhig6/JNMR.\n","authors":["Meiqin Liu","Chenming Xu","Chao Yao","Chunyu Lin","Yao Zhao"],"pdf_url":"https://arxiv.org/pdf/2206.04231v3.pdf","comment":"Accepted by IEEE Transactions on Image Processing (TIP)"},{"id":"http://arxiv.org/abs/2309.04917v1","updated":"2023-09-10T02:31:50Z","published":"2023-09-10T02:31:50Z","title":"Text-driven Editing of 3D Scenes without Retraining","summary":" Numerous diffusion models have recently been applied to image synthesis and\nediting. However, editing 3D scenes is still in its early stages. It poses\nvarious challenges, such as the requirement to design specific methods for\ndifferent editing types, retraining new models for various 3D scenes, and the\nabsence of convenient human interaction during editing. To tackle these issues,\nwe introduce a text-driven editing method, termed DN2N, which allows for the\ndirect acquisition of a NeRF model with universal editing capabilities,\neliminating the requirement for retraining. Our method employs off-the-shelf\ntext-based editing models of 2D images to modify the 3D scene images, followed\nby a filtering process to discard poorly edited images that disrupt 3D\nconsistency. We then consider the remaining inconsistency as a problem of\nremoving noise perturbation, which can be solved by generating training data\nwith similar perturbation characteristics for training. We further propose\ncross-view regularization terms to help the generalized NeRF model mitigate\nthese perturbations. Our text-driven method allows users to edit a 3D scene\nwith their desired description, which is more friendly, intuitive, and\npractical than prior works. Empirical results show that our method achieves\nmultiple editing types, including but not limited to appearance editing,\nweather transition, material changing, and style transfer. Most importantly,\nour method generalizes well with editing abilities shared among a set of model\nparameters without requiring a customized editing model for some specific\nscenes, thus inferring novel views with editing effects directly from user\ninput. 
The project website is available at http://sk-fun.fun/DN2N\n","authors":["Shuangkang Fang","Yufeng Wang","Yi Yang","Yi-Hsuan Tsai","Wenrui Ding","Ming-Hsuan Yang","Shuchang Zhou"],"pdf_url":"https://arxiv.org/pdf/2309.04917v1.pdf","comment":"Project Website: http://sk-fun.fun/DN2N"},{"id":"http://arxiv.org/abs/2309.04914v1","updated":"2023-09-10T02:02:29Z","published":"2023-09-10T02:02:29Z","title":"MFPNet: Multi-scale Feature Propagation Network For Lightweight Semantic\n Segmentation","summary":" In contrast to the abundant research focusing on large-scale models, the\nprogress in lightweight semantic segmentation appears to be advancing at a\ncomparatively slower pace. However, existing compact methods often suffer from\nlimited feature representation capability due to the shallowness of their\nnetworks. In this paper, we propose a novel lightweight segmentation\narchitecture, called Multi-scale Feature Propagation Network (MFPNet), to\naddress the dilemma. Specifically, we design a robust Encoder-Decoder structure\nfeaturing symmetrical residual blocks that consist of flexible bottleneck\nresidual modules (BRMs) to explore deep and rich multi-scale semantic context.\nFurthermore, taking benefit from their capacity to model latent long-range\ncontextual relationships, we leverage Graph Convolutional Networks (GCNs) to\nfacilitate multi-scale feature propagation between the BRM blocks. When\nevaluated on benchmark datasets, our proposed approach shows superior\nsegmentation results.\n","authors":["Guoan Xu","Wenjing Jia","Tao Wu","Ligeng Chen"],"pdf_url":"https://arxiv.org/pdf/2309.04914v1.pdf","comment":"5 pages, 3 figures, 5 tables, conference"},{"id":"http://arxiv.org/abs/2309.04907v1","updated":"2023-09-10T01:23:05Z","published":"2023-09-10T01:23:05Z","title":"Effective Real Image Editing with Accelerated Iterative Diffusion\n Inversion","summary":" Despite all recent progress, it is still challenging to edit and manipulate\nnatural images with modern generative models. When using Generative Adversarial\nNetwork (GAN), one major hurdle is in the inversion process mapping a real\nimage to its corresponding noise vector in the latent space, since it is\nnecessary to be able to reconstruct an image to edit its contents. Likewise for\nDenoising Diffusion Implicit Models (DDIM), the linearization assumption in\neach inversion step makes the whole deterministic inversion process unreliable.\nExisting approaches that have tackled the problem of inversion stability often\nincur significant trade-offs in computational efficiency. In this work we\npropose an Accelerated Iterative Diffusion Inversion method, dubbed AIDI, that\nsignificantly improves reconstruction accuracy with minimal additional overhead\nin space and time complexity. By using a novel blended guidance technique, we\nshow that effective results can be obtained on a large range of image editing\ntasks without large classifier-free guidance in inversion. 
Furthermore, when\ncompared with other diffusion inversion based works, our proposed process is\nshown to be more robust for fast image editing in the 10 and 20 diffusion\nsteps' regimes.\n","authors":["Zhihong Pan","Riccardo Gherardi","Xiufeng Xie","Stephen Huang"],"pdf_url":"https://arxiv.org/pdf/2309.04907v1.pdf","comment":"Accepted to ICCV 2023 (Oral)"},{"id":"http://arxiv.org/abs/2207.06150v2","updated":"2023-09-10T00:35:41Z","published":"2022-07-13T12:19:01Z","title":"Estimating the Power Consumption of Heterogeneous Devices when\n performing AI Inference","summary":" Modern-day life is driven by electronic devices connected to the internet.\nThe emerging research field of the Internet-of-Things (IoT) has become popular,\njust as there has been a steady increase in the number of connected devices.\nSince many of these devices are utilised to perform CV tasks, it is essential\nto understand their power consumption against performance. We report the power\nconsumption profile and analysis of the NVIDIA Jetson Nano board while\nperforming object classification. The authors present an extensive analysis\nregarding power consumption per frame and the output in frames per second using\nYOLOv5 models. The results show that the YOLOv5n outperforms other YOLOV5\nvariants in terms of throughput (i.e. 12.34 fps) and low power consumption\n(i.e. 0.154 mWh/frame).\n","authors":["Pedro Machado","Ivica Matic","Francisco de Lemos","Isibor Kennedy Ihianle","David Ada Adama"],"pdf_url":"https://arxiv.org/pdf/2207.06150v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.04902v1","updated":"2023-09-10T00:08:29Z","published":"2023-09-10T00:08:29Z","title":"Transformers in Small Object Detection: A Benchmark and Survey of\n State-of-the-Art","summary":" Transformers have rapidly gained popularity in computer vision, especially in\nthe field of object recognition and detection. Upon examining the outcomes of\nstate-of-the-art object detection methods, we noticed that transformers\nconsistently outperformed well-established CNN-based detectors in almost every\nvideo or image dataset. While transformer-based approaches remain at the\nforefront of small object detection (SOD) techniques, this paper aims to\nexplore the performance benefits offered by such extensive networks and\nidentify potential reasons for their SOD superiority. Small objects have been\nidentified as one of the most challenging object types in detection frameworks\ndue to their low visibility. We aim to investigate potential strategies that\ncould enhance transformers' performance in SOD. This survey presents a taxonomy\nof over 60 research studies on developed transformers for the task of SOD,\nspanning the years 2020 to 2023. These studies encompass a variety of detection\napplications, including small object detection in generic images, aerial\nimages, medical images, active millimeter images, underwater images, and\nvideos. We also compile and present a list of 12 large-scale datasets suitable\nfor SOD that were overlooked in previous studies and compare the performance of\nthe reviewed studies using popular metrics such as mean Average Precision\n(mAP), Frames Per Second (FPS), number of parameters, and more. 
Researchers can\nkeep track of newer studies on our web page, which is available at\n\\url{https://github.com/arekavandi/Transformer-SOD}.\n","authors":["Aref Miri Rekavandi","Shima Rashidi","Farid Boussaid","Stephen Hoefs","Emre Akbas","Mohammed bennamoun"],"pdf_url":"https://arxiv.org/pdf/2309.04902v1.pdf","comment":null}],"Information Retrieval":[{"id":"http://arxiv.org/abs/2309.05127v1","updated":"2023-09-10T20:22:56Z","published":"2023-09-10T20:22:56Z","title":"Learning Personalized User Preference from Cold Start in Multi-turn\n Conversations","summary":" This paper presents a novel teachable conversation interaction system that is\ncapable of learning users preferences from cold start by gradually adapting to\npersonal preferences. In particular, the TAI system is able to automatically\nidentify and label user preference in live interactions, manage dialogue flows\nfor interactive teaching sessions, and reuse learned preference for preference\nelicitation. We develop the TAI system by leveraging BERT encoder models to\nencode both dialogue and relevant context information, and build action\nprediction (AP), argument filling (AF) and named entity recognition (NER)\nmodels to understand the teaching session. We adopt a seeker-provider\ninteraction loop mechanism to generate diverse dialogues from cold-start. TAI\nis capable of learning user preference, which achieves 0.9122 turn level\naccuracy on out-of-sample dataset, and has been successfully adopted in\nproduction.\n","authors":["Deguang Kong","Abhay Jha","Lei Yun"],"pdf_url":"https://arxiv.org/pdf/2309.05127v1.pdf","comment":"preference, personalization, cold-start, dialogue, LLM. embedding"},{"id":"http://arxiv.org/abs/2309.05113v1","updated":"2023-09-10T19:01:12Z","published":"2023-09-10T19:01:12Z","title":"Personalized Search Via Neural Contextual Semantic Relevance Ranking","summary":" Existing neural relevance models do not give enough consideration for query\nand item context information which diversifies the search results to adapt for\npersonal preference. To bridge this gap, this paper presents a neural learning\nframework to personalize document ranking results by leveraging the signals to\ncapture how the document fits into users' context. In particular, it models the\nrelationships between document content and user query context using both\nlexical representations and semantic embeddings such that the user's intent can\nbe better understood by data enrichment of personalized query context\ninformation. Extensive experiments performed on the search dataset, demonstrate\nthe effectiveness of the proposed method.\n","authors":["Deguang Kong","Daniel Zhou","Zhiheng Huang","Steph Sigalas"],"pdf_url":"https://arxiv.org/pdf/2309.05113v1.pdf","comment":"Contextual, Personalization, Search, Semantics, LLM, embedding"},{"id":"http://arxiv.org/abs/2309.05035v1","updated":"2023-09-10T14:13:54Z","published":"2023-09-10T14:13:54Z","title":"Duplicate Question Retrieval and Confirmation Time Prediction in\n Software Communities","summary":" Community Question Answering (CQA) in different domains is growing at a large\nscale because of the availability of several platforms and huge shareable\ninformation among users. With the rapid growth of such online platforms, a\nmassive amount of archived data makes it difficult for moderators to retrieve\npossible duplicates for a new question and identify and confirm existing\nquestion pairs as duplicates at the right time. 
This problem is even more\ncritical in CQAs corresponding to large software systems like askubuntu where\nmoderators need to be experts to comprehend something as a duplicate. Note that\nthe prime challenge in such CQA platforms is that the moderators are themselves\nexperts and are therefore usually extremely busy with their time being\nextraordinarily expensive. To facilitate the task of the moderators, in this\nwork, we have tackled two significant issues for the askubuntu CQA platform:\n(1) retrieval of duplicate questions given a new question and (2) duplicate\nquestion confirmation time prediction. In the first task, we focus on\nretrieving duplicate questions from a question pool for a particular newly\nposted question. In the second task, we solve a regression problem to rank a\npair of questions that could potentially take a long time to get confirmed as\nduplicates. For duplicate question retrieval, we propose a Siamese neural\nnetwork based approach by exploiting both text and network-based features,\nwhich outperforms several state-of-the-art baseline techniques. Our method\noutperforms DupPredictor and DUPE by 5% and 7% respectively. For duplicate\nconfirmation time prediction, we have used both the standard machine learning\nmodels and neural network along with the text and graph-based features. We\nobtain Spearman's rank correlation of 0.20 and 0.213 (statistically\nsignificant) for text and graph based features respectively.\n","authors":["Rima Hazra","Debanjan Saha","Amruit Sahoo","Somnath Banerjee","Animesh Mukherjee"],"pdf_url":"https://arxiv.org/pdf/2309.05035v1.pdf","comment":"Full paper accepted at ASONAM 2023: The 2023 IEEE/ACM International\n Conference on Advances in Social Networks Analysis and Mining"},{"id":"http://arxiv.org/abs/2309.04981v1","updated":"2023-09-10T10:09:21Z","published":"2023-09-10T10:09:21Z","title":"Streamlined Data Fusion: Unleashing the Power of Linear Combination with\n Minimal Relevance Judgments","summary":" Linear combination is a potent data fusion method in information retrieval\ntasks, thanks to its ability to adjust weights for diverse scenarios. However,\nachieving optimal weight training has traditionally required manual relevance\njudgments on a large percentage of documents, a labor-intensive and expensive\nprocess. In this study, we investigate the feasibility of obtaining\nnear-optimal weights using a mere 20\\%-50\\% of relevant documents. Through\nexperiments on four TREC datasets, we find that weights trained with multiple\nlinear regression using this reduced set closely rival those obtained with\nTREC's official \"qrels.\" Our findings unlock the potential for more efficient\nand affordable data fusion, empowering researchers and practitioners to reap\nits full benefits with significantly less effort.\n","authors":["Qiuyu Xua","Yidong Huanga","Shengli Wua"],"pdf_url":"https://arxiv.org/pdf/2309.04981v1.pdf","comment":"12 pages, 8 figures"},{"id":"http://arxiv.org/abs/2309.04961v1","updated":"2023-09-10T08:23:52Z","published":"2023-09-10T08:23:52Z","title":"Multi-modal Extreme Classification","summary":" This paper develops the MUFIN technique for extreme classification (XC) tasks\nwith millions of labels where datapoints and labels are endowed with visual and\ntextual descriptors. Applications of MUFIN to product-to-product recommendation\nand bid query prediction over several millions of products are presented.\nContemporary multi-modal methods frequently rely on purely embedding-based\nmethods. 
On the other hand, XC methods utilize classifier architectures to\noffer superior accuracies to embedding only methods but mostly focus on\ntext-based categorization tasks. MUFIN bridges this gap by reformulating\nmulti-modal categorization as an XC problem with several millions of labels.\nThis presents the twin challenges of developing multi-modal architectures that\ncan offer embeddings sufficiently expressive to allow accurate categorization\nover millions of labels; and training and inference routines that scale\nlogarithmically in the number of labels. MUFIN develops an architecture based\non cross-modal attention and trains it in a modular fashion using pre-training\nand positive and negative mining. A novel product-to-product recommendation\ndataset MM-AmazonTitles-300K containing over 300K products was curated from\npublicly available amazon.com listings with each product endowed with a title\nand multiple images. On all datasets, MUFIN offered at least 3% higher\naccuracy than leading text-based, image-based and multi-modal techniques. Code\nfor MUFIN is available at https://github.com/Extreme-classification/MUFIN\n","authors":["Anshul Mittal","Kunal Dahiya","Shreya Malani","Janani Ramaswamy","Seba Kuruvilla","Jitendra Ajmera","Keng-hao Chang","Sumeet Agarwal","Purushottam Kar","Manik Varma"],"pdf_url":"https://arxiv.org/pdf/2309.04961v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.04949v1","updated":"2023-09-10T07:10:31Z","published":"2023-09-10T07:10:31Z","title":"A multiple k-means cluster ensemble framework for clustering citation\n trajectories","summary":" Citation maturity time varies for different articles. However, the impact of\nall articles is measured in a fixed window. Clustering their citation\ntrajectories helps understand the knowledge diffusion process and reveals that\nnot all articles gain immediate success after publication. Moreover, clustering\ntrajectories is necessary for paper impact recommendation algorithms. It is a\nchallenging problem because citation time series exhibit significant\nvariability due to non-linear and non-stationary characteristics. Prior works\npropose a set of arbitrary thresholds and a fixed rule based approach. All\nmethods are primarily parameter dependent. Consequently, it leads to\ninconsistencies while defining similar trajectories and ambiguities regarding\ntheir specific number. Most studies only capture extreme trajectories. Thus, a\ngeneralised clustering framework is required. This paper proposes a feature\nbased multiple k means cluster ensemble framework. 195,783 and 41,732 well\ncited articles from the Microsoft Academic Graph data are considered for\nclustering short term (10 year) and long term (30 year) trajectories,\nrespectively. It has linear run time. Four distinct trajectories are obtained:\nEarly Rise Rapid Decline (2.2%), Early Rise Slow Decline (45%), Delayed Rise No\nDecline (53%), and Delayed Rise Slow Decline (0.8%). Individual trajectory\ndifferences for two different spans are studied. Most papers exhibit Early Rise\nSlow Decline and Delayed Rise No Decline patterns. The growth and decay times,\ncumulative citation distribution, and peak characteristics of individual\ntrajectories are redefined empirically. A detailed comparative study reveals\nour proposed methodology can detect all distinct trajectory classes.\n","authors":["Joyita Chakraborty","Dinesh K. 
Pradhan","Subrata Nandi"],"pdf_url":"https://arxiv.org/pdf/2309.04949v1.pdf","comment":"29 pages"}],"Machine Learning":[{"id":"http://arxiv.org/abs/2309.05153v1","updated":"2023-09-10T22:05:24Z","published":"2023-09-10T22:05:24Z","title":"Learning Energy-Based Models by Cooperative Diffusion Recovery\n Likelihood","summary":" Training energy-based models (EBMs) with maximum likelihood estimation on\nhigh-dimensional data can be both challenging and time-consuming. As a result,\nthere a noticeable gap in sample quality between EBMs and other generative\nframeworks like GANs and diffusion models. To close this gap, inspired by the\nrecent efforts of learning EBMs by maximimizing diffusion recovery likelihood\n(DRL), we propose cooperative diffusion recovery likelihood (CDRL), an\neffective approach to tractably learn and sample from a series of EBMs defined\non increasingly noisy versons of a dataset, paired with an initializer model\nfor each EBM. At each noise level, the initializer model learns to amortize the\nsampling process of the EBM, and the two models are jointly estimated within a\ncooperative training framework. Samples from the initializer serve as starting\npoints that are refined by a few sampling steps from the EBM. With the refined\nsamples, the EBM is optimized by maximizing recovery likelihood, while the\ninitializer is optimized by learning from the difference between the refined\nsamples and the initial samples. We develop a new noise schedule and a variance\nreduction technique to further improve the sample quality. Combining these\nadvances, we significantly boost the FID scores compared to existing EBM\nmethods on CIFAR-10 and ImageNet 32x32, with a 2x speedup over DRL. In\naddition, we extend our method to compositional generation and image inpainting\ntasks, and showcase the compatibility of CDRL with classifier-free guidance for\nconditional generation, achieving similar trade-offs between sample quality and\nsample diversity as in diffusion models.\n","authors":["Yaxuan Zhu","Jianwen Xie","Yingnian Wu","Ruiqi Gao"],"pdf_url":"https://arxiv.org/pdf/2309.05153v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.05150v1","updated":"2023-09-10T21:54:03Z","published":"2023-09-10T21:54:03Z","title":"Faster, Lighter, More Accurate: A Deep Learning Ensemble for Content\n Moderation","summary":" To address the increasing need for efficient and accurate content moderation,\nwe propose an efficient and lightweight deep classification ensemble structure.\nOur approach is based on a combination of simple visual features, designed for\nhigh-accuracy classification of violent content with low false positives. Our\nensemble architecture utilizes a set of lightweight models with narrowed-down\ncolor features, and we apply it to both images and videos.\n We evaluated our approach using a large dataset of explosion and blast\ncontents and compared its performance to popular deep learning models such as\nResNet-50. Our evaluation results demonstrate significant improvements in\nprediction accuracy, while benefiting from 7.64x faster inference and lower\ncomputation cost.\n While our approach is tailored to explosion detection, it can be applied to\nother similar content moderation and violence detection use cases as well.\nBased on our experiments, we propose a \"think small, think many\" philosophy in\nclassification scenarios. 
We argue that transforming a single, large,\nmonolithic deep model into a verification-based step model ensemble of multiple\nsmall, simple, and lightweight models with narrowed-down visual features can\npossibly lead to predictions with higher accuracy.\n","authors":["Mohammad Hosseini","Mahmudul Hasan"],"pdf_url":"https://arxiv.org/pdf/2309.05150v1.pdf","comment":"6 pages, 22nd IEEE International Conference on Machine Learning and\n Applications (IEEE ICMLA'23), December 15-17, 2023, Jacksonville Riverfront,\n Florida, USA. arXiv admin note: substantial text overlap with\n arXiv:2103.10350"},{"id":"http://arxiv.org/abs/2309.05145v1","updated":"2023-09-10T21:36:38Z","published":"2023-09-10T21:36:38Z","title":"Outlier Robust Adversarial Training","summary":" Supervised learning models are challenged by the intrinsic complexities of\ntraining data such as outliers and minority subpopulations and intentional\nattacks at inference time with adversarial samples. While traditional robust\nlearning methods and the recent adversarial training approaches are designed to\nhandle each of the two challenges, to date, no work has been done to develop\nmodels that are robust with regard to the low-quality training data and the\npotential adversarial attack at inference time simultaneously. It is for this\nreason that we introduce Outlier Robust Adversarial Training (ORAT) in this\nwork. ORAT is based on a bi-level optimization formulation of adversarial\ntraining with a robust rank-based loss function. Theoretically, we show that\nthe learning objective of ORAT satisfies the $\\mathcal{H}$-consistency in\nbinary classification, which establishes it as a proper surrogate to\nadversarial 0/1 loss. Furthermore, we analyze its generalization ability and\nprovide uniform convergence rates in high probability. ORAT can be optimized\nwith a simple algorithm. Experimental evaluations on three benchmark datasets\ndemonstrate the effectiveness and robustness of ORAT in handling outliers and\nadversarial attacks. Our code is available at\nhttps://github.com/discovershu/ORAT.\n","authors":["Shu Hu","Zhenhuan Yang","Xin Wang","Yiming Ying","Siwei Lyu"],"pdf_url":"https://arxiv.org/pdf/2309.05145v1.pdf","comment":"Accepted by The 15th Asian Conference on Machine Learning (ACML 2023)"},{"id":"http://arxiv.org/abs/2305.11290v3","updated":"2023-09-10T21:08:35Z","published":"2023-05-18T20:14:28Z","title":"Massively Scalable Inverse Reinforcement Learning in Google Maps","summary":" Optimizing for humans' latent preferences remains a grand challenge in route\nrecommendation. Prior research has provided increasingly general techniques\nbased on inverse reinforcement learning (IRL), yet no approach has been\nsuccessfully scaled to world-sized routing problems with hundreds of millions\nof states and demonstration trajectories. In this paper, we provide methods for\nscaling IRL using graph compression, spatial parallelization, and problem\ninitialization based on dominant eigenvectors. We revisit classic algorithms\nand study them in a large-scale setting, and make the key observation that\nthere exists a trade-off between the use of cheap, deterministic planners and\nexpensive yet robust stochastic policies. We leverage this insight in Receding\nHorizon Inverse Planning (RHIP), a new generalization of classic IRL algorithms\nthat provides fine-grained control over performance trade-offs via its planning\nhorizon. 
Our contributions culminate in a policy that achieves a 16-24%\nimprovement in global route quality, and to the best of our knowledge,\nrepresents the largest instance of IRL in a real-world setting to date.\nBenchmark results show critical benefits to more sustainable modes of\ntransportation, where factors beyond journey time play a substantial role. We\nconclude by conducting an ablation study of key components, presenting negative\nresults from alternative eigenvalue solvers, and identifying opportunities to\nfurther improve scalability via IRL-specific batching strategies.\n","authors":["Matt Barnes","Matthew Abueg","Oliver F. Lange","Matt Deeds","Jason Trader","Denali Molitor","Markus Wulfmeier","Shawn O'Banion"],"pdf_url":"https://arxiv.org/pdf/2305.11290v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.05132v1","updated":"2023-09-10T20:39:53Z","published":"2023-09-10T20:39:53Z","title":"DAD++: Improved Data-free Test Time Adversarial Defense","summary":" With the increasing deployment of deep neural networks in safety-critical\napplications such as self-driving cars, medical imaging, anomaly detection,\netc., adversarial robustness has become a crucial concern in the reliability of\nthese networks in real-world scenarios. A plethora of works based on\nadversarial training and regularization-based techniques have been proposed to\nmake these deep networks robust against adversarial attacks. However, these\nmethods require either retraining models or training them from scratch, making\nthem infeasible to defend pre-trained models when access to training data is\nrestricted. To address this problem, we propose a test time Data-free\nAdversarial Defense (DAD) containing detection and correction frameworks.\nMoreover, to further improve the efficacy of the correction framework in cases\nwhen the detector is under-confident, we propose a soft-detection scheme\n(dubbed as \"DAD++\"). We conduct a wide range of experiments and ablations on\nseveral datasets and network architectures to show the efficacy of our proposed\napproach. Furthermore, we demonstrate the applicability of our approach in\nimparting adversarial defense at test time under data-free (or data-efficient)\napplications/setups, such as Data-free Knowledge Distillation and Source-free\nUnsupervised Domain Adaptation, as well as Semi-supervised classification\nframeworks. We observe that in all the experiments and applications, our DAD++\ngives an impressive performance against various adversarial attacks with a\nminimal drop in clean accuracy. The source code is available at:\nhttps://github.com/vcl-iisc/Improved-Data-free-Test-Time-Adversarial-Defense\n","authors":["Gaurav Kumar Nayak","Inder Khatri","Shubham Randive","Ruchit Rawal","Anirban Chakraborty"],"pdf_url":"https://arxiv.org/pdf/2309.05132v1.pdf","comment":"IJCV Journal (Under Review)"},{"id":"http://arxiv.org/abs/2306.13830v2","updated":"2023-09-10T20:31:55Z","published":"2023-06-24T01:14:48Z","title":"Improved Aircraft Environmental Impact Segmentation via Metric Learning","summary":" Accurate modeling of aircraft environmental impact is pivotal to the design\nof operational procedures and policies to mitigate negative aviation\nenvironmental impact. Aircraft environmental impact segmentation is a process\nwhich clusters aircraft types that have similar environmental impact\ncharacteristics based on a set of aircraft features. 
This practice helps model\na large population of aircraft types with insufficient aircraft noise and\nperformance models and contributes to better understanding of aviation\nenvironmental impact. Through measuring the similarity between aircraft types,\ndistance metric is the kernel of aircraft segmentation. Traditional ways of\naircraft segmentation use plain distance metrics and assign equal weight to all\nfeatures in an unsupervised clustering process. In this work, we utilize\nweakly-supervised metric learning and partial information on aircraft fuel\nburn, emissions, and noise to learn weighted distance metrics for aircraft\nenvironmental impact segmentation. We show in a comprehensive case study that\nthe tailored distance metrics can indeed make aircraft segmentation better\nreflect the actual environmental impact of aircraft. The metric learning\napproach can help refine a number of similar data-driven analytical studies in\naviation.\n","authors":["Zhenyu Gao","Dimitri N. Mavris"],"pdf_url":"https://arxiv.org/pdf/2306.13830v2.pdf","comment":"32 pages, 11 figures"},{"id":"http://arxiv.org/abs/2309.05131v1","updated":"2023-09-10T20:31:25Z","published":"2023-09-10T20:31:25Z","title":"Signal Temporal Logic Neural Predictive Control","summary":" Ensuring safety and meeting temporal specifications are critical challenges\nfor long-term robotic tasks. Signal temporal logic (STL) has been widely used\nto systematically and rigorously specify these requirements. However,\ntraditional methods of finding the control policy under those STL requirements\nare computationally complex and not scalable to high-dimensional or systems\nwith complex nonlinear dynamics. Reinforcement learning (RL) methods can learn\nthe policy to satisfy the STL specifications via hand-crafted or STL-inspired\nrewards, but might encounter unexpected behaviors due to ambiguity and sparsity\nin the reward. In this paper, we propose a method to directly learn a neural\nnetwork controller to satisfy the requirements specified in STL. Our controller\nlearns to roll out trajectories to maximize the STL robustness score in\ntraining. In testing, similar to Model Predictive Control (MPC), the learned\ncontroller predicts a trajectory within a planning horizon to ensure the\nsatisfaction of the STL requirement in deployment. A backup policy is designed\nto ensure safety when our controller fails. Our approach can adapt to various\ninitial conditions and environmental parameters. We conduct experiments on six\ntasks, where our method with the backup policy outperforms the classical\nmethods (MPC, STL-solver), model-free and model-based RL methods in STL\nsatisfaction rate, especially on tasks with complex STL specifications while\nbeing 10X-100X faster than the classical methods.\n","authors":["Yue Meng","Chuchu Fan"],"pdf_url":"https://arxiv.org/pdf/2309.05131v1.pdf","comment":"Accepted by IEEE Robotics and Automation Letters (RA-L) and ICRA2024"},{"id":"http://arxiv.org/abs/2309.05130v1","updated":"2023-09-10T20:30:03Z","published":"2023-09-10T20:30:03Z","title":"The online learning architecture with edge computing for high-level\n control for assisting patients","summary":" The prevalence of mobility impairments due to conditions such as spinal cord\ninjuries, strokes, and degenerative diseases is on the rise globally.\nLower-limb exoskeletons have been increasingly recognized as a viable solution\nfor enhancing mobility and rehabilitation for individuals with such\nimpairments. 
However, existing exoskeleton control systems often suffer from\nlimitations such as latency, lack of adaptability, and computational\ninefficiency. To address these challenges, this paper introduces a novel online\nadversarial learning architecture integrated with edge computing for high-level\nlower-limb exoskeleton control. In the proposed architecture, sensor data from\nthe user is processed in real-time through edge computing nodes, which then\ninteract with an online adversarial learning model. This model adapts to the\nuser's specific needs and controls the exoskeleton with minimal latency.\nExperimental evaluations demonstrate significant improvements in control\naccuracy and adaptability, as well as enhanced quality-of-service (QoS)\nmetrics. These findings indicate that the integration of online adversarial\nlearning with edge computing offers a robust and efficient approach for the\nnext generation of lower-limb exoskeleton control systems.\n","authors":["Yue Shi","Yihui Zhao"],"pdf_url":"https://arxiv.org/pdf/2309.05130v1.pdf","comment":"10 pages"},{"id":"http://arxiv.org/abs/2308.04650v2","updated":"2023-09-10T20:28:21Z","published":"2023-08-09T01:30:07Z","title":"Deep Metric Learning for the Hemodynamics Inference with\n Electrocardiogram Signals","summary":" Heart failure is a debilitating condition that affects millions of people\nworldwide and has a significant impact on their quality of life and mortality\nrates. An objective assessment of cardiac pressures remains an important method\nfor the diagnosis and treatment prognostication for patients with heart\nfailure. Although cardiac catheterization is the gold standard for estimating\ncentral hemodynamic pressures, it is an invasive procedure that carries\ninherent risks, making it a potentially dangerous procedure for some patients.\nApproaches that leverage non-invasive signals - such as electrocardiogram (ECG)\n- have the promise to make the routine estimation of cardiac pressures feasible\nin both inpatient and outpatient settings. Prior models trained to estimate\nintracardiac pressures (e.g., mean pulmonary capillary wedge pressure (mPCWP))\nin a supervised fashion have shown good discriminatory ability but have been\nlimited to the labeled dataset from the heart failure cohort. To address this\nissue and build a robust representation, we apply deep metric learning (DML)\nand propose a novel self-supervised DML with distance-based mining that\nimproves the performance of a model with limited labels. We use a dataset that\ncontains over 5.4 million ECGs without concomitant central pressure labels to\npre-train a self-supervised DML model which showed improved classification of\nelevated mPCWP compared to self-supervised contrastive baselines. Additionally,\nthe supervised DML model that uses ECGs with access to 8,172 mPCWP labels\ndemonstrated significantly better performance on the mPCWP regression task\ncompared to the supervised baseline. Moreover, our data suggest that DML yields\nmodels that are performant across patient subgroups, even when some patient\nsubgroups are under-represented in the dataset. Our code is available at\nhttps://github.com/mandiehyewon/ssldml\n","authors":["Hyewon Jeong","Collin M. Stultz","Marzyeh Ghassemi"],"pdf_url":"https://arxiv.org/pdf/2308.04650v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2007.15386v2","updated":"2023-09-10T20:14:39Z","published":"2020-07-30T11:24:05Z","title":"ResNet After All? 
Neural ODEs and Their Numerical Solution","summary":" A key appeal of the recently proposed Neural Ordinary Differential Equation\n(ODE) framework is that it seems to provide a continuous-time extension of\ndiscrete residual neural networks. As we show herein, though, trained Neural\nODE models actually depend on the specific numerical method used during\ntraining. If the trained model is supposed to be a flow generated from an ODE,\nit should be possible to choose another numerical solver with equal or smaller\nnumerical error without loss of performance. We observe that if training relies\non a solver with overly coarse discretization, then testing with another solver\nof equal or smaller numerical error results in a sharp drop in accuracy. In\nsuch cases, the combination of vector field and numerical method cannot be\ninterpreted as a flow generated from an ODE, which arguably poses a fatal\nbreakdown of the Neural ODE concept. We observe, however, that there exists a\ncritical step size beyond which the training yields a valid ODE vector field.\nWe propose a method that monitors the behavior of the ODE solver during\ntraining to adapt its step size, aiming to ensure a valid ODE without\nunnecessarily increasing computational cost. We verify this adaptation\nalgorithm on a common bench mark dataset as well as a synthetic dataset.\n","authors":["Katharina Ott","Prateek Katiyar","Philipp Hennig","Michael Tiemann"],"pdf_url":"https://arxiv.org/pdf/2007.15386v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2211.04476v2","updated":"2023-09-10T20:04:30Z","published":"2022-11-08T19:00:00Z","title":"Discover, Explanation, Improvement: An Automatic Slice Detection\n Framework for Natural Language Processing","summary":" Pretrained natural language processing (NLP) models have achieved high\noverall performance, but they still make systematic errors. Instead of manual\nerror analysis, research on slice detection models (SDM), which automatically\nidentify underperforming groups of datapoints, has caught escalated attention\nin Computer Vision for both understanding model behaviors and providing\ninsights for future model training and designing. However, little research on\nSDM and quantitative evaluation of their effectiveness have been conducted on\nNLP tasks. Our paper fills the gap by proposing a benchmark named \"Discover,\nExplain, Improve (DEIM)\" for classification NLP tasks along with a new SDM\nEdisa. Edisa discovers coherent and underperforming groups of datapoints; DEIM\nthen unites them under human-understandable concepts and provides comprehensive\nevaluation tasks and corresponding quantitative metrics. The evaluation in DEIM\nshows that Edisa can accurately select error-prone datapoints with informative\nsemantic features that summarize error patterns. 
Detecting difficult datapoints\ndirectly boosts model performance without tuning any original model parameters,\nshowing that discovered slices are actionable for users.\n","authors":["Wenyue Hua","Lifeng Jin","Linfeng Song","Haitao Mi","Yongfeng Zhang","Dong Yu"],"pdf_url":"https://arxiv.org/pdf/2211.04476v2.pdf","comment":"15 pages, 5 figures, accepted by Transactions of the Association for\n Computational Linguistics"},{"id":"http://arxiv.org/abs/2301.08360v3","updated":"2023-09-10T19:18:30Z","published":"2023-01-19T23:36:23Z","title":"Domain-adapted Learning and Imitation: DRL for Power Arbitrage","summary":" In this paper, we discuss the Dutch power market, which is comprised of a\nday-ahead market and an intraday balancing market that operates like an\nauction. Due to fluctuations in power supply and demand, there is often an\nimbalance that leads to different prices in the two markets, providing an\nopportunity for arbitrage. To address this issue, we restructure the problem\nand propose a collaborative dual-agent reinforcement learning approach for this\nbi-level simulation and optimization of European power arbitrage trading. We\nalso introduce two new implementations designed to incorporate domain-specific\nknowledge by imitating the trading behaviours of power traders. By utilizing\nreward engineering to imitate domain expertise, we are able to reform the\nreward system for the RL agent, which improves convergence during training and\nenhances overall performance. Additionally, the tranching of orders increases\nbidding success rates and significantly boosts profit and loss (P&L). Our study\ndemonstrates that by leveraging domain expertise in a general learning problem,\nthe performance can be improved substantially, and the final integrated\napproach leads to a three-fold improvement in cumulative P&L compared to the\noriginal agent. Furthermore, our methodology outperforms the highest benchmark\npolicy by around 50% while maintaining efficient computational performance.\n","authors":["Yuanrong Wang","Vignesh Raja Swaminathan","Nikita P. Granger","Carlos Ros Perez","Christian Michler"],"pdf_url":"https://arxiv.org/pdf/2301.08360v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2305.13248v2","updated":"2023-09-10T19:17:28Z","published":"2023-05-22T17:19:09Z","title":"Bayesian Numerical Integration with Neural Networks","summary":" Bayesian probabilistic numerical methods for numerical integration offer\nsignificant advantages over their non-Bayesian counterparts: they can encode\nprior information about the integrand, and can quantify uncertainty over\nestimates of an integral. However, the most popular algorithm in this class,\nBayesian quadrature, is based on Gaussian process models and is therefore\nassociated with a high computational cost. To improve scalability, we propose\nan alternative approach based on Bayesian neural networks which we call\nBayesian Stein networks. The key ingredients are a neural network architecture\nbased on Stein operators, and an approximation of the Bayesian posterior based\non the Laplace approximation. 
We show that this leads to orders of magnitude\nspeed-ups on the popular Genz functions benchmark, and on challenging problems\narising in the Bayesian analysis of dynamical systems, and the prediction of\nenergy production for a large-scale wind farm.\n","authors":["Katharina Ott","Michael Tiemann","Philipp Hennig","François-Xavier Briol"],"pdf_url":"https://arxiv.org/pdf/2305.13248v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.05107v1","updated":"2023-09-10T18:28:48Z","published":"2023-09-10T18:28:48Z","title":"Nonlinear Granger Causality using Kernel Ridge Regression","summary":" I introduce a novel algorithm and accompanying Python library, named\nmlcausality, designed for the identification of nonlinear Granger causal\nrelationships. This novel algorithm uses a flexible plug-in architecture that\nenables researchers to employ any nonlinear regressor as the base prediction\nmodel. Subsequently, I conduct a comprehensive performance analysis of\nmlcausality when the prediction regressor is the kernel ridge regressor with\nthe radial basis function kernel. The results demonstrate that mlcausality\nemploying kernel ridge regression achieves competitive AUC scores across a\ndiverse set of simulated data. Furthermore, mlcausality with kernel ridge\nregression yields more finely calibrated $p$-values in comparison to rival\nalgorithms. This enhancement enables mlcausality to attain superior accuracy\nscores when using intuitive $p$-value-based thresholding criteria. Finally,\nmlcausality with the kernel ridge regression exhibits significantly reduced\ncomputation times compared to existing nonlinear Granger causality algorithms.\nIn fact, in numerous instances, this innovative approach achieves superior\nsolutions within computational timeframes that are an order of magnitude\nshorter than those required by competing algorithms.\n","authors":["Wojciech \"Victor\" Fulmyk"],"pdf_url":"https://arxiv.org/pdf/2309.05107v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.05105v1","updated":"2023-09-10T18:24:43Z","published":"2023-09-10T18:24:43Z","title":"Convex Q Learning in a Stochastic Environment: Extended Version","summary":" The paper introduces the first formulation of convex Q-learning for Markov\ndecision processes with function approximation. The algorithms and theory rest\non a relaxation of a dual of Manne's celebrated linear programming\ncharacterization of optimal control. The main contributions firstly concern\nproperties of the relaxation, described as a deterministic convex program: we\nidentify conditions for a bounded solution, and a significant relationship\nbetween the solution to the new convex program, and the solution to standard\nQ-learning. The second set of contributions concern algorithm design and\nanalysis: (i) A direct model-free method for approximating the convex program\nfor Q-learning shares properties with its ideal. 
In particular, a bounded\nsolution is ensured subject to a simple property of the basis functions; (ii)\nThe proposed algorithms are convergent and new techniques are introduced to\nobtain the rate of convergence in a mean-square sense; (iii) The approach can\nbe generalized to a range of performance criteria, and it is found that\nvariance can be reduced by considering ``relative'' dynamic programming\nequations; (iv) The theory is illustrated with an application to a classical\ninventory control problem.\n","authors":["Fan Lu","Sean Meyn"],"pdf_url":"https://arxiv.org/pdf/2309.05105v1.pdf","comment":"Extended version of \"Convex Q-learning in a stochastic environment\",\n IEEE Conference on Decision and Control, 2023 (to appear)"},{"id":"http://arxiv.org/abs/2202.03558v2","updated":"2023-09-10T18:13:59Z","published":"2022-02-07T23:28:22Z","title":"Attacking c-MARL More Effectively: A Data Driven Approach","summary":" In recent years, a proliferation of methods were developed for cooperative\nmulti-agent reinforcement learning (c-MARL). However, the robustness of c-MARL\nagents against adversarial attacks has been rarely explored. In this paper, we\npropose to evaluate the robustness of c-MARL agents via a model-based approach,\nnamed c-MBA. Our proposed formulation can craft much stronger adversarial state\nperturbations of c-MARL agents to lower total team rewards than existing\nmodel-free approaches. In addition, we propose the first victim-agent selection\nstrategy and the first data-driven approach to define targeted failure states\nwhere each of them allows us to develop even stronger adversarial attack\nwithout the expert knowledge to the underlying environment. Our numerical\nexperiments on two representative MARL benchmarks illustrate the advantage of\nour approach over other baselines: our model-based attack consistently\noutperforms other baselines in all tested environments.\n","authors":["Nhan H. Pham","Lam M. Nguyen","Jie Chen","Hoang Thanh Lam","Subhro Das","Tsui-Wei Weng"],"pdf_url":"https://arxiv.org/pdf/2202.03558v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.05102v1","updated":"2023-09-10T18:12:52Z","published":"2023-09-10T18:12:52Z","title":"Is Learning in Biological Neural Networks based on Stochastic Gradient\n Descent? An analysis using stochastic processes","summary":" In recent years, there has been an intense debate about how learning in\nbiological neural networks (BNNs) differs from learning in artificial neural\nnetworks. It is often argued that the updating of connections in the brain\nrelies only on local information, and therefore a stochastic gradient-descent\ntype optimization method cannot be used. In this paper, we study a stochastic\nmodel for supervised learning in BNNs. We show that a (continuous) gradient\nstep occurs approximately when each learning opportunity is processed by many\nlocal updates. This result suggests that stochastic gradient descent may indeed\nplay a role in optimizing BNNs.\n","authors":["Sören Christensen","Jan Kallsen"],"pdf_url":"https://arxiv.org/pdf/2309.05102v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2302.09893v2","updated":"2023-09-10T18:06:56Z","published":"2023-02-20T10:40:29Z","title":"Efficient Generator of Mathematical Expressions for Symbolic Regression","summary":" We propose an approach to symbolic regression based on a novel variational\nautoencoder for generating hierarchical structures, HVAE. 
It combines simple\natomic units with shared weights to recursively encode and decode the\nindividual nodes in the hierarchy. Encoding is performed bottom-up and decoding\ntop-down. We empirically show that HVAE can be trained efficiently with small\ncorpora of mathematical expressions and can accurately encode expressions into\na smooth low-dimensional latent space. The latter can be efficiently explored\nwith various optimization methods to address the task of symbolic regression.\nIndeed, random search through the latent space of HVAE performs better than\nrandom search through expressions generated by manually crafted probabilistic\ngrammars for mathematical expressions. Finally, the EDHiE system for symbolic\nregression, which applies an evolutionary algorithm to the latent space of\nHVAE, reconstructs equations from a standard symbolic regression benchmark\nbetter than a state-of-the-art system based on a similar combination of deep\nlearning and evolutionary algorithms.\n","authors":["Sebastian Mežnar","Sašo Džeroski","Ljupčo Todorovski"],"pdf_url":"https://arxiv.org/pdf/2302.09893v2.pdf","comment":"35 pages, 11 tables, 7 multi-part figures, Machine learning\n (Springer) and journal track of ECML/PKDD 2023"},{"id":"http://arxiv.org/abs/2308.11241v2","updated":"2023-09-10T17:43:52Z","published":"2023-08-22T07:34:07Z","title":"An Effective Transformer-based Contextual Model and Temporal Gate\n Pooling for Speaker Identification","summary":" Wav2vec2 has achieved success in applying Transformer architecture and\nself-supervised learning to speech recognition. Recently, these have come to be\nused not only for speech recognition but also for the entire speech processing.\nThis paper introduces an effective end-to-end speaker identification model\napplying a Transformer-based contextual model. We explored the relationship\nbetween the hyper-parameters and the performance in order to discern the\nstructure of an effective model. Furthermore, we propose a pooling method,\nTemporal Gate Pooling, with powerful learning ability for speaker\nidentification. We applied Conformer as encoder and BEST-RQ for pre-training\nand conducted an evaluation utilizing the speaker identification of VoxCeleb1.\nThe proposed method has achieved an accuracy of 87.1% with 28.5M parameters,\ndemonstrating comparable precision to wav2vec2 with 317.7M parameters. Code is\navailable at https://github.com/HarunoriKawano/speaker-identification-with-tgp.\n","authors":["Harunori Kawano","Sota Shimizu"],"pdf_url":"https://arxiv.org/pdf/2308.11241v2.pdf","comment":"5 pages, 3 figures"},{"id":"http://arxiv.org/abs/2309.05092v1","updated":"2023-09-10T17:35:43Z","published":"2023-09-10T17:35:43Z","title":"Adaptive conformal classification with noisy labels","summary":" This paper develops novel conformal prediction methods for classification\ntasks that can automatically adapt to random label contamination in the\ncalibration sample, enabling more informative prediction sets with stronger\ncoverage guarantees compared to state-of-the-art approaches. This is made\npossible by a precise theoretical characterization of the effective coverage\ninflation (or deflation) suffered by standard conformal inferences in the\npresence of label contamination, which is then made actionable through new\ncalibration algorithms. 
Our solution is flexible and can leverage different\nmodeling assumptions about the label contamination process, while requiring no\nknowledge about the data distribution or the inner workings of the\nmachine-learning classifier. The advantages of the proposed methods are\ndemonstrated through extensive simulations and an application to object\nclassification with the CIFAR-10H image data set.\n","authors":["Matteo Sesia","Y. X. Rachel Wang","Xin Tong"],"pdf_url":"https://arxiv.org/pdf/2309.05092v1.pdf","comment":"35 pages (98 pages including references and appendices)"},{"id":"http://arxiv.org/abs/2309.01007v2","updated":"2023-09-10T17:10:05Z","published":"2023-09-02T19:02:50Z","title":"Comparative Analysis of Deep Learning Architectures for Breast Cancer\n Diagnosis Using the BreaKHis Dataset","summary":" Cancer is an extremely difficult and dangerous health problem because it\nmanifests in so many different ways and affects so many different organs and\ntissues. The primary goal of this research was to evaluate deep learning\nmodels' ability to correctly identify breast cancer cases using the BreakHis\ndataset. The BreakHis dataset covers a wide range of breast cancer subtypes\nthrough its huge collection of histopathological pictures. In this study, we\nuse and compare the performance of five well-known deep learning models for\ncancer classification: VGG, ResNet, Xception, Inception, and InceptionResNet.\nThe results placed the Xception model at the top, with an F1 score of 0.9 and\nan accuracy of 89%. At the same time, the Inception and InceptionResNet models\nboth hit accuracy of 87% . However, the F1 score for the Inception model was\n87, while that for the InceptionResNet model was 86. These results demonstrate\nthe importance of deep learning methods in making correct breast cancer\ndiagnoses. This highlights the potential to provide improved diagnostic\nservices to patients. The findings of this study not only improve current\nmethods of cancer diagnosis, but also make significant contributions to the\ncreation of new and improved cancer treatment strategies. In a nutshell, the\nresults of this study represent a major advancement in the direction of\nachieving these vital healthcare goals.\n","authors":["İrem Sayın","Muhammed Ali Soydaş","Yunus Emre Mert","Arda Yarkataş","Berk Ergun","Selma Sözen Yeh","Hüseyin Üvet"],"pdf_url":"https://arxiv.org/pdf/2309.01007v2.pdf","comment":"7 pages, 1 figure, 2 tables"},{"id":"http://arxiv.org/abs/2309.05079v1","updated":"2023-09-10T16:56:46Z","published":"2023-09-10T16:56:46Z","title":"A supervised generative optimization approach for tabular data","summary":" Synthetic data generation has emerged as a crucial topic for financial\ninstitutions, driven by multiple factors, such as privacy protection and data\naugmentation. Many algorithms have been proposed for synthetic data generation\nbut reaching the consensus on which method we should use for the specific data\nsets and use cases remains challenging. Moreover, the majority of existing\napproaches are ``unsupervised'' in the sense that they do not take into account\nthe downstream task. To address these issues, this work presents a novel\nsynthetic data generation framework. The framework integrates a supervised\ncomponent tailored to the specific downstream task and employs a meta-learning\napproach to learn the optimal mixture distribution of existing synthetic\ndistributions.\n","authors":["Fadi Hamad","Shinpei Nakamura-Sakai","Saheed Obitayo","Vamsi K. 
Potluru"],"pdf_url":"https://arxiv.org/pdf/2309.05079v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.05077v1","updated":"2023-09-10T16:55:59Z","published":"2023-09-10T16:55:59Z","title":"Generalization error bounds for iterative learning algorithms with\n bounded updates","summary":" This paper explores the generalization characteristics of iterative learning\nalgorithms with bounded updates for non-convex loss functions, employing\ninformation-theoretic techniques. Our key contribution is a novel bound for the\ngeneralization error of these algorithms with bounded updates, extending beyond\nthe scope of previous works that only focused on Stochastic Gradient Descent\n(SGD). Our approach introduces two main novelties: 1) we reformulate the mutual\ninformation as the uncertainty of updates, providing a new perspective, and 2)\ninstead of using the chaining rule of mutual information, we employ a variance\ndecomposition technique to decompose information across iterations, allowing\nfor a simpler surrogate process. We analyze our generalization bound under\nvarious settings and demonstrate improved bounds when the model dimension\nincreases at the same rate as the number of training data samples. To bridge\nthe gap between theory and practice, we also examine the previously observed\nscaling behavior in large language models. Ultimately, our work takes a further\nstep for developing practical generalization theories.\n","authors":["Jingwen Fu","Nanning Zheng"],"pdf_url":"https://arxiv.org/pdf/2309.05077v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.05072v1","updated":"2023-09-10T16:35:47Z","published":"2023-09-10T16:35:47Z","title":"Spatiotemporal Graph Neural Networks with Uncertainty Quantification for\n Traffic Incident Risk Prediction","summary":" Predicting traffic incident risks at granular spatiotemporal levels is\nchallenging. The datasets predominantly feature zero values, indicating no\nincidents, with sporadic high-risk values for severe incidents. Notably, a\nmajority of current models, especially deep learning methods, focus solely on\nestimating risk values, overlooking the uncertainties arising from the\ninherently unpredictable nature of incidents. To tackle this challenge, we\nintroduce the Spatiotemporal Zero-Inflated Tweedie Graph Neural Networks\n(STZITD-GNNs). Our model merges the reliability of traditional statistical\nmodels with the flexibility of graph neural networks, aiming to precisely\nquantify uncertainties associated with road-level traffic incident risks. This\nmodel strategically employs a compound model from the Tweedie family, as a\nPoisson distribution to model risk frequency and a Gamma distribution to\naccount for incident severity. Furthermore, a zero-inflated component helps to\nidentify the non-incident risk scenarios. As a result, the STZITD-GNNs\neffectively capture the dataset's skewed distribution, placing emphasis on\ninfrequent but impactful severe incidents. Empirical tests using real-world\ntraffic data from London, UK, demonstrate that our model excels beyond current\nbenchmarks. 
The forte of STZITD-GNN resides not only in its accuracy but also\nin its adeptness at curtailing uncertainties, delivering robust predictions\nover short (7 days) and extended (14 days) timeframes.\n","authors":["Xiaowei Gao","Xinke Jiang","Dingyi Zhuang","Huanfa Chen","Shenhao Wang","James Haworth"],"pdf_url":"https://arxiv.org/pdf/2309.05072v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.05067v1","updated":"2023-09-10T16:18:49Z","published":"2023-09-10T16:18:49Z","title":"Mutation-based Fault Localization of Deep Neural Networks","summary":" Deep neural networks (DNNs) are susceptible to bugs, just like other types of\nsoftware systems. A significant uptick in using DNN, and its applications in\nwide-ranging areas, including safety-critical systems, warrant extensive\nresearch on software engineering tools for improving the reliability of\nDNN-based systems. One such tool that has gained significant attention in the\nrecent years is DNN fault localization. This paper revisits mutation-based\nfault localization in the context of DNN models and proposes a novel technique,\nnamed deepmufl, applicable to a wide range of DNN models. We have implemented\ndeepmufl and have evaluated its effectiveness using 109 bugs obtained from\nStackOverflow. Our results show that deepmufl detects 53/109 of the bugs by\nranking the buggy layer in top-1 position, outperforming state-of-the-art\nstatic and dynamic DNN fault localization systems that are also designed to\ntarget the class of bugs supported by deepmufl. Moreover, we observed that we\ncan halve the fault localization time for a pre-trained model using mutation\nselection, yet losing only 7.55% of the bugs localized in top-1 position.\n","authors":["Ali Ghanbari","Deepak-George Thomas","Muhammad Arbab Arshad","Hridesh Rajan"],"pdf_url":"https://arxiv.org/pdf/2309.05067v1.pdf","comment":"38th IEEE/ACM International Conference on Automated Software\n Engineering (ASE 2023)"},{"id":"http://arxiv.org/abs/2309.05063v1","updated":"2023-09-10T16:09:02Z","published":"2023-09-10T16:09:02Z","title":"Federated Learning Incentive Mechanism under Buyers' Auction Market","summary":" Auction-based Federated Learning (AFL) enables open collaboration among\nself-interested data consumers and data owners. Existing AFL approaches are\ncommonly under the assumption of sellers' market in that the service clients as\nsellers are treated as scarce resources so that the aggregation servers as\nbuyers need to compete the bids. Yet, as the technology progresses, an\nincreasing number of qualified clients are now capable of performing federated\nlearning tasks, leading to shift from sellers' market to a buyers' market. In\nthis paper, we shift the angle by adapting the procurement auction framework,\naiming to explain the pricing behavior under buyers' market. Our modeling\nstarts with basic setting under complete information, then move further to the\nscenario where sellers' information are not fully observable. In order to\nselect clients with high reliability and data quality, and to prevent from\nexternal attacks, we utilize a blockchain-based reputation mechanism. 
The\nexperimental results validate the effectiveness of our approach.\n","authors":["Jiaxi Yang","Zihao Guo","Sheng Cao","Cuifang Zhao","Li-Chuan Tsai"],"pdf_url":"https://arxiv.org/pdf/2309.05063v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.05062v1","updated":"2023-09-10T16:07:18Z","published":"2023-09-10T16:07:18Z","title":"Machine Learning for maximizing the memristivity of single and coupled\n quantum memristors","summary":" We propose machine learning (ML) methods to characterize the memristive\nproperties of single and coupled quantum memristors. We show that maximizing\nthe memristivity leads to large values in the degree of entanglement of two\nquantum memristors, unveiling the close relationship between quantum\ncorrelations and memory. Our results strengthen the possibility of using\nquantum memristors as key components of neuromorphic quantum computing.\n","authors":["Carlos Hernani-Morales","Gabriel Alvarado","Francisco Albarrán-Arriagada","Yolanda Vives-Gilabert","Enrique Solano","José D. Martín-Guerrero"],"pdf_url":"https://arxiv.org/pdf/2309.05062v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2305.07247v4","updated":"2023-09-10T15:38:31Z","published":"2023-05-12T04:39:01Z","title":"Provably Convergent Schrödinger Bridge with Applications to\n Probabilistic Time Series Imputation","summary":" The Schr\\\"odinger bridge problem (SBP) is gaining increasing attention in\ngenerative modeling and showing promising potential even in comparison with the\nscore-based generative models (SGMs). SBP can be interpreted as an\nentropy-regularized optimal transport problem, which conducts projections onto\nevery other marginal alternatingly. However, in practice, only approximated\nprojections are accessible and their convergence is not well understood. To\nfill this gap, we present a first convergence analysis of the Schr\\\"odinger\nbridge algorithm based on approximated projections. As for its practical\napplications, we apply SBP to probabilistic time series imputation by\ngenerating missing values conditioned on observed data. We show that optimizing\nthe transport cost improves the performance and the proposed algorithm achieves\nthe state-of-the-art result in healthcare and environmental data while\nexhibiting the advantage of exploring both temporal and feature patterns in\nprobabilistic time series imputation.\n","authors":["Yu Chen","Wei Deng","Shikai Fang","Fengpei Li","Nicole Tianjiao Yang","Yikai Zhang","Kashif Rasul","Shandian Zhe","Anderson Schneider","Yuriy Nevmyvaka"],"pdf_url":"https://arxiv.org/pdf/2305.07247v4.pdf","comment":"Accepted by ICML 2023"},{"id":"http://arxiv.org/abs/2211.01120v2","updated":"2023-09-10T14:29:22Z","published":"2022-11-02T13:54:07Z","title":"Variational Hierarchical Mixtures for Probabilistic Learning of Inverse\n Dynamics","summary":" Well-calibrated probabilistic regression models are a crucial learning\ncomponent in robotics applications as datasets grow rapidly and tasks become\nmore complex. Unfortunately, classical regression models are usually either\nprobabilistic kernel machines with a flexible structure that does not scale\ngracefully with data or deterministic and vastly scalable automata, albeit with\na restrictive parametric form and poor regularization. In this paper, we\nconsider a probabilistic hierarchical modeling paradigm that combines the\nbenefits of both worlds to deliver computationally efficient representations\nwith inherent complexity regularization. 
The presented approaches are\nprobabilistic interpretations of local regression techniques that approximate\nnonlinear functions through a set of local linear or polynomial units.\nImportantly, we rely on principles from Bayesian nonparametrics to formulate\nflexible models that adapt their complexity to the data and can potentially\nencompass an infinite number of components. We derive two efficient variational\ninference techniques to learn these representations and highlight the\nadvantages of hierarchical infinite local regression models, such as dealing\nwith non-smooth functions, mitigating catastrophic forgetting, and enabling\nparameter sharing and fast predictions. Finally, we validate this approach on\nlarge inverse dynamics datasets and test the learned models in real-world\ncontrol scenarios.\n","authors":["Hany Abdulsamad","Peter Nickl","Pascal Klink","Jan Peters"],"pdf_url":"https://arxiv.org/pdf/2211.01120v2.pdf","comment":"arXiv admin note: text overlap with arXiv:2011.05217"},{"id":"http://arxiv.org/abs/2303.07925v8","updated":"2023-09-10T12:58:47Z","published":"2023-03-14T14:10:37Z","title":"Deep incremental learning models for financial temporal tabular datasets\n with distribution shifts","summary":" We present a robust deep incremental learning framework for regression tasks\non financial temporal tabular datasets which is built upon the incremental use\nof commonly available tabular and time series prediction models to adapt to\ndistributional shifts typical of financial datasets. The framework uses a\nsimple basic building block (decision trees) to build self-similar models of\nany required complexity to deliver robust performance under adverse situations\nsuch as regime changes, fat-tailed distributions, and low signal-to-noise\nratios. As a detailed study, we demonstrate our scheme using XGBoost models\ntrained on the Numerai dataset and show that a two layer deep ensemble of\nXGBoost models over different model snapshots delivers high quality predictions\nunder different market regimes. We also show that the performance of XGBoost\nmodels with different number of boosting rounds in three scenarios (small,\nstandard and large) is monotonically increasing with respect to model size and\nconverges towards the generalisation upper bound. We also evaluate the\nrobustness of the model under variability of different hyperparameters, such as\nmodel complexity and data sampling settings. Our model has low hardware\nrequirements as no specialised neural architectures are used and each base\nmodel can be independently trained in parallel.\n","authors":["Thomas Wong","Mauricio Barahona"],"pdf_url":"https://arxiv.org/pdf/2303.07925v8.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.05019v1","updated":"2023-09-10T12:44:54Z","published":"2023-09-10T12:44:54Z","title":"SA-Solver: Stochastic Adams Solver for Fast Sampling of Diffusion Models","summary":" Diffusion Probabilistic Models (DPMs) have achieved considerable success in\ngeneration tasks. As sampling from DPMs is equivalent to solving diffusion SDE\nor ODE which is time-consuming, numerous fast sampling methods built upon\nimproved differential equation solvers are proposed. The majority of such\ntechniques consider solving the diffusion ODE due to its superior efficiency.\nHowever, stochastic sampling could offer additional advantages in generating\ndiverse and high-quality data. 
In this work, we engage in a comprehensive\nanalysis of stochastic sampling from two aspects: variance-controlled diffusion\nSDE and linear multi-step SDE solver. Based on our analysis, we propose\nSA-Solver, which is an improved efficient stochastic Adams method for solving\ndiffusion SDE to generate data with high quality. Our experiments show that\nSA-Solver achieves: 1) improved or comparable performance compared with the\nexisting state-of-the-art sampling methods for few-step sampling; 2) SOTA FID\nscores on substantial benchmark datasets under a suitable number of function\nevaluations (NFEs).\n","authors":["Shuchen Xue","Mingyang Yi","Weijian Luo","Shifeng Zhang","Jiacheng Sun","Zhenguo Li","Zhi-Ming Ma"],"pdf_url":"https://arxiv.org/pdf/2309.05019v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2306.13793v3","updated":"2023-09-10T11:47:21Z","published":"2023-06-23T21:40:24Z","title":"QNNRepair: Quantized Neural Network Repair","summary":" We present QNNRepair, the first method in the literature for repairing\nquantized neural networks (QNNs). QNNRepair aims to improve the accuracy of a\nneural network model after quantization. It accepts the full-precision and\nweight-quantized neural networks and a repair dataset of passing and failing\ntests. At first, QNNRepair applies a software fault localization method to\nidentify the neurons that cause performance degradation during neural network\nquantization. Then, it formulates the repair problem into a linear programming\nproblem of solving neuron weights parameters, which corrects the QNN's\nperformance on failing tests while not compromising its performance on passing\ntests. We evaluate QNNRepair with widely used neural network architectures such\nas MobileNetV2, ResNet, and VGGNet on popular datasets, including\nhigh-resolution images. We also compare QNNRepair with the state-of-the-art\ndata-free quantization method SQuant. According to the experiment results, we\nconclude that QNNRepair is effective in improving the quantized model's\nperformance in most cases. Its repaired models have 24% higher accuracy than\nSQuant's in the independent validation set, especially for the ImageNet\ndataset.\n","authors":["Xidan Song","Youcheng Sun","Mustafa A. Mustafa","Lucas C. Cordeiro"],"pdf_url":"https://arxiv.org/pdf/2306.13793v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2303.03787v2","updated":"2023-09-10T11:41:42Z","published":"2023-03-07T10:48:20Z","title":"Sample-efficient Real-time Planning with Curiosity Cross-Entropy Method\n and Contrastive Learning","summary":" Model-based reinforcement learning (MBRL) with real-time planning has shown\ngreat potential in locomotion and manipulation control tasks. However, the\nexisting planning methods, such as the Cross-Entropy Method (CEM), do not scale\nwell to complex high-dimensional environments. One of the key reasons for\nunderperformance is the lack of exploration, as these planning methods only aim\nto maximize the cumulative extrinsic reward over the planning horizon.\nFurthermore, planning inside the compact latent space in the absence of\nobservations makes it challenging to use curiosity-based intrinsic motivation.\nWe propose Curiosity CEM (CCEM), an improved version of the CEM algorithm for\nencouraging exploration via curiosity. Our proposed method maximizes the sum of\nstate-action Q values over the planning horizon, in which these Q values\nestimate the future extrinsic and intrinsic reward, hence encouraging reaching\nnovel observations. 
In addition, our model uses contrastive representation\nlearning to efficiently learn latent representations. Experiments on\nimage-based continuous control tasks from the DeepMind Control suite show that\nCCEM is by a large margin more sample-efficient than previous MBRL algorithms\nand compares favorably with the best model-free RL methods.\n","authors":["Mostafa Kotb","Cornelius Weber","Stefan Wermter"],"pdf_url":"https://arxiv.org/pdf/2303.03787v2.pdf","comment":"7 pages, 4 figures"},{"id":"http://arxiv.org/abs/2309.04980v1","updated":"2023-09-10T10:08:52Z","published":"2023-09-10T10:08:52Z","title":"Linear Speedup of Incremental Aggregated Gradient Methods on Streaming\n Data","summary":" This paper considers a type of incremental aggregated gradient (IAG) method\nfor large-scale distributed optimization. The IAG method is well suited for the\nparameter server architecture as the latter can easily aggregate potentially\nstale gradients contributed by workers. Although the convergence of IAG in the\ncase of deterministic gradients is well known, there are only a few results for\nthe case of its stochastic variant based on streaming data. Considering\nstrongly convex optimization, this paper shows that the streaming IAG method\nachieves linear speedup when the workers are updating frequently enough, even\nif the data sample distributions across workers are heterogeneous. We show that\nthe expected squared distance to the optimal solution decays at $O((1+T)/(nt))$,\nwhere $n$ is the number of workers, $t$ is the iteration number, and $T/n$ is the\nupdate frequency of workers. Our analysis involves careful treatments of the\nconditional expectations with stale gradients and a recursive system with both\ndelayed and noise terms, which are new to the analysis of IAG-type algorithms.\nNumerical results are presented to verify our findings.\n","authors":["Xiaolu Wang","Cheng Jin","Hoi-To Wai","Yuantao Gu"],"pdf_url":"https://arxiv.org/pdf/2309.04980v1.pdf","comment":"8 pages, 3 figures"},{"id":"http://arxiv.org/abs/2309.04976v1","updated":"2023-09-10T09:40:20Z","published":"2023-09-10T09:40:20Z","title":"AVARS -- Alleviating Unexpected Urban Road Traffic Congestion using UAVs","summary":" Reducing unexpected urban traffic congestion caused by en-route events (e.g.,\nroad closures, car crashes, etc.) often requires fast and accurate reactions to\nchoose the best-fit traffic signals. Traditional traffic light control systems,\nsuch as SCATS and SCOOT, are not efficient as their traffic data provided by\ninduction loops has a low update frequency (i.e., longer than 1 minute).\nMoreover, the traffic light signal plans used by these systems are selected\nfrom a limited set of candidate plans pre-programmed prior to unexpected\nevents' occurrence. Recent research demonstrates that camera-based traffic\nlight systems controlled by deep reinforcement learning (DRL) algorithms are\nmore effective in reducing traffic congestion, in which the cameras can provide\nhigh-frequency high-resolution traffic data. However, these systems are costly\nto deploy in big cities due to the excessive potential upgrades required to\nroad infrastructure. In this paper, we argue that Unmanned Aerial Vehicles\n(UAVs) can play a crucial role in dealing with unexpected traffic congestion\nbecause UAVs with onboard cameras can be economically deployed when and where\nunexpected congestion occurs. 
Then, we propose a system called \"AVARS\" that\nexplores the potential of using UAVs to reduce unexpected urban traffic\ncongestion using DRL-based traffic light signal control. This approach is\nvalidated on a widely used open-source traffic simulator with practical UAV\nsettings, including its traffic monitoring ranges and battery lifetime. Our\nsimulation results show that AVARS can effectively recover the unexpected\ntraffic congestion in Dublin, Ireland, back to its original un-congested level\nwithin the typical battery life duration of a UAV.\n","authors":["Jiaying Guo","Michael R. Jones","Soufiene Djahel","Shen Wang"],"pdf_url":"https://arxiv.org/pdf/2309.04976v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.04974v1","updated":"2023-09-10T09:32:35Z","published":"2023-09-10T09:32:35Z","title":"Continual Robot Learning using Self-Supervised Task Inference","summary":" Endowing robots with the human ability to learn a growing set of skills over\nthe course of a lifetime as opposed to mastering single tasks is an open\nproblem in robot learning. While multi-task learning approaches have been\nproposed to address this problem, they pay little attention to task inference.\nIn order to continually learn new tasks, the robot first needs to infer the\ntask at hand without requiring predefined task representations. In this paper,\nwe propose a self-supervised task inference approach. Our approach learns\naction and intention embeddings from self-organization of the observed movement\nand effect parts of unlabeled demonstrations and a higher-level behavior\nembedding from self-organization of the joint action-intention embeddings. We\nconstruct a behavior-matching self-supervised learning objective to train a\nnovel Task Inference Network (TINet) to map an unlabeled demonstration to its\nnearest behavior embedding, which we use as the task representation. A\nmulti-task policy is built on top of the TINet and trained with reinforcement\nlearning to optimize performance over tasks. We evaluate our approach in the\nfixed-set and continual multi-task learning settings with a humanoid robot and\ncompare it to different multi-task learning baselines. The results show that\nour approach outperforms the other baselines, with the difference being more\npronounced in the challenging continual learning setting, and can infer tasks\nfrom incomplete demonstrations. Our approach is also shown to generalize to\nunseen tasks based on a single demonstration in one-shot task generalization\nexperiments.\n","authors":["Muhammad Burhan Hafez","Stefan Wermter"],"pdf_url":"https://arxiv.org/pdf/2309.04974v1.pdf","comment":"Accepted for publication in IEEE Transactions on Cognitive and\n Developmental Systems"},{"id":"http://arxiv.org/abs/2309.04968v1","updated":"2023-09-10T09:03:53Z","published":"2023-09-10T09:03:53Z","title":"LMBiS-Net: A Lightweight Multipath Bidirectional Skip Connection based\n CNN for Retinal Blood Vessel Segmentation","summary":" Blinding eye diseases are often correlated with altered retinal morphology,\nwhich can be clinically identified by segmenting retinal structures in fundus\nimages. However, current methodologies often fall short in accurately\nsegmenting delicate vessels. Although deep learning has shown promise in\nmedical image segmentation, its reliance on repeated convolution and pooling\noperations can hinder the representation of edge information, ultimately\nlimiting overall segmentation accuracy. 
In this paper, we propose a lightweight\npixel-level CNN named LMBiS-Net for the segmentation of retinal vessels with an\nexceptionally low number of learnable parameters \\textbf{(only 0.172 M)}. The\nnetwork used multipath feature extraction blocks and incorporates bidirectional\nskip connections for the information flow between the encoder and decoder.\nAdditionally, we have optimized the efficiency of the model by carefully\nselecting the number of filters to avoid filter overlap. This optimization\nsignificantly reduces training time and enhances computational efficiency. To\nassess the robustness and generalizability of LMBiS-Net, we performed\ncomprehensive evaluations on various aspects of retinal images. Specifically,\nthe model was subjected to rigorous tests to accurately segment retinal\nvessels, which play a vital role in ophthalmological diagnosis and treatment.\nBy focusing on the retinal blood vessels, we were able to thoroughly analyze\nthe performance and effectiveness of the LMBiS-Net model. The results of our\ntests demonstrate that LMBiS-Net is not only robust and generalizable but also\ncapable of maintaining high levels of segmentation accuracy. These\ncharacteristics highlight the potential of LMBiS-Net as an efficient tool for\nhigh-speed and accurate segmentation of retinal images in various clinical\napplications.\n","authors":["Mufassir M. Abbasi","Shahzaib Iqbal","Asim Naveed","Tariq M. Khan","Syed S. Naqvi","Wajeeha Khalid"],"pdf_url":"https://arxiv.org/pdf/2309.04968v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.04949v1","updated":"2023-09-10T07:10:31Z","published":"2023-09-10T07:10:31Z","title":"A multiple k-means cluster ensemble framework for clustering citation\n trajectories","summary":" Citation maturity time varies for different articles. However, the impact of\nall articles is measured in a fixed window. Clustering their citation\ntrajectories helps understand the knowledge diffusion process and reveals that\nnot all articles gain immediate success after publication. Moreover, clustering\ntrajectories is necessary for paper impact recommendation algorithms. It is a\nchallenging problem because citation time series exhibit significant\nvariability due to non linear and non stationary characteristics. Prior works\npropose a set of arbitrary thresholds and a fixed rule based approach. All\nmethods are primarily parameter dependent. Consequently, it leads to\ninconsistencies while defining similar trajectories and ambiguities regarding\ntheir specific number. Most studies only capture extreme trajectories. Thus, a\ngeneralised clustering framework is required. This paper proposes a feature\nbased multiple k means cluster ensemble framework. 1,95,783 and 41,732 well\ncited articles from the Microsoft Academic Graph data are considered for\nclustering short term (10 year) and long term (30 year) trajectories,\nrespectively. It has linear run time. Four distinct trajectories are obtained\nEarly Rise Rapid Decline (2.2%), Early Rise Slow Decline (45%), Delayed Rise No\nDecline (53%), and Delayed Rise Slow Decline (0.8%). Individual trajectory\ndifferences for two different spans are studied. Most papers exhibit Early Rise\nSlow Decline and Delayed Rise No Decline patterns. The growth and decay times,\ncumulative citation distribution, and peak characteristics of individual\ntrajectories are redefined empirically. 
A detailed comparative study reveals\nour proposed methodology can detect all distinct trajectory classes.\n","authors":["Joyita Chakraborty","Dinesh K. Pradhan","Subrata Nandi"],"pdf_url":"https://arxiv.org/pdf/2309.04949v1.pdf","comment":"29 pages"},{"id":"http://arxiv.org/abs/2309.04941v1","updated":"2023-09-10T06:13:29Z","published":"2023-09-10T06:13:29Z","title":"Distance-Restricted Folklore Weisfeiler-Leman GNNs with Provable Cycle\n Counting Power","summary":" The ability of graph neural networks (GNNs) to count certain graph\nsubstructures, especially cycles, is important for the success of GNNs on a\nwide range of tasks. It has been recently used as a popular metric for\nevaluating the expressive power of GNNs. Many of the proposed GNN models with\nprovable cycle counting power are based on subgraph GNNs, i.e., extracting a\nbag of subgraphs from the input graph, generating representations for each\nsubgraph, and using them to augment the representation of the input graph.\nHowever, those methods require heavy preprocessing, and suffer from high time\nand memory costs. In this paper, we overcome the aforementioned limitations of\nsubgraph GNNs by proposing a novel class of GNNs -- $d$-Distance-Restricted\nFWL(2) GNNs, or $d$-DRFWL(2) GNNs. $d$-DRFWL(2) GNNs use node pairs whose\nmutual distances are at most $d$ as the units for message passing to balance\nthe expressive power and complexity. By performing message passing among\ndistance-restricted node pairs in the original graph, $d$-DRFWL(2) GNNs avoid\nthe expensive subgraph extraction operations in subgraph GNNs, making both the\ntime and space complexity lower. We theoretically show that the discriminative\npower of $d$-DRFWL(2) GNNs strictly increases as $d$ increases. More\nimportantly, $d$-DRFWL(2) GNNs have provably strong cycle counting power even\nwith $d=2$: they can count all 3, 4, 5, 6-cycles. Since 6-cycles (e.g., benzene\nrings) are ubiquitous in organic molecules, being able to detect and count them\nis crucial for achieving robust and generalizable performance on molecular\ntasks. Experiments on both synthetic datasets and molecular datasets verify our\ntheory. To the best of our knowledge, our model is the most efficient GNN model\nto date (both theoretically and empirically) that can count up to 6-cycles.\n","authors":["Junru Zhou","Jiarui Feng","Xiyuan Wang","Muhan Zhang"],"pdf_url":"https://arxiv.org/pdf/2309.04941v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2302.13156v2","updated":"2023-09-10T05:47:11Z","published":"2023-02-25T20:54:02Z","title":"Why Do Facial Deepfake Detectors Fail?","summary":" Recent rapid advancements in deepfake technology have allowed the creation of\nhighly realistic fake media, such as video, image, and audio. These materials\npose significant challenges to human authentication, such as impersonation,\nmisinformation, or even a threat to national security. To keep pace with these\nrapid advancements, several deepfake detection algorithms have been proposed,\nleading to an ongoing arms race between deepfake creators and deepfake\ndetectors. Nevertheless, these detectors are often unreliable and frequently\nfail to detect deepfakes. This study highlights the challenges they face in\ndetecting deepfakes, including (1) the pre-processing pipeline of artifacts and\n(2) the fact that generators of new, unseen deepfake samples have not been\nconsidered when building the defense models. 
Our work sheds light on the need\nfor further research and development in this field to create more robust and\nreliable detectors.\n","authors":["Binh Le","Shahroz Tariq","Alsharif Abuadbba","Kristen Moore","Simon Woo"],"pdf_url":"https://arxiv.org/pdf/2302.13156v2.pdf","comment":"5 pages, ACM ASIACCS 2023"},{"id":"http://arxiv.org/abs/2309.03808v2","updated":"2023-09-10T05:25:26Z","published":"2023-09-07T16:01:47Z","title":"Improved theoretical guarantee for rank aggregation via spectral method","summary":" Given pairwise comparisons between multiple items, how to rank them so that\nthe ranking matches the observations? This problem, known as rank aggregation,\nhas found many applications in sports, recommendation systems, and other web\napplications. As it is generally NP-hard to find a global ranking that\nminimizes the mismatch (known as the Kemeny optimization), we focus on the\nErd\\\"os-R\\'enyi outliers (ERO) model for this ranking problem. Here, each\npairwise comparison is a corrupted copy of the true score difference. We\ninvestigate spectral ranking algorithms that are based on unnormalized and\nnormalized data matrices. The key is to understand their performance in\nrecovering the underlying scores of each item from the observed data. This\nreduces to deriving an entry-wise perturbation error bound between the top\neigenvectors of the unnormalized/normalized data matrix and its population\ncounterpart. By using the leave-one-out technique, we provide a sharper\n$\\ell_{\\infty}$-norm perturbation bound of the eigenvectors and also derive an\nerror bound on the maximum displacement for each item, with only $\\Omega(n\\log\nn)$ samples. Our theoretical analysis improves upon the state-of-the-art\nresults in terms of sample complexity, and our numerical experiments confirm\nthese theoretical findings.\n","authors":["Ziliang Samuel Zhong","Shuyang Ling"],"pdf_url":"https://arxiv.org/pdf/2309.03808v2.pdf","comment":"29 pages, 6 figures"},{"id":"http://arxiv.org/abs/2303.14612v2","updated":"2023-09-10T01:58:54Z","published":"2023-03-26T03:20:44Z","title":"Deepfake in the Metaverse: Security Implications for Virtual Gaming,\n Meetings, and Offices","summary":" The metaverse has gained significant attention from various industries due to\nits potential to create a fully immersive and interactive virtual world.\nHowever, the integration of deepfakes in the metaverse brings serious security\nimplications, particularly with regard to impersonation. This paper examines\nthe security implications of deepfakes in the metaverse, specifically in the\ncontext of gaming, online meetings, and virtual offices. The paper discusses\nhow deepfakes can be used to impersonate in gaming scenarios, how online\nmeetings in the metaverse open the door for impersonation, and how virtual\noffices in the metaverse lack physical authentication, making it easier for\nattackers to impersonate someone. The implications of these security concerns\nare discussed in relation to the confidentiality, integrity, and availability\n(CIA) triad. The paper further explores related issues such as the darkverse,\nand digital cloning, as well as regulatory and privacy concerns associated with\naddressing security threats in the virtual world.\n","authors":["Shahroz Tariq","Alsharif Abuadbba","Kristen Moore"],"pdf_url":"https://arxiv.org/pdf/2303.14612v2.pdf","comment":"3 pages. 
Published to ACM ASIACCS 2023 workshop - The 2nd security\n implications of Deepfakes and Cheapfakes"},{"id":"http://arxiv.org/abs/2309.04911v1","updated":"2023-09-10T01:52:23Z","published":"2023-09-10T01:52:23Z","title":"A Review of Machine Learning-based Security in Cloud Computing","summary":" Cloud Computing (CC) is revolutionizing the way IT resources are delivered to\nusers, allowing them to access and manage their systems with increased\ncost-effectiveness and simplified infrastructure. However, with the growth of\nCC comes a host of security risks, including threats to availability,\nintegrity, and confidentiality. To address these challenges, Machine Learning\n(ML) is increasingly being used by Cloud Service Providers (CSPs) to reduce the\nneed for human intervention in identifying and resolving security issues. With\nthe ability to analyze vast amounts of data, and make high-accuracy\npredictions, ML can transform the way CSPs approach security. In this paper, we\nwill explore some of the most recent research in the field of ML-based security\nin Cloud Computing. We will examine the features and effectiveness of a range\nof ML algorithms, highlighting their unique strengths and potential\nlimitations. Our goal is to provide a comprehensive overview of the current\nstate of ML in cloud security and to shed light on the exciting possibilities\nthat this emerging field has to offer.\n","authors":["Aptin Babaei","Parham M. Kebria","Mohsen Moradi Dalvand","Saeid Nahavandi"],"pdf_url":"https://arxiv.org/pdf/2309.04911v1.pdf","comment":"This work has been submitted to the IEEE for possible publication.\n Copyright may be transferred without notice, after which this version may no\n longer be accessible"},{"id":"http://arxiv.org/abs/2308.13066v2","updated":"2023-09-10T00:52:57Z","published":"2023-08-24T20:22:22Z","title":"Objective-Agnostic Enhancement of Molecule Properties via Multi-Stage\n VAE","summary":" Variational autoencoder (VAE) is a popular method for drug discovery and\nvarious architectures and pipelines have been proposed to improve its\nperformance. However, VAE approaches are known to suffer from poor manifold\nrecovery when the data lie on a low-dimensional manifold embedded in a higher\ndimensional ambient space [Dai and Wipf, 2019]. The consequences of it in drug\ndiscovery are somewhat under-explored. In this paper, we explore applying a\nmulti-stage VAE approach, that can improve manifold recovery on a synthetic\ndataset, to the field of drug discovery. We experimentally evaluate our\nmulti-stage VAE approach using the ChEMBL dataset and demonstrate its ability\nto improve the property statistics of generated molecules substantially from\npre-existing methods without incorporating property predictors into the\ntraining pipeline. We further fine-tune our models on two curated and much\nsmaller molecule datasets that target different proteins. Our experiments show\nan increase in the number of active molecules generated by the multi-stage VAE\nin comparison to their one-stage equivalent. 
For each of the two tasks, our\nbaselines include methods that use learned property predictors to incorporate\ntarget metrics directly into the training objective and we discuss\ncomplications that arise with this methodology.\n","authors":["Chenghui Zhou","Barnabas Poczos"],"pdf_url":"https://arxiv.org/pdf/2308.13066v2.pdf","comment":"arXiv admin note: text overlap with arXiv:2212.02750"},{"id":"http://arxiv.org/abs/2110.04763v2","updated":"2023-09-10T00:37:05Z","published":"2021-10-10T11:21:08Z","title":"Fat-Shattering Dimension of $k$-fold Aggregations","summary":" We provide estimates on the fat-shattering dimension of aggregation rules of\nreal-valued function classes. The latter consists of all ways of choosing $k$\nfunctions, one from each of the $k$ classes, and computing a pointwise function\nof them, such as the median, mean, and maximum. The bound is stated in terms of\nthe fat-shattering dimensions of the component classes. For linear and affine\nfunction classes, we provide a considerably sharper upper bound and a matching\nlower bound, achieving, in particular, an optimal dependence on $k$. Along the\nway, we improve several known results in addition to pointing out and\ncorrecting a number of erroneous claims in the literature.\n","authors":["Idan Attias","Aryeh Kontorovich"],"pdf_url":"https://arxiv.org/pdf/2110.04763v2.pdf","comment":null}],"Multimedia":[{"id":"http://arxiv.org/abs/2309.05091v1","updated":"2023-09-10T17:34:40Z","published":"2023-09-10T17:34:40Z","title":"SpeechMirror: A Multimodal Visual Analytics System for Personalized\n Reflection of Online Public Speaking Effectiveness","summary":" As communications are increasingly taking place virtually, the ability to\npresent well online is becoming an indispensable skill. Online speakers are\nfacing unique challenges in engaging with remote audiences. However, there has\nbeen a lack of evidence-based analytical systems for people to comprehensively\nevaluate online speeches and further discover possibilities for improvement.\nThis paper introduces SpeechMirror, a visual analytics system facilitating\nreflection on a speech based on insights from a collection of online speeches.\nThe system estimates the impact of different speech techniques on effectiveness\nand applies them to a speech to give users awareness of the performance of\nspeech techniques. A similarity recommendation approach based on speech factors\nor script content supports guided exploration to expand knowledge of\npresentation evidence and accelerate the discovery of speech delivery\npossibilities. SpeechMirror provides intuitive visualizations and interactions\nfor users to understand speech factors. Among them, SpeechTwin, a novel\nmultimodal visual summary of speech, supports rapid understanding of critical\nspeech factors and comparison of different speech samples, and SpeechPlayer\naugments the speech video by integrating visualization of the speaker's body\nlanguage with interaction, for focused analysis. The system utilizes\nvisualizations suited to the distinct nature of different speech factors for\nuser comprehension. 
The proposed system and visualization techniques were\nevaluated with domain experts and amateurs, demonstrating usability for users\nwith low visualization literacy and its efficacy in assisting users to develop\ninsights for potential improvement.\n","authors":["Zeyuan Huang","Qiang He","Kevin Maher","Xiaoming Deng","Yu-Kun Lai","Cuixia Ma","Sheng-feng Qin","Yong-Jin Liu","Hongan Wang"],"pdf_url":"https://arxiv.org/pdf/2309.05091v1.pdf","comment":"Main paper (11 pages, 6 figures) and Supplemental document (11 pages,\n 11 figures). Accepted by VIS 2023"},{"id":"http://arxiv.org/abs/2309.05058v1","updated":"2023-09-10T15:52:56Z","published":"2023-09-10T15:52:56Z","title":"Multimodal Fish Feeding Intensity Assessment in Aquaculture","summary":" Fish feeding intensity assessment (FFIA) aims to evaluate the intensity\nchange of fish appetite during the feeding process, which is vital in\nindustrial aquaculture applications. The main challenges surrounding FFIA are\ntwo-fold. 1) robustness: existing work has mainly leveraged single-modality\n(e.g., vision, audio) methods, which have a high sensitivity to input noise. 2)\nefficiency: FFIA models are generally expected to be employed on devices. This\npresents a challenge in terms of computational efficiency. In this work, we\nfirst introduce an audio-visual dataset, called AV-FFIA. AV-FFIA consists of\n27,000 labeled audio and video clips that capture different levels of fish\nfeeding intensity. To our knowledge, AV-FFIA is the first large-scale\nmultimodal dataset for FFIA research. Then, we introduce a multi-modal approach\nfor FFIA by leveraging single-modality pre-trained models and modality-fusion\nmethods, with benchmark studies on AV-FFIA. Our experimental results indicate\nthat the multi-modal approach substantially outperforms the single-modality\nbased approach, especially in noisy environments. While multimodal approaches\nprovide a performance gain for FFIA, they inherently increase the computational\ncost. To overcome this issue, we further present a novel unified model, termed\nU-FFIA. U-FFIA is a single model capable of processing audio, visual, or\naudio-visual modalities, by leveraging modality dropout during training and\nknowledge distillation from single-modality pre-trained models. We demonstrate\nthat U-FFIA can achieve performance better than or on par with the\nstate-of-the-art modality-specific FFIA models, with significantly lower\ncomputational overhead. Our proposed U-FFIA approach enables a more robust and\nefficient method for FFIA, with the potential to contribute to improved\nmanagement practices and sustainability in aquaculture.\n","authors":["Meng Cui","Xubo Liu","Haohe Liu","Zhuangzhuang Du","Tao Chen","Guoping Lian","Daoliang Li","Wenwu Wang"],"pdf_url":"https://arxiv.org/pdf/2309.05058v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.05026v1","updated":"2023-09-10T13:40:15Z","published":"2023-09-10T13:40:15Z","title":"Spatial Perceptual Quality Aware Adaptive Volumetric Video Streaming","summary":" Volumetric video offers a highly immersive viewing experience, but poses\nchallenges in ensuring quality of experience (QoE) due to its high bandwidth\nrequirements. In this paper, we explore the effect of viewing distance\nintroduced by six degrees of freedom (6DoF) spatial navigation on the user's\nperceived quality. 
By considering human visual resolution limitations, we\npropose a visual acuity model that describes the relationship between the\nvirtual viewing distance and the tolerable boundary point cloud density. The\nproposed model satisfies spatial visual requirements during 6DoF exploration.\nAdditionally, it dynamically adjusts quality levels to balance perceptual\nquality and bandwidth consumption. Furthermore, we present a QoE model to\nrepresent the user's perceived quality at different viewing distances precisely.\nExtensive experimental results demonstrate that the proposed scheme can\neffectively improve the overall average QoE by up to 26% over real networks and\nuser traces, compared to existing baselines.\n","authors":["Xi Wang","Wei Liu","Huitong Liu","Peng Yang"],"pdf_url":"https://arxiv.org/pdf/2309.05026v1.pdf","comment":"Accepted by IEEE Globecom 2023"},{"id":"http://arxiv.org/abs/2309.02567v2","updated":"2023-09-10T12:36:04Z","published":"2023-09-05T20:27:31Z","title":"Symbolic Music Representations for Classification Tasks: A Systematic\n Evaluation","summary":" Music Information Retrieval (MIR) has seen a recent surge in deep\nlearning-based approaches, which often involve encoding symbolic music (i.e.,\nmusic represented in terms of discrete note events) in an image-like or\nlanguage-like fashion. However, symbolic music is neither an image nor a\nsentence, and research in the symbolic domain lacks a comprehensive overview of\nthe different available representations. In this paper, we investigate matrix\n(piano roll), sequence, and graph representations and their corresponding\nneural architectures, in combination with symbolic scores and performances on\nthree piece-level classification tasks. We also introduce a novel graph\nrepresentation for symbolic performances and explore the capability of graph\nrepresentations in global classification tasks. Our systematic evaluation shows\nadvantages and limitations of each input representation. Our results suggest\nthat the graph representation, as the newest and least explored among the three\napproaches, exhibits promising performance, while being more light-weight in\ntraining.\n","authors":["Huan Zhang","Emmanouil Karystinaios","Simon Dixon","Gerhard Widmer","Carlos Eduardo Cancino-Chacón"],"pdf_url":"https://arxiv.org/pdf/2309.02567v2.pdf","comment":"To be published in the Proceedings of the 24th International Society\n for Music Information Retrieval Conference (ISMIR 2023), Milan, Italy"}]},"2023-09-09T00:00:00Z":{"Computation and Language":[{"id":"http://arxiv.org/abs/2309.04862v1","updated":"2023-09-09T19:01:59Z","published":"2023-09-09T19:01:59Z","title":"Distributional Data Augmentation Methods for Low Resource Language","summary":" Text augmentation is a technique for constructing synthetic data from an\nunder-resourced corpus to improve predictive performance. Synthetic data\ngeneration is common in numerous domains. However, recently text augmentation\nhas emerged in natural language processing (NLP) to improve downstream tasks.\nOne of the current state-of-the-art text augmentation techniques is easy data\naugmentation (EDA), which augments the training data by injecting and replacing\nsynonyms and randomly permuting sentences. One major obstacle with EDA is the\nneed for versatile and complete synonym dictionaries, which cannot be easily\nfound in low-resource languages. 
To improve the utility of EDA, we propose two\nextensions, easy distributional data augmentation (EDDA) and type specific\nsimilar word replacement (TSSR), which uses semantic word context information\nand part-of-speech tags for word replacement and augmentation. In an extensive\nempirical evaluation, we show the utility of the proposed methods, measured by\nF1 score, on two representative datasets in Swedish as an example of a\nlow-resource language. With the proposed methods, we show that augmented data\nimprove classification performances in low-resource settings.\n","authors":["Mosleh Mahamud","Zed Lee","Isak Samsten"],"pdf_url":"https://arxiv.org/pdf/2309.04862v1.pdf","comment":"AAAI 2023 Workshop on Knowledge Augmented Methods for NLP"},{"id":"http://arxiv.org/abs/2306.01102v3","updated":"2023-09-09T18:58:26Z","published":"2023-06-01T19:33:21Z","title":"LLMatic: Neural Architecture Search via Large Language Models and\n Quality Diversity Optimization","summary":" Large Language Models (LLMs) have emerged as powerful tools capable of\naccomplishing a broad spectrum of tasks. Their abilities span numerous areas,\nand one area where they have made a significant impact is in the domain of code\ngeneration. In this context, we view LLMs as mutation and crossover tools.\nMeanwhile, Quality-Diversity (QD) algorithms are known to discover diverse and\nrobust solutions. By merging the code-generating abilities of LLMs with the\ndiversity and robustness of QD solutions, we introduce LLMatic, a Neural\nArchitecture Search (NAS) algorithm. While LLMs struggle to conduct NAS\ndirectly through prompts, LLMatic uses a procedural approach, leveraging QD for\nprompts and network architecture to create diverse and highly performant\nnetworks. We test LLMatic on the CIFAR-10 image classification benchmark,\ndemonstrating that it can produce competitive networks with just $2,000$\nsearches, even without prior knowledge of the benchmark domain or exposure to\nany previous top-performing models for the benchmark.\n","authors":["Muhammad U. Nasir","Sam Earle","Julian Togelius","Steven James","Christopher Cleghorn"],"pdf_url":"https://arxiv.org/pdf/2306.01102v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2305.17256v2","updated":"2023-09-09T18:32:00Z","published":"2023-05-26T20:56:30Z","title":"Large Language Models Can be Lazy Learners: Analyze Shortcuts in\n In-Context Learning","summary":" Large language models (LLMs) have recently shown great potential for\nin-context learning, where LLMs learn a new task simply by conditioning on a\nfew input-label pairs (prompts). Despite their potential, our understanding of\nthe factors influencing end-task performance and the robustness of in-context\nlearning remains limited. This paper aims to bridge this knowledge gap by\ninvestigating the reliance of LLMs on shortcuts or spurious correlations within\nprompts. Through comprehensive experiments on classification and extraction\ntasks, we reveal that LLMs are \"lazy learners\" that tend to exploit shortcuts\nin prompts for downstream tasks. Additionally, we uncover a surprising finding\nthat larger models are more likely to utilize shortcuts in prompts during\ninference. 
Our findings provide a new perspective on evaluating robustness in\nin-context learning and pose new challenges for detecting and mitigating the\nuse of shortcuts in prompts.\n","authors":["Ruixiang Tang","Dehan Kong","Longtao Huang","Hui Xue"],"pdf_url":"https://arxiv.org/pdf/2305.17256v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.04858v1","updated":"2023-09-09T18:19:47Z","published":"2023-09-09T18:19:47Z","title":"Reverse-Engineering Decoding Strategies Given Blackbox Access to a\n Language Generation System","summary":" Neural language models are increasingly deployed into APIs and websites that\nallow a user to pass in a prompt and receive generated text. Many of these\nsystems do not reveal generation parameters. In this paper, we present methods\nto reverse-engineer the decoding method used to generate text (i.e., top-$k$ or\nnucleus sampling). Our ability to discover which decoding strategy was used has\nimplications for detecting generated text. Additionally, the process of\ndiscovering the decoding strategy can reveal biases caused by selecting\ndecoding settings which severely truncate a model's predicted distributions. We\nperform our attack on several families of open-source language models, as well\nas on production systems (e.g., ChatGPT).\n","authors":["Daphne Ippolito","Nicholas Carlini","Katherine Lee","Milad Nasr","Yun William Yu"],"pdf_url":"https://arxiv.org/pdf/2309.04858v1.pdf","comment":"6 pages, 4 figures, 3 tables. Also, 5 page appendix. Accepted to INLG\n 2023"},{"id":"http://arxiv.org/abs/2309.04849v1","updated":"2023-09-09T17:30:35Z","published":"2023-09-09T17:30:35Z","title":"Speech Emotion Recognition with Distilled Prosodic and Linguistic Affect\n Representations","summary":" We propose EmoDistill, a novel speech emotion recognition (SER) framework\nthat leverages cross-modal knowledge distillation during training to learn\nstrong linguistic and prosodic representations of emotion from speech. During\ninference, our method only uses a stream of speech signals to perform unimodal\nSER thus reducing computation overhead and avoiding run-time transcription and\nprosodic feature extraction errors. During training, our method distills\ninformation at both embedding and logit levels from a pair of pre-trained\nProsodic and Linguistic teachers that are fine-tuned for SER. Experiments on\nthe IEMOCAP benchmark demonstrate that our method outperforms other unimodal\nand multimodal techniques by a considerable margin, and achieves\nstate-of-the-art performance of 77.49% unweighted accuracy and 78.91% weighted\naccuracy. Detailed ablation studies demonstrate the impact of each component of\nour method.\n","authors":["Debaditya Shome","Ali Etemad"],"pdf_url":"https://arxiv.org/pdf/2309.04849v1.pdf","comment":"Under review"},{"id":"http://arxiv.org/abs/2309.04842v1","updated":"2023-09-09T17:02:33Z","published":"2023-09-09T17:02:33Z","title":"Leveraging Large Language Models for Exploiting ASR Uncertainty","summary":" While large language models excel in a variety of natural language processing\n(NLP) tasks, to perform well on spoken language understanding (SLU) tasks, they\nmust either rely on off-the-shelf automatic speech recognition (ASR) systems\nfor transcription, or be equipped with an in-built speech modality. 
This work\nfocuses on the former scenario, where LLM's accuracy on SLU tasks is\nconstrained by the accuracy of a fixed ASR system on the spoken input.\nSpecifically, we tackle speech-intent classification task, where a high\nword-error-rate can limit the LLM's ability to understand the spoken intent.\nInstead of chasing a high accuracy by designing complex or specialized\narchitectures regardless of deployment costs, we seek to answer how far we can\ngo without substantially changing the underlying ASR and LLM, which can\npotentially be shared by multiple unrelated tasks. To this end, we propose\nprompting the LLM with an n-best list of ASR hypotheses instead of only the\nerror-prone 1-best hypothesis. We explore prompt-engineering to explain the\nconcept of n-best lists to the LLM; followed by the finetuning of Low-Rank\nAdapters on the downstream tasks. Our approach using n-best lists proves to be\neffective on a device-directed speech detection task as well as on a keyword\nspotting task, where systems using n-best list prompts outperform those using\n1-best ASR hypothesis; thus paving the way for an efficient method to exploit\nASR uncertainty via LLMs for speech-based applications.\n","authors":["Pranay Dighe","Yi Su","Shangshang Zheng","Yunshu Liu","Vineet Garg","Xiaochuan Niu","Ahmed Tewfik"],"pdf_url":"https://arxiv.org/pdf/2309.04842v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.04827v1","updated":"2023-09-09T15:51:36Z","published":"2023-09-09T15:51:36Z","title":"Neurons in Large Language Models: Dead, N-gram, Positional","summary":" We analyze a family of large language models in such a lightweight manner\nthat can be done on a single GPU. Specifically, we focus on the OPT family of\nmodels ranging from 125m to 66b parameters and rely only on whether an FFN\nneuron is activated or not. First, we find that the early part of the network\nis sparse and represents many discrete features. Here, many neurons (more than\n70% in some layers of the 66b model) are \"dead\", i.e. they never activate on a\nlarge collection of diverse data. At the same time, many of the alive neurons\nare reserved for discrete features and act as token and n-gram detectors.\nInterestingly, their corresponding FFN updates not only promote next token\ncandidates as could be expected, but also explicitly focus on removing the\ninformation about triggering them tokens, i.e., current input. To the best of\nour knowledge, this is the first example of mechanisms specialized at removing\n(rather than adding) information from the residual stream. With scale, models\nbecome more sparse in a sense that they have more dead neurons and token\ndetectors. Finally, some neurons are positional: them being activated or not\ndepends largely (or solely) on position and less so (or not at all) on textual\ndata. We find that smaller models have sets of neurons acting as position range\nindicators while larger models operate in a less explicit manner.\n","authors":["Elena Voita","Javier Ferrando","Christoforos Nalmpantis"],"pdf_url":"https://arxiv.org/pdf/2309.04827v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.04823v1","updated":"2023-09-09T15:29:24Z","published":"2023-09-09T15:29:24Z","title":"FaNS: a Facet-based Narrative Similarity Metric","summary":" Similar Narrative Retrieval is a crucial task since narratives are essential\nfor explaining and understanding events, and multiple related narratives often\nhelp to create a holistic view of the event of interest. 
To accurately identify\nsemantically similar narratives, this paper proposes a novel narrative\nsimilarity metric called Facet-based Narrative Similarity (FaNS), based on the\nclassic 5W1H facets (Who, What, When, Where, Why, and How), which are extracted\nby leveraging the state-of-the-art Large Language Models (LLMs). Unlike\nexisting similarity metrics that only focus on overall lexical/semantic match,\nFaNS provides a more granular matching along six different facets independently\nand then combines them. To evaluate FaNS, we created a comprehensive dataset by\ncollecting narratives from AllSides, a third-party news portal. Experimental\nresults demonstrate that the FaNS metric exhibits a higher correlation (37\\%\nhigher) than traditional text similarity metrics that directly measure the\nlexical/semantic match between narratives, demonstrating its effectiveness in\ncomparing the finer details between a pair of narratives.\n","authors":["Mousumi Akter","Shubhra Kanti Karmaker Santu"],"pdf_url":"https://arxiv.org/pdf/2309.04823v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2211.09172v2","updated":"2023-09-09T15:21:27Z","published":"2022-11-16T19:42:31Z","title":"Deep Emotion Recognition in Textual Conversations: A Survey","summary":" While Emotion Recognition in Conversations (ERC) has seen a tremendous\nadvancement in the last few years, new applications and implementation\nscenarios present novel challenges and opportunities. These range from\nleveraging the conversational context, speaker and emotion dynamics modelling,\nto interpreting common sense expressions, informal language and sarcasm,\naddressing challenges of real time ERC, recognizing emotion causes, different\ntaxonomies across datasets, multilingual ERC to interpretability. This survey\nstarts by introducing ERC, elaborating on the challenges and opportunities\npertaining to this task. It proceeds with a description of the emotion\ntaxonomies and a variety of ERC benchmark datasets employing such taxonomies.\nThis is followed by descriptions of the most prominent works in ERC with\nexplanations of the Deep Learning architectures employed. Then, it provides\nadvisable ERC practices towards better frameworks, elaborating on methods to\ndeal with subjectivity in annotations and modelling and methods to deal with\nthe typically unbalanced ERC datasets. Finally, it presents systematic review\ntables comparing several works regarding the methods used and their\nperformance. The survey highlights the advantage of leveraging techniques to\naddress unbalanced data, the exploration of mixed emotions and the benefits of\nincorporating annotation subjectivity in the learning phase.\n","authors":["Patrícia Pereira","Helena Moniz","Joao Paulo Carvalho"],"pdf_url":"https://arxiv.org/pdf/2211.09172v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2304.07619v4","updated":"2023-09-09T15:20:54Z","published":"2023-04-15T19:22:37Z","title":"Can ChatGPT Forecast Stock Price Movements? Return Predictability and\n Large Language Models","summary":" We examine the potential of ChatGPT and other large language models in\npredicting stock market returns using news headlines. We use ChatGPT to assess\nwhether each headline is good, bad, or neutral for firms' stock prices. We\ndocument a significantly positive correlation between ChatGPT scores and\nsubsequent daily stock returns. We find that ChatGPT outperforms traditional\nsentiment analysis methods. 
More basic models such as GPT-1, GPT-2, and BERT\ncannot accurately forecast returns, indicating return predictability is an\nemerging capacity of complex language models. Long-short strategies based on\nChatGPT-4 deliver the highest Sharpe ratio. Furthermore, we find predictability\nin both small and large stocks, suggesting market underreaction to company\nnews. Predictability is stronger among smaller stocks and stocks with bad news,\nconsistent with limits-to-arbitrage also playing an important role. Finally, we\npropose a new method to evaluate and understand the models' reasoning\ncapabilities. Overall, our results suggest that incorporating advanced language\nmodels into the investment decision-making process can yield more accurate\npredictions and enhance the performance of quantitative trading strategies.\n","authors":["Alejandro Lopez-Lira","Yuehua Tang"],"pdf_url":"https://arxiv.org/pdf/2304.07619v4.pdf","comment":"Previously posted in SSRN\n https://papers.ssrn.com/sol3/papers.cfm?abstract_id=4412788"},{"id":"http://arxiv.org/abs/2309.04790v1","updated":"2023-09-09T13:35:01Z","published":"2023-09-09T13:35:01Z","title":"MMHQA-ICL: Multimodal In-context Learning for Hybrid Question Answering\n over Text, Tables and Images","summary":" In the real world, knowledge often exists in a multimodal and heterogeneous\nform. Addressing the task of question answering with hybrid data types,\nincluding text, tables, and images, is a challenging task (MMHQA). Recently,\nwith the rise of large language models (LLMs), in-context learning (ICL) has\nbecome the most popular way to solve QA problems. We propose the MMHQA-ICL\nframework to address this problem, which includes a stronger heterogeneous\ndata retriever and an image caption module. Most importantly, we propose a\nType-specific In-context Learning Strategy for MMHQA, enabling LLMs to leverage\ntheir powerful performance in this task. We are the first to use an end-to-end LLM\nprompting method for this task. Experimental results demonstrate that our\nframework outperforms all baselines and methods trained on the full dataset,\nachieving state-of-the-art results under the few-shot setting on the\nMultimodalQA dataset.\n","authors":["Weihao Liu","Fangyu Lei","Tongxu Luo","Jiahe Lei","Shizhu He","Jun Zhao","Kang Liu"],"pdf_url":"https://arxiv.org/pdf/2309.04790v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.04766v1","updated":"2023-09-09T11:42:22Z","published":"2023-09-09T11:42:22Z","title":"SeaEval for Multilingual Foundation Models: From Cross-Lingual Alignment\n to Cultural Reasoning","summary":" We present SeaEval, a benchmark for multilingual foundation models. In\naddition to characterizing how these models understand and reason with natural\nlanguage, we also investigate how well they comprehend cultural practices,\nnuances, and values. Alongside standard accuracy metrics, we investigate the\nbrittleness of foundation models in the dimensions of semantics and\nmultilinguality. Our analyses span both open-sourced and closed models, leading\nto empirical results across classic NLP tasks, reasoning, and cultural\ncomprehension. Key findings indicate (1) Most models exhibit varied behavior\nwhen given paraphrased instructions. (2) Many models still suffer from exposure\nbias (e.g., positional bias, majority label bias). (3) For questions rooted in\nfactual, scientific, and commonsense knowledge, consistent responses are\nexpected across multilingual queries that are semantically equivalent. 
Yet,\nmost models surprisingly demonstrate inconsistent performance on these queries.\n(4) Multilingually-trained models have not attained \"balanced multilingual\"\ncapabilities. Our endeavors underscore the need for more generalizable semantic\nrepresentations and enhanced multilingual contextualization. SeaEval can serve\nas a launchpad for more thorough investigations and evaluations for\nmultilingual and multicultural scenarios.\n","authors":["Bin Wang","Zhengyuan Liu","Xin Huang","Fangkai Jiao","Yang Ding","Ai Ti Aw","Nancy F. Chen"],"pdf_url":"https://arxiv.org/pdf/2309.04766v1.pdf","comment":"15 pages, 7 figures"},{"id":"http://arxiv.org/abs/2309.04739v1","updated":"2023-09-09T09:56:35Z","published":"2023-09-09T09:56:35Z","title":"Data Augmentation for Conversational AI","summary":" Advancements in conversational systems have revolutionized information\naccess, surpassing the limitations of single queries. However, developing\ndialogue systems requires a large amount of training data, which is a challenge\nin low-resource domains and languages. Traditional data collection methods like\ncrowd-sourcing are labor-intensive and time-consuming, making them ineffective\nin this context. Data augmentation (DA) is an effective approach to alleviate\nthe data scarcity problem in conversational systems. This tutorial provides a\ncomprehensive and up-to-date overview of DA approaches in the context of\nconversational systems. It highlights recent advances in conversation\naugmentation, open domain and task-oriented conversation generation, and\ndifferent paradigms of evaluating these models. We also discuss current\nchallenges and future directions in order to help researchers and practitioners\nfurther advance the field in this area.\n","authors":["Heydar Soudani","Evangelos Kanoulas","Faegheh Hasibi"],"pdf_url":"https://arxiv.org/pdf/2309.04739v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.04734v1","updated":"2023-09-09T09:41:36Z","published":"2023-09-09T09:41:36Z","title":"Towards Better Multi-modal Keyphrase Generation via Visual Entity\n Enhancement and Multi-granularity Image Noise Filtering","summary":" Multi-modal keyphrase generation aims to produce a set of keyphrases that\nrepresent the core points of the input text-image pair. In this regard,\ndominant methods mainly focus on multi-modal fusion for keyphrase generation.\nNevertheless, there are still two main drawbacks: 1) only a limited number of\nsources, such as image captions, can be utilized to provide auxiliary\ninformation. However, they may not be sufficient for the subsequent keyphrase\ngeneration. 2) the input text and image are often not perfectly matched, and\nthus the image may introduce noise into the model. To address these\nlimitations, in this paper, we propose a novel multi-modal keyphrase generation\nmodel, which not only enriches the model input with external knowledge, but\nalso effectively filters image noise. First, we introduce external visual\nentities of the image as the supplementary input to the model, which benefits\nthe cross-modal semantic alignment for keyphrase generation. Second, we\nsimultaneously calculate an image-text matching score and image region-text\ncorrelation scores to perform multi-granularity image noise filtering.\nParticularly, we introduce the correlation scores between image regions and\nground-truth keyphrases to refine the calculation of the previously-mentioned\ncorrelation scores. 
To demonstrate the effectiveness of our model, we conduct\nseveral groups of experiments on the benchmark dataset.\n Experimental results and in-depth analyses show that our model achieves the\nstate-of-the-art performance. Our code is available on\nhttps://github.com/DeepLearnXMU/MM-MKP.\n","authors":["Yifan Dong","Suhang Wu","Fandong Meng","Jie Zhou","Xiaoli Wang","Jianxin Lin","Jinsong Su"],"pdf_url":"https://arxiv.org/pdf/2309.04734v1.pdf","comment":"Accepted In Proceedings of the 31st ACM International Conference on\n Multimedia (MM' 23)"},{"id":"http://arxiv.org/abs/2309.04725v1","updated":"2023-09-09T09:03:50Z","published":"2023-09-09T09:03:50Z","title":"EPA: Easy Prompt Augmentation on Large Language Models via Multiple\n Sources and Multiple Targets","summary":" Large language models (LLMs) have shown promising performance on various NLP\ntasks via task prompting. And their performance can be further improved by\nappending task demonstrations to the head of the prompt. And usually, a better\nperformance can be achieved with more demonstrations. However, asking the users\nto write the demonstrations can be cumbersome. As a simple yet cost-effective\nworkaround, this paper proposes a novel method called EPA (\\textbf{E}asy\n\\textbf{P}rompt \\textbf{A}ugmentation)\\footnote{While this paper considers\naugmenting prompts via demonstrations, we name it EPA as the name EDA is\nalready taken by a well-known NLP method \\citep{wei-zou-2019-eda}.} that\neffectively minimizes user efforts in writing demonstrations while improving\nthe model performance at the same time. EPA achieves these goals by\nautomatically augmenting the demonstrations with multiple sources/targets,\nwhere each of them paraphrases each other. This is well motivated as augmenting\ndata via paraphrasing effectively improves neural language models. EPA thus\nemploys paraphrasing as an augmentation method for in-context learning.\nExtensive experiments indicate that EPA effectively improves both NLU and NLG\ntasks, covering from natural language inference to machine translation in\ntranslating tens of languages.\\footnote{Code and data will be released upon\npublication.}\n","authors":["Hongyuan Lu","Wai Lam"],"pdf_url":"https://arxiv.org/pdf/2309.04725v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2207.09847v3","updated":"2023-09-09T08:33:37Z","published":"2022-07-07T22:49:32Z","title":"Predicting Word Learning in Children from the Performance of Computer\n Vision Systems","summary":" For human children as well as machine learning systems, a key challenge in\nlearning a word is linking the word to the visual phenomena it describes. We\nexplore this aspect of word learning by using the performance of computer\nvision systems as a proxy for the difficulty of learning a word from visual\ncues. We show that the age at which children acquire different categories of\nwords is correlated with the performance of visual classification and\ncaptioning systems, over and above the expected effects of word frequency. The\nperformance of the computer vision systems is correlated with human judgments\nof the concreteness of words, which are in turn a predictor of children's word\nlearning, suggesting that these models are capturing the relationship between\nwords and visual phenomena.\n","authors":["Sunayana Rane","Mira L. Nencheva","Zeyu Wang","Casey Lew-Williams","Olga Russakovsky","Thomas L. 
Griffiths"],"pdf_url":"https://arxiv.org/pdf/2207.09847v3.pdf","comment":"CogSci 2023"},{"id":"http://arxiv.org/abs/2309.04716v1","updated":"2023-09-09T08:07:54Z","published":"2023-09-09T08:07:54Z","title":"Toward Reproducing Network Research Results Using Large Language Models","summary":" Reproducing research results in the networking community is important for\nboth academia and industry. The current best practice typically resorts to\nthree approaches: (1) looking for publicly available prototypes; (2) contacting\nthe authors to get a private prototype; and (3) manually implementing a\nprototype following the description of the publication. However, most published\nnetwork research does not have public prototypes and private prototypes are\nhard to get. As such, most reproducing efforts are spent on manual\nimplementation based on the publications, which is both time and labor\nconsuming and error-prone. In this paper, we boldly propose reproducing network\nresearch results using the emerging large language models (LLMs). In\nparticular, we first prove its feasibility with a small-scale experiment, in\nwhich four students with essential networking knowledge each reproduces a\ndifferent networking system published in prominent conferences and journals by\nprompt engineering ChatGPT. We report the experiment's observations and lessons\nand discuss future open research questions of this proposal. This work raises\nno ethical issue.\n","authors":["Qiao Xiang","Yuling Lin","Mingjun Fang","Bang Huang","Siyong Huang","Ridi Wen","Franck Le","Linghe Kong","Jiwu Shu"],"pdf_url":"https://arxiv.org/pdf/2309.04716v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.04704v1","updated":"2023-09-09T07:10:19Z","published":"2023-09-09T07:10:19Z","title":"Analysis of Disinformation and Fake News Detection Using Fine-Tuned\n Large Language Model","summary":" The paper considers the possibility of fine-tuning Llama 2 large language\nmodel (LLM) for the disinformation analysis and fake news detection. For\nfine-tuning, the PEFT/LoRA based approach was used. In the study, the model was\nfine-tuned for the following tasks: analysing a text on revealing\ndisinformation and propaganda narratives, fact checking, fake news detection,\nmanipulation analytics, extracting named entities with their sentiments. The\nobtained results show that the fine-tuned Llama 2 model can perform a deep\nanalysis of texts and reveal complex styles and narratives. Extracted\nsentiments for named entities can be considered as predictive features in\nsupervised machine learning models.\n","authors":["Bohdan M. Pavlyshenko"],"pdf_url":"https://arxiv.org/pdf/2309.04704v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.04695v1","updated":"2023-09-09T06:27:00Z","published":"2023-09-09T06:27:00Z","title":"Code-Style In-Context Learning for Knowledge-Based Question Answering","summary":" Current methods for Knowledge-Based Question Answering (KBQA) usually rely on\ncomplex training techniques and model frameworks, leading to many limitations\nin practical applications. Recently, the emergence of In-Context Learning (ICL)\ncapabilities in Large Language Models (LLMs) provides a simple and\ntraining-free semantic parsing paradigm for KBQA: Given a small number of\nquestions and their labeled logical forms as demo examples, LLMs can understand\nthe task intent and generate the logic form for a new question. However,\ncurrent powerful LLMs have little exposure to logic forms during pre-training,\nresulting in a high format error rate. 
To solve this problem, we propose a\ncode-style in-context learning method for KBQA, which converts the generation\nprocess of unfamiliar logical form into the more familiar code generation\nprocess for LLMs. Experimental results on three mainstream datasets show that\nour method dramatically mitigated the formatting error problem in generating\nlogic forms while realizing a new SOTA on WebQSP, GrailQA, and GraphQ under the\nfew-shot setting.\n","authors":["Zhijie Nie","Richong Zhang","Zhongyuan Wang","Xudong Liu"],"pdf_url":"https://arxiv.org/pdf/2309.04695v1.pdf","comment":"work in progress"},{"id":"http://arxiv.org/abs/2309.04679v1","updated":"2023-09-09T04:27:18Z","published":"2023-09-09T04:27:18Z","title":"Embedding structure matters: Comparing methods to adapt multilingual\n vocabularies to new languages","summary":" Pre-trained multilingual language models underpin a large portion of modern\nNLP tools outside of English. A strong baseline for specializing these models\nfor specific languages is Language-Adaptive Pre-Training (LAPT). However,\nretaining a large cross-lingual vocabulary and embedding matrix comes at\nconsiderable excess computational cost during adaptation. In this study, we\npropose several simple techniques to replace a cross-lingual vocabulary with a\ncompact, language-specific one. Namely, we address strategies for\nre-initializing the token embedding matrix after vocabulary specialization. We\nthen provide a systematic experimental comparison of our techniques, in\naddition to the recently-proposed Focus method. We demonstrate that: 1)\nEmbedding-replacement techniques in the monolingual transfer literature are\ninadequate for adapting multilingual models. 2) Replacing cross-lingual\nvocabularies with smaller specialized ones provides an efficient method to\nimprove performance in low-resource languages. 3) Simple embedding\nre-initialization techniques based on script-wise sub-distributions rival\ntechniques such as Focus, which rely on similarity scores obtained from an\nauxiliary model.\n","authors":["C. M. Downey","Terra Blevins","Nora Goldfine","Shane Steinert-Threlkeld"],"pdf_url":"https://arxiv.org/pdf/2309.04679v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2306.14115v2","updated":"2023-09-09T03:50:04Z","published":"2023-06-25T03:34:06Z","title":"Towards Trustworthy Explanation: On Causal Rationalization","summary":" With recent advances in natural language processing, rationalization becomes\nan essential self-explaining diagram to disentangle the black box by selecting\na subset of input texts to account for the major variation in prediction. Yet,\nexisting association-based approaches on rationalization cannot identify true\nrationales when two or more snippets are highly inter-correlated and thus\nprovide a similar contribution to prediction accuracy, so-called spuriousness.\nTo address this limitation, we novelly leverage two causal desiderata,\nnon-spuriousness and efficiency, into rationalization from the causal inference\nperspective. We formally define a series of probabilities of causation based on\na newly proposed structural causal model of rationalization, with its\ntheoretical identification established as the main component of learning\nnecessary and sufficient rationales. 
The superior performance of the proposed\ncausal rationalization is demonstrated on real-world review and medical\ndatasets with extensive experiments compared to state-of-the-art methods.\n","authors":["Wenbo Zhang","Tong Wu","Yunlong Wang","Yong Cai","Hengrui Cai"],"pdf_url":"https://arxiv.org/pdf/2306.14115v2.pdf","comment":"In Proceedings of the 40th International Conference on Machine\n Learning (ICML) GitHub Repository:\n https://github.com/onepounchman/Causal-Retionalization"},{"id":"http://arxiv.org/abs/2308.11224v2","updated":"2023-09-09T03:14:10Z","published":"2023-08-22T06:32:07Z","title":"Evaluating Large Language Models on Graphs: Performance Insights and\n Comparative Analysis","summary":" Large Language Models (LLMs) have garnered considerable interest within both\nacademic and industrial. Yet, the application of LLMs to graph data remains\nunder-explored. In this study, we evaluate the capabilities of four LLMs in\naddressing several analytical problems with graph data. We employ four distinct\nevaluation metrics: Comprehension, Correctness, Fidelity, and Rectification.\nOur results show that: 1) LLMs effectively comprehend graph data in natural\nlanguage and reason with graph topology. 2) GPT models can generate logical and\ncoherent results, outperforming alternatives in correctness. 3) All examined\nLLMs face challenges in structural reasoning, with techniques like zero-shot\nchain-of-thought and few-shot prompting showing diminished efficacy. 4) GPT\nmodels often produce erroneous answers in multi-answer tasks, raising concerns\nin fidelity. 5) GPT models exhibit elevated confidence in their outputs,\npotentially hindering their rectification capacities. Notably, GPT-4 has\ndemonstrated the capacity to rectify responses from GPT-3.5-turbo and its own\nprevious iterations. The code is available at:\nhttps://github.com/Ayame1006/LLMtoGraph.\n","authors":["Chang Liu","Bo Wu"],"pdf_url":"https://arxiv.org/pdf/2308.11224v2.pdf","comment":"12 pages, 1 figure"},{"id":"http://arxiv.org/abs/2309.04663v1","updated":"2023-09-09T02:43:48Z","published":"2023-09-09T02:43:48Z","title":"FIAT: Fusing learning paradigms with Instruction-Accelerated Tuning","summary":" Learning paradigms for large language models (LLMs) currently tend to fall\nwithin either in-context learning (ICL) or full fine-tuning. Each of these\ncomes with their own trade-offs based on available data, model size, compute\ncost, ease-of-use, and final quality with neither solution performing well\nacross-the-board. In this article, we first describe ICL and fine-tuning\nparadigms in a way that highlights their natural connections. Based on these\nconnections, we propose a new learning paradigm called FIAT that fuses the best\nof these paradigms together, enabling prompt-engineered instructions and\nchain-of-thought reasoning with the very largest models while also using\nsimilar methods to perform parameter updates on a modestly-sized LLM with\nparameter-efficient tuning. We evaluate FIAT's effectiveness on a variety of\nmultilingual tasks and observe that FIAT performs better than both ICL and\nfine-tuning at scales ranging from 100-10,000 training examples. We hope that\nFIAT provides a practical way of harnessing the full potential of LLMs without\nneeding to make a hard choice between learning paradigms.\n","authors":["Xinyi Wang","John Wieting","Jonathan H. 
Clark"],"pdf_url":"https://arxiv.org/pdf/2309.04663v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.04662v1","updated":"2023-09-09T02:34:01Z","published":"2023-09-09T02:34:01Z","title":"MADLAD-400: A Multilingual And Document-Level Large Audited Dataset","summary":" We introduce MADLAD-400, a manually audited, general domain 3T token\nmonolingual dataset based on CommonCrawl, spanning 419 languages. We discuss\nthe limitations revealed by self-auditing MADLAD-400, and the role data\nauditing had in the dataset creation process. We then train and release a\n10.7B-parameter multilingual machine translation model on 250 billion tokens\ncovering over 450 languages using publicly available data, and find that it is\ncompetitive with models that are significantly larger, and report the results\non different domains. In addition, we train a 8B-parameter language model, and\nassess the results on few-shot translation. We make the baseline models\navailable to the research community.\n","authors":["Sneha Kudugunta","Isaac Caswell","Biao Zhang","Xavier Garcia","Christopher A. Choquette-Choo","Katherine Lee","Derrick Xin","Aditya Kusupati","Romi Stella","Ankur Bapna","Orhan Firat"],"pdf_url":"https://arxiv.org/pdf/2309.04662v1.pdf","comment":"Preprint"},{"id":"http://arxiv.org/abs/2309.04658v1","updated":"2023-09-09T01:56:40Z","published":"2023-09-09T01:56:40Z","title":"Exploring Large Language Models for Communication Games: An Empirical\n Study on Werewolf","summary":" Communication games, which we refer to as incomplete information games that\nheavily depend on natural language communication, hold significant research\nvalue in fields such as economics, social science, and artificial intelligence.\nIn this work, we explore the problem of how to engage large language models\n(LLMs) in communication games, and in response, propose a tuning-free\nframework. Our approach keeps LLMs frozen, and relies on the retrieval and\nreflection on past communications and experiences for improvement. An empirical\nstudy on the representative and widely-studied communication game,\n``Werewolf'', demonstrates that our framework can effectively play Werewolf\ngame without tuning the parameters of the LLMs. More importantly, strategic\nbehaviors begin to emerge in our experiments, suggesting that it will be a\nfruitful journey to engage LLMs in communication games and associated domains.\n","authors":["Yuzhuang Xu","Shuo Wang","Peng Li","Fuwen Luo","Xiaolong Wang","Weidong Liu","Yang Liu"],"pdf_url":"https://arxiv.org/pdf/2309.04658v1.pdf","comment":"23 pages, 5 figures and 4 tables"},{"id":"http://arxiv.org/abs/2309.04646v1","updated":"2023-09-09T00:11:53Z","published":"2023-09-09T00:11:53Z","title":"Efficient Finetuning Large Language Models For Vietnamese Chatbot","summary":" Large language models (LLMs), such as GPT-4, PaLM, and LLaMa, have been shown\nto achieve remarkable performance across a variety of natural language tasks.\nRecent advancements in instruction tuning bring LLMs with ability in following\nuser's instructions and producing human-like responses. However, the high costs\nassociated with training and implementing LLMs pose challenges to academic\nresearch. Furthermore, the availability of pretrained LLMs and instruction-tune\ndatasets for Vietnamese language is limited. To tackle these concerns, we\nleverage large-scale instruction-following datasets from open-source projects,\nnamely Alpaca, GPT4All, and Chat-Doctor, which cover general domain and\nspecific medical domain. 
To the best of our knowledge, these are the first\ninstructional dataset for Vietnamese. Subsequently, we utilize\nparameter-efficient tuning through Low-Rank Adaptation (LoRA) on two open LLMs:\nBloomz (Multilingual) and GPTJ-6B (Vietnamese), resulting four models:\nBloomz-Chat, Bloomz-Doctor, GPTJ-Chat, GPTJ-Doctor.Finally, we assess the\neffectiveness of our methodology on a per-sample basis, taking into\nconsideration the helpfulness, relevance, accuracy, level of detail in their\nresponses. This evaluation process entails the utilization of GPT-4 as an\nautomated scoring mechanism. Despite utilizing a low-cost setup, our method\ndemonstrates about 20-30\\% improvement over the original models in our\nevaluation tasks.\n","authors":["Vu-Thuan Doan","Quoc-Truong Truong","Duc-Vu Nguyen","Vinh-Tiep Nguyen","Thuy-Ngan Nguyen Luu"],"pdf_url":"https://arxiv.org/pdf/2309.04646v1.pdf","comment":"arXiv admin note: text overlap with arXiv:2304.08177,\n arXiv:2303.16199 by other authors"}],"Computer Vision and Pattern Recognition":[{"id":"http://arxiv.org/abs/2309.04891v1","updated":"2023-09-09T23:03:50Z","published":"2023-09-09T23:03:50Z","title":"How to Evaluate Semantic Communications for Images with ViTScore Metric?","summary":" Semantic communications (SC) have been expected to be a new paradigm shifting\nto catalyze the next generation communication, whose main concerns shift from\naccurate bit transmission to effective semantic information exchange in\ncommunications. However, the previous and widely-used metrics for images are\nnot applicable to evaluate the image semantic similarity in SC. Classical\nmetrics to measure the similarity between two images usually rely on the pixel\nlevel or the structural level, such as the PSNR and the MS-SSIM.\nStraightforwardly using some tailored metrics based on deep-learning methods in\nCV community, such as the LPIPS, is infeasible for SC. To tackle this, inspired\nby BERTScore in NLP community, we propose a novel metric for evaluating image\nsemantic similarity, named Vision Transformer Score (ViTScore). We prove\ntheoretically that ViTScore has 3 important properties, including symmetry,\nboundedness, and normalization, which make ViTScore convenient and intuitive\nfor image measurement. To evaluate the performance of ViTScore, we compare\nViTScore with 3 typical metrics (PSNR, MS-SSIM, and LPIPS) through 5 classes of\nexperiments. Experimental results demonstrate that ViTScore can better evaluate\nthe image semantic similarity than the other 3 typical metrics, which indicates\nthat ViTScore is an effective performance metric when deployed in SC scenarios.\n","authors":["Tingting Zhu","Bo Peng","Jifan Liang","Tingchen Han","Hai Wan","Jingqiao Fu","Junjie Chen"],"pdf_url":"https://arxiv.org/pdf/2309.04891v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.04888v1","updated":"2023-09-09T22:55:25Z","published":"2023-09-09T22:55:25Z","title":"Semi-supervised Instance Segmentation with a Learned Shape Prior","summary":" To date, most instance segmentation approaches are based on supervised\nlearning that requires a considerable amount of annotated object contours as\ntraining ground truth. Here, we propose a framework that searches for the\ntarget object based on a shape prior. 
The shape prior model is learned with a\nvariational autoencoder that requires only a very limited amount of training\ndata: In our experiments, a few dozens of object shape patches from the target\ndataset, as well as purely synthetic shapes, were sufficient to achieve results\nen par with supervised methods with full access to training data on two out of\nthree cell segmentation datasets. Our method with a synthetic shape prior was\nsuperior to pre-trained supervised models with access to limited\ndomain-specific training data on all three datasets. Since the learning of\nprior models requires shape patches, whether real or synthetic data, we call\nthis framework semi-supervised learning.\n","authors":["Long Chen","Weiwen Zhang","Yuli Wu","Martin Strauch","Dorit Merhof"],"pdf_url":"https://arxiv.org/pdf/2309.04888v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.04887v1","updated":"2023-09-09T22:50:35Z","published":"2023-09-09T22:50:35Z","title":"SortedAP: Rethinking evaluation metrics for instance segmentation","summary":" Designing metrics for evaluating instance segmentation revolves around\ncomprehensively considering object detection and segmentation accuracy.\nHowever, other important properties, such as sensitivity, continuity, and\nequality, are overlooked in the current study. In this paper, we reveal that\nmost existing metrics have a limited resolution of segmentation quality. They\nare only conditionally sensitive to the change of masks or false predictions.\nFor certain metrics, the score can change drastically in a narrow range which\ncould provide a misleading indication of the quality gap between results.\nTherefore, we propose a new metric called sortedAP, which strictly decreases\nwith both object- and pixel-level imperfections and has an uninterrupted\npenalization scale over the entire domain. We provide the evaluation toolkit\nand experiment code at https://www.github.com/looooongChen/sortedAP.\n","authors":["Long Chen","Yuli Wu","Johannes Stegmaier","Dorit Merhof"],"pdf_url":"https://arxiv.org/pdf/2309.04887v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2301.09164v3","updated":"2023-09-09T20:43:13Z","published":"2023-01-22T17:12:58Z","title":"Unifying Synergies between Self-supervised Learning and Dynamic\n Computation","summary":" Computationally expensive training strategies make self-supervised learning\n(SSL) impractical for resource constrained industrial settings. Techniques like\nknowledge distillation (KD), dynamic computation (DC), and pruning are often\nused to obtain a lightweightmodel, which usually involves multiple epochs of\nfine-tuning (or distilling steps) of a large pre-trained model, making it more\ncomputationally challenging. In this work we present a novel perspective on the\ninterplay between SSL and DC paradigms. In particular, we show that it is\nfeasible to simultaneously learn a dense and gated sub-network from scratch in\na SSL setting without any additional fine-tuning or pruning steps. The\nco-evolution during pre-training of both dense and gated encoder offers a good\naccuracy-efficiency trade-off and therefore yields a generic and multi-purpose\narchitecture for application specific industrial settings. 
Extensive\nexperiments on several image classification benchmarks including CIFAR-10/100,\nSTL-10 and ImageNet-100, demonstrate that the proposed training strategy\nprovides a dense and corresponding gated sub-network that achieves on-par\nperformance compared with the vanilla self-supervised setting, but at a\nsignificant reduction in computation in terms of FLOPs, under a range of target\nbudgets (td ).\n","authors":["Tarun Krishna","Ayush K Rai","Alexandru Drimbarean","Eric Arazo","Paul Albert","Alan F Smeaton","Kevin McGuinness","Noel E O'Connor"],"pdf_url":"https://arxiv.org/pdf/2301.09164v3.pdf","comment":"Accepted in BMVC 2023"},{"id":"http://arxiv.org/abs/2309.04840v1","updated":"2023-09-09T16:59:57Z","published":"2023-09-09T16:59:57Z","title":"AnyPose: Anytime 3D Human Pose Forecasting via Neural Ordinary\n Differential Equations","summary":" Anytime 3D human pose forecasting is crucial to synchronous real-world\nhuman-machine interaction, where the term ``anytime\" corresponds to predicting\nhuman pose at any real-valued time step. However, to the best of our knowledge,\nall the existing methods in human pose forecasting perform predictions at\npreset, discrete time intervals. Therefore, we introduce AnyPose, a lightweight\ncontinuous-time neural architecture that models human behavior dynamics with\nneural ordinary differential equations. We validate our framework on the\nHuman3.6M, AMASS, and 3DPW dataset and conduct a series of comprehensive\nanalyses towards comparison with existing methods and the intersection of human\npose and neural ordinary differential equations. Our results demonstrate that\nAnyPose exhibits high-performance accuracy in predicting future poses and takes\nsignificantly lower computational time than traditional methods in solving\nanytime prediction tasks.\n","authors":["Zixing Wang","Ahmed H. Qureshi"],"pdf_url":"https://arxiv.org/pdf/2309.04840v1.pdf","comment":null}],"Information Retrieval":[{"id":"http://arxiv.org/abs/2309.04884v1","updated":"2023-09-09T22:23:05Z","published":"2023-09-09T22:23:05Z","title":"RecAD: Towards A Unified Library for Recommender Attack and Defense","summary":" In recent years, recommender systems have become a ubiquitous part of our\ndaily lives, while they suffer from a high risk of being attacked due to the\ngrowing commercial and social values. Despite significant research progress in\nrecommender attack and defense, there is a lack of a widely-recognized\nbenchmarking standard in the field, leading to unfair performance comparison\nand limited credibility of experiments. To address this, we propose RecAD, a\nunified library aiming at establishing an open benchmark for recommender attack\nand defense. RecAD takes an initial step to set up a unified benchmarking\npipeline for reproducible research by integrating diverse datasets, standard\nsource codes, hyper-parameter settings, running logs, attack knowledge, attack\nbudget, and evaluation results. The benchmark is designed to be comprehensive\nand sustainable, covering both attack, defense, and evaluation tasks, enabling\nmore researchers to easily follow and contribute to this promising field. 
RecAD\nwill drive more solid and reproducible research on recommender systems attack\nand defense, reduce the redundant efforts of researchers, and ultimately\nincrease the credibility and practical value of recommender attack and defense.\nThe project is released at https://github.com/gusye1234/recad.\n","authors":["Changsheng Wang","Jianbai Ye","Wenjie Wang","Chongming Gao","Fuli Feng","Xiangnan He"],"pdf_url":"https://arxiv.org/pdf/2309.04884v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.04861v1","updated":"2023-09-09T19:01:12Z","published":"2023-09-09T19:01:12Z","title":"Exploring Music Genre Classification: Algorithm Analysis and Deployment\n Architecture","summary":" Music genre classification has become increasingly critical with the advent\nof various streaming applications. Nowadays, we find it impossible to imagine\nusing the artist's name and song title to search for music in a sophisticated\nmusic app. It is always difficult to classify music correctly because the\ninformation linked to music, such as region, artist, album, or non-album, is so\nvariable. This paper presents a study on music genre classification using a\ncombination of Digital Signal Processing (DSP) and Deep Learning (DL)\ntechniques. A novel algorithm is proposed that utilizes both DSP and DL methods\nto extract relevant features from audio signals and classify them into various\ngenres. The algorithm was tested on the GTZAN dataset and achieved high\naccuracy. An end-to-end deployment architecture is also proposed for\nintegration into music-related applications. The performance of the algorithm\nis analyzed and future directions for improvement are discussed. The proposed\nDSP and DL-based music genre classification algorithm and deployment\narchitecture demonstrate a promising approach for music genre classification.\n","authors":["Ayan Biswas","Supriya Dhabal","Palaniandavar Venkateswaran"],"pdf_url":"https://arxiv.org/pdf/2309.04861v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.04802v1","updated":"2023-09-09T14:07:11Z","published":"2023-09-09T14:07:11Z","title":"CPMR: Context-Aware Incremental Sequential Recommendation with\n Pseudo-Multi-Task Learning","summary":" The motivations of users to make interactions can be divided into static\npreference and dynamic interest. To accurately model user representations over\ntime, recent studies in sequential recommendation utilize information\npropagation and evolution to mine from batches of arriving interactions.\nHowever, they ignore the fact that people are easily influenced by the recent\nactions of other users in the contextual scenario, and applying evolution\nacross all historical interactions dilutes the importance of recent ones, thus\nfailing to model the evolution of dynamic interest accurately. To address this\nissue, we propose a Context-Aware Pseudo-Multi-Task Recommender System (CPMR)\nto model the evolution in both historical and contextual scenarios by creating\nthree representations for each user and item under different dynamics: static\nembedding, historical temporal states, and contextual temporal states. To\ndually improve the performance of temporal states evolution and incremental\nrecommendation, we design a Pseudo-Multi-Task Learning (PMTL) paradigm by\nstacking the incremental single-target recommendations into one multi-target\ntask for joint optimization. 
Within the PMTL paradigm, CPMR employs a\nshared-bottom network to conduct the evolution of temporal states across\nhistorical and contextual scenarios, as well as the fusion of them at the\nuser-item level. In addition, CPMR incorporates one real tower for incremental\npredictions, and two pseudo towers dedicated to updating the respective\ntemporal states based on new batches of interactions. Experimental results on\nfour benchmark recommendation datasets show that CPMR consistently outperforms\nstate-of-the-art baselines and achieves significant gains on three of them. The\ncode is available at: https://github.com/DiMarzioBian/CPMR.\n","authors":["Qingtian Bian","Jiaxing Xu","Hui Fang","Yiping Ke"],"pdf_url":"https://arxiv.org/pdf/2309.04802v1.pdf","comment":"Accepted by CIKM 2023"},{"id":"http://arxiv.org/abs/2304.10711v2","updated":"2023-09-09T13:14:59Z","published":"2023-04-21T02:48:29Z","title":"EulerNet: Adaptive Feature Interaction Learning via Euler's Formula for\n CTR Prediction","summary":" Learning effective high-order feature interactions is very crucial in the CTR\nprediction task. However, it is very time-consuming to calculate high-order\nfeature interactions with massive features in online e-commerce platforms. Most\nexisting methods manually design a maximal order and further filter out the\nuseless interactions from them. Although they reduce the high computational\ncosts caused by the exponential growth of high-order feature combinations, they\nstill suffer from the degradation of model capability due to the suboptimal\nlearning of the restricted feature orders. The solution to maintain the model\ncapability and meanwhile keep it efficient is a technical challenge, which has\nnot been adequately addressed. To address this issue, we propose an adaptive\nfeature interaction learning model, named as EulerNet, in which the feature\ninteractions are learned in a complex vector space by conducting space mapping\naccording to Euler's formula. EulerNet converts the exponential powers of\nfeature interactions into simple linear combinations of the modulus and phase\nof the complex features, making it possible to adaptively learn the high-order\nfeature interactions in an efficient way. Furthermore, EulerNet incorporates\nthe implicit and explicit feature interactions into a unified architecture,\nwhich achieves the mutual enhancement and largely boosts the model\ncapabilities. Such a network can be fully learned from data, with no need of\npre-designed form or order for feature interactions. Extensive experiments\nconducted on three public datasets have demonstrated the effectiveness and\nefficiency of our approach. Our code is available at:\nhttps://github.com/RUCAIBox/EulerNet.\n","authors":["Zhen Tian","Ting Bai","Wayne Xin Zhao","Ji-Rong Wen","Zhao Cao"],"pdf_url":"https://arxiv.org/pdf/2304.10711v2.pdf","comment":"10 pages, 7 figures, accepted for publication in SIGIR'23"},{"id":"http://arxiv.org/abs/2309.04761v1","updated":"2023-09-09T11:20:40Z","published":"2023-09-09T11:20:40Z","title":"A Comprehensive Survey on Deep Learning Techniques in Educational Data\n Mining","summary":" Educational Data Mining (EDM) has emerged as a vital field of research, which\nharnesses the power of computational techniques to analyze educational data.\nWith the increasing complexity and diversity of educational data, Deep Learning\ntechniques have shown significant advantages in addressing the challenges\nassociated with analyzing and modeling this data. 
This survey aims to\nsystematically review the state-of-the-art in EDM with Deep Learning. We begin\nby providing a brief introduction to EDM and Deep Learning, highlighting their\nrelevance in the context of modern education. Next, we present a detailed\nreview of Deep Learning techniques applied in four typical educational\nscenarios, including knowledge tracing, undesirable student detecting,\nperformance prediction, and personalized recommendation. Furthermore, a\ncomprehensive overview of public datasets and processing tools for EDM is\nprovided. Finally, we point out emerging trends and future directions in this\nresearch area.\n","authors":["Yuanguo Lin","Hong Chen","Wei Xia","Fan Lin","Pengcheng Wu","Zongyue Wang","Yong Li"],"pdf_url":"https://arxiv.org/pdf/2309.04761v1.pdf","comment":"21 pages, 5 figures"},{"id":"http://arxiv.org/abs/2309.04739v1","updated":"2023-09-09T09:56:35Z","published":"2023-09-09T09:56:35Z","title":"Data Augmentation for Conversational AI","summary":" Advancements in conversational systems have revolutionized information\naccess, surpassing the limitations of single queries. However, developing\ndialogue systems requires a large amount of training data, which is a challenge\nin low-resource domains and languages. Traditional data collection methods like\ncrowd-sourcing are labor-intensive and time-consuming, making them ineffective\nin this context. Data augmentation (DA) is an affective approach to alleviate\nthe data scarcity problem in conversational systems. This tutorial provides a\ncomprehensive and up-to-date overview of DA approaches in the context of\nconversational systems. It highlights recent advances in conversation\naugmentation, open domain and task-oriented conversation generation, and\ndifferent paradigms of evaluating these models. We also discuss current\nchallenges and future directions in order to help researchers and practitioners\nto further advance the field in this area.\n","authors":["Heydar Soudani","Evangelos Kanoulas","Faegheh Hasibi"],"pdf_url":"https://arxiv.org/pdf/2309.04739v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.04704v1","updated":"2023-09-09T07:10:19Z","published":"2023-09-09T07:10:19Z","title":"Analysis of Disinformation and Fake News Detection Using Fine-Tuned\n Large Language Model","summary":" The paper considers the possibility of fine-tuning Llama 2 large language\nmodel (LLM) for the disinformation analysis and fake news detection. For\nfine-tuning, the PEFT/LoRA based approach was used. In the study, the model was\nfine-tuned for the following tasks: analysing a text on revealing\ndisinformation and propaganda narratives, fact checking, fake news detection,\nmanipulation analytics, extracting named entities with their sentiments. The\nobtained results show that the fine-tuned Llama 2 model can perform a deep\nanalysis of texts and reveal complex styles and narratives. Extracted\nsentiments for named entities can be considered as predictive features in\nsupervised machine learning models.\n","authors":["Bohdan M. 
Pavlyshenko"],"pdf_url":"https://arxiv.org/pdf/2309.04704v1.pdf","comment":null}],"Multimedia":[{"id":"http://arxiv.org/abs/2308.05734v2","updated":"2023-09-09T15:33:19Z","published":"2023-08-10T17:55:13Z","title":"AudioLDM 2: Learning Holistic Audio Generation with Self-supervised\n Pretraining","summary":" Although audio generation shares commonalities across different types of\naudio, such as speech, music, and sound effects, designing models for each type\nrequires careful consideration of specific objectives and biases that can\nsignificantly differ from those of other types. To bring us closer to a unified\nperspective of audio generation, this paper proposes a framework that utilizes\nthe same learning method for speech, music, and sound effect generation. Our\nframework introduces a general representation of audio, called \"language of\naudio\" (LOA). Any audio can be translated into LOA based on AudioMAE, a\nself-supervised pre-trained representation learning model. In the generation\nprocess, we translate any modalities into LOA by using a GPT-2 model, and we\nperform self-supervised audio generation learning with a latent diffusion model\nconditioned on LOA. The proposed framework naturally brings advantages such as\nin-context learning abilities and reusable self-supervised pretrained AudioMAE\nand latent diffusion models. Experiments on the major benchmarks of\ntext-to-audio, text-to-music, and text-to-speech demonstrate state-of-the-art\nor competitive performance against previous approaches. Our code, pretrained\nmodel, and demo are available at https://audioldm.github.io/audioldm2.\n","authors":["Haohe Liu","Qiao Tian","Yi Yuan","Xubo Liu","Xinhao Mei","Qiuqiang Kong","Yuping Wang","Wenwu Wang","Yuxuan Wang","Mark D. Plumbley"],"pdf_url":"https://arxiv.org/pdf/2308.05734v2.pdf","comment":"AudioLDM 2 project page is https://audioldm.github.io/audioldm2"},{"id":"http://arxiv.org/abs/2301.12503v3","updated":"2023-09-09T15:27:58Z","published":"2023-01-29T17:48:17Z","title":"AudioLDM: Text-to-Audio Generation with Latent Diffusion Models","summary":" Text-to-audio (TTA) system has recently gained attention for its ability to\nsynthesize general audio based on text descriptions. However, previous studies\nin TTA have limited generation quality with high computational costs. In this\nstudy, we propose AudioLDM, a TTA system that is built on a latent space to\nlearn the continuous audio representations from contrastive language-audio\npretraining (CLAP) latents. The pretrained CLAP models enable us to train LDMs\nwith audio embedding while providing text embedding as a condition during\nsampling. By learning the latent representations of audio signals and their\ncompositions without modeling the cross-modal relationship, AudioLDM is\nadvantageous in both generation quality and computational efficiency. Trained\non AudioCaps with a single GPU, AudioLDM achieves state-of-the-art TTA\nperformance measured by both objective and subjective metrics (e.g., frechet\ndistance). Moreover, AudioLDM is the first TTA system that enables various\ntext-guided audio manipulations (e.g., style transfer) in a zero-shot fashion.\nOur implementation and demos are available at https://audioldm.github.io.\n","authors":["Haohe Liu","Zehua Chen","Yi Yuan","Xinhao Mei","Xubo Liu","Danilo Mandic","Wenwu Wang","Mark D. Plumbley"],"pdf_url":"https://arxiv.org/pdf/2301.12503v3.pdf","comment":"Accepted by ICML 2023. Demo and implementation at\n https://audioldm.github.io. 
Evaluation toolbox at\n https://github.com/haoheliu/audioldm_eval"},{"id":"http://arxiv.org/abs/2309.04734v1","updated":"2023-09-09T09:41:36Z","published":"2023-09-09T09:41:36Z","title":"Towards Better Multi-modal Keyphrase Generation via Visual Entity\n Enhancement and Multi-granularity Image Noise Filtering","summary":" Multi-modal keyphrase generation aims to produce a set of keyphrases that\nrepresent the core points of the input text-image pair. In this regard,\ndominant methods mainly focus on multi-modal fusion for keyphrase generation.\nNevertheless, there are still two main drawbacks: 1) only a limited number of\nsources, such as image captions, can be utilized to provide auxiliary\ninformation. However, they may not be sufficient for the subsequent keyphrase\ngeneration. 2) the input text and image are often not perfectly matched, and\nthus the image may introduce noise into the model. To address these\nlimitations, in this paper, we propose a novel multi-modal keyphrase generation\nmodel, which not only enriches the model input with external knowledge, but\nalso effectively filters image noise. First, we introduce external visual\nentities of the image as the supplementary input to the model, which benefits\nthe cross-modal semantic alignment for keyphrase generation. Second, we\nsimultaneously calculate an image-text matching score and image region-text\ncorrelation scores to perform multi-granularity image noise filtering.\nParticularly, we introduce the correlation scores between image regions and\nground-truth keyphrases to refine the calculation of the previously-mentioned\ncorrelation scores. To demonstrate the effectiveness of our model, we conduct\nseveral groups of experiments on the benchmark dataset.\n Experimental results and in-depth analyses show that our model achieves the\nstate-of-the-art performance. Our code is available on\nhttps://github.com/DeepLearnXMU/MM-MKP.\n","authors":["Yifan Dong","Suhang Wu","Fandong Meng","Jie Zhou","Xiaoli Wang","Jianxin Lin","Jinsong Su"],"pdf_url":"https://arxiv.org/pdf/2309.04734v1.pdf","comment":"Accepted In Proceedings of the 31st ACM International Conference on\n Multimedia (MM' 23)"}]},"2023-09-12T00:00:00Z":{"Computation and Language":[{"id":"http://arxiv.org/abs/2212.04634v3","updated":"2023-09-12T17:38:30Z","published":"2022-12-09T02:19:07Z","title":"Open-world Story Generation with Structured Knowledge Enhancement: A\n Comprehensive Survey","summary":" Storytelling and narrative are fundamental to human experience, intertwined\nwith our social and cultural engagement. As such, researchers have long\nattempted to create systems that can generate stories automatically. In recent\nyears, powered by deep learning and massive data resources, automatic story\ngeneration has shown significant advances. However, considerable challenges,\nlike the need for global coherence in generated stories, still hamper\ngenerative models from reaching the same storytelling ability as human\nnarrators. To tackle these challenges, many studies seek to inject structured\nknowledge into the generation process, which is referred to as structured\nknowledge-enhanced story generation. Incorporating external knowledge can\nenhance the logical coherence among story events, achieve better knowledge\ngrounding, and alleviate over-generalization and repetition problems in\nstories. 
This survey provides the latest and comprehensive review of this\nresearch field: (i) we present a systematic taxonomy regarding how existing\nmethods integrate structured knowledge into story generation; (ii) we summarize\ninvolved story corpora, structured knowledge datasets, and evaluation metrics;\n(iii) we give multidimensional insights into the challenges of\nknowledge-enhanced story generation and cast light on promising directions for\nfuture study.\n","authors":["Yuxin Wang","Jieru Lin","Zhiwei Yu","Wei Hu","Börje F. Karlsson"],"pdf_url":"https://arxiv.org/pdf/2212.04634v3.pdf","comment":"Accepted in Neurocomputing"},{"id":"http://arxiv.org/abs/2309.04842v2","updated":"2023-09-12T16:46:26Z","published":"2023-09-09T17:02:33Z","title":"Leveraging Large Language Models for Exploiting ASR Uncertainty","summary":" While large language models excel in a variety of natural language processing\n(NLP) tasks, to perform well on spoken language understanding (SLU) tasks, they\nmust either rely on off-the-shelf automatic speech recognition (ASR) systems\nfor transcription, or be equipped with an in-built speech modality. This work\nfocuses on the former scenario, where LLM's accuracy on SLU tasks is\nconstrained by the accuracy of a fixed ASR system on the spoken input.\nSpecifically, we tackle speech-intent classification task, where a high\nword-error-rate can limit the LLM's ability to understand the spoken intent.\nInstead of chasing a high accuracy by designing complex or specialized\narchitectures regardless of deployment costs, we seek to answer how far we can\ngo without substantially changing the underlying ASR and LLM, which can\npotentially be shared by multiple unrelated tasks. To this end, we propose\nprompting the LLM with an n-best list of ASR hypotheses instead of only the\nerror-prone 1-best hypothesis. We explore prompt-engineering to explain the\nconcept of n-best lists to the LLM; followed by the finetuning of Low-Rank\nAdapters on the downstream tasks. Our approach using n-best lists proves to be\neffective on a device-directed speech detection task as well as on a keyword\nspotting task, where systems using n-best list prompts outperform those using\n1-best ASR hypothesis; thus paving the way for an efficient method to exploit\nASR uncertainty via LLMs for speech-based applications.\n","authors":["Pranay Dighe","Yi Su","Shangshang Zheng","Yunshu Liu","Vineet Garg","Xiaochuan Niu","Ahmed Tewfik"],"pdf_url":"https://arxiv.org/pdf/2309.04842v2.pdf","comment":"Added references"},{"id":"http://arxiv.org/abs/2303.13592v4","updated":"2023-09-12T16:35:30Z","published":"2023-03-23T18:16:30Z","title":"Prompting Multilingual Large Language Models to Generate Code-Mixed\n Texts: The Case of South East Asian Languages","summary":" While code-mixing is a common linguistic practice in many parts of the world,\ncollecting high-quality and low-cost code-mixed data remains a challenge for\nnatural language processing (NLP) research. The recent proliferation of Large\nLanguage Models (LLMs) compels one to ask: how capable are these systems in\ngenerating code-mixed data? In this paper, we explore prompting multilingual\nLLMs in a zero-shot manner to generate code-mixed data for seven languages in\nSouth East Asia (SEA), namely Indonesian, Malay, Chinese, Tagalog, Vietnamese,\nTamil, and Singlish. We find that publicly available multilingual\ninstruction-tuned models such as BLOOMZ and Flan-T5-XXL are incapable of\nproducing texts with phrases or clauses from different languages. 
ChatGPT\nexhibits inconsistent capabilities in generating code-mixed texts, wherein its\nperformance varies depending on the prompt template and language pairing. For\ninstance, ChatGPT generates fluent and natural Singlish texts (an English-based\ncreole spoken in Singapore), but for English-Tamil language pair, the system\nmostly produces grammatically incorrect or semantically meaningless utterances.\nFurthermore, it may erroneously introduce languages not specified in the\nprompt. Based on our investigation, existing multilingual LLMs exhibit a wide\nrange of proficiency in code-mixed data generation for SEA languages. As such,\nwe advise against using LLMs in this context without extensive human checks.\n","authors":["Zheng-Xin Yong","Ruochen Zhang","Jessica Zosa Forde","Skyler Wang","Arjun Subramonian","Holy Lovenia","Samuel Cahyawijaya","Genta Indra Winata","Lintang Sutawika","Jan Christian Blaise Cruz","Yin Lin Tan","Long Phan","Rowena Garcia","Thamar Solorio","Alham Fikri Aji"],"pdf_url":"https://arxiv.org/pdf/2303.13592v4.pdf","comment":"Updating Authors"},{"id":"http://arxiv.org/abs/2308.01936v2","updated":"2023-09-12T16:33:15Z","published":"2023-08-02T21:13:38Z","title":"Why Do We Need Neuro-symbolic AI to Model Pragmatic Analogies?","summary":" A hallmark of intelligence is the ability to use a familiar domain to make\ninferences about a less familiar domain, known as analogical reasoning. In this\narticle, we delve into the performance of Large Language Models (LLMs) in\ndealing with progressively complex analogies expressed in unstructured text. We\ndiscuss analogies at four distinct levels of complexity: lexical analogies,\nsyntactic analogies, semantic analogies, and pragmatic analogies. As the\nanalogies become more complex, they require increasingly extensive, diverse\nknowledge beyond the textual content, unlikely to be found in the lexical\nco-occurrence statistics that power LLMs. To address this, we discuss the\nnecessity of employing Neuro-symbolic AI techniques that combine statistical\nand symbolic AI, informing the representation of unstructured text to highlight\nand augment relevant content, provide abstraction and guide the mapping\nprocess. Our knowledge-informed approach maintains the efficiency of LLMs while\npreserving the ability to explain analogies for pedagogical applications.\n","authors":["Thilini Wijesiriwardene","Amit Sheth","Valerie L. Shalin","Amitava Das"],"pdf_url":"https://arxiv.org/pdf/2308.01936v2.pdf","comment":"12 pages 3 figures"},{"id":"http://arxiv.org/abs/2309.06365v1","updated":"2023-09-12T16:28:36Z","published":"2023-09-12T16:28:36Z","title":"Cited Text Spans for Citation Text Generation","summary":" Automatic related work generation must ground their outputs to the content of\nthe cited papers to avoid non-factual hallucinations, but due to the length of\nscientific documents, existing abstractive approaches have conditioned only on\nthe cited paper \\textit{abstracts}. We demonstrate that the abstract is not\nalways the most appropriate input for citation generation and that models\ntrained in this way learn to hallucinate. We propose to condition instead on\nthe \\textit{cited text span} (CTS) as an alternative to the abstract. 
Because\nmanual CTS annotation is extremely time- and labor-intensive, we experiment\nwith automatic, ROUGE-based labeling of candidate CTS sentences, achieving\nsufficiently strong performance to substitute for expensive human annotations,\nand we propose a human-in-the-loop, keyword-based CTS retrieval approach that\nmakes generating citation texts grounded in the full text of cited papers both\npromising and practical.\n","authors":["Xiangci Li","Yi-Hui Lee","Jessica Ouyang"],"pdf_url":"https://arxiv.org/pdf/2309.06365v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.06363v1","updated":"2023-09-12T16:27:18Z","published":"2023-09-12T16:27:18Z","title":"Learning to Predict Concept Ordering for Common Sense Generation","summary":" Prior work has shown that the ordering in which concepts are shown to a\ncommonsense generator plays an important role, affecting the quality of the\ngenerated sentence. However, it remains a challenge to determine the optimal\nordering of a given set of concepts such that a natural sentence covering all\nthe concepts could be generated from a pretrained generator. To understand the\nrelationship between the ordering of the input concepts and the quality of the\ngenerated sentences, we conduct a systematic study considering multiple\nlanguage models (LMs) and concept ordering strategies. We find that BART-large\nmodel consistently outperforms all other LMs considered in this study when\nfine-tuned using the ordering of concepts as they appear in CommonGen training\ndata as measured using multiple evaluation metrics. Moreover, the larger\nGPT3-based large language models (LLMs) variants do not necessarily outperform\nmuch smaller LMs on this task, even when fine-tuned on task-specific training\ndata. Interestingly, human annotators significantly reorder input concept sets\nwhen manually writing sentences covering those concepts, and this ordering\nprovides the best sentence generations independently of the LM used for the\ngeneration, outperforming a probabilistic concept ordering baseline\n","authors":["Tianhui Zhang","Danushka Bollegala","Bei Peng"],"pdf_url":"https://arxiv.org/pdf/2309.06363v1.pdf","comment":"10 pages"},{"id":"http://arxiv.org/abs/2212.07919v2","updated":"2023-09-12T15:08:46Z","published":"2022-12-15T15:52:39Z","title":"ROSCOE: A Suite of Metrics for Scoring Step-by-Step Reasoning","summary":" Large language models show improved downstream task performance when prompted\nto generate step-by-step reasoning to justify their final answers. These\nreasoning steps greatly improve model interpretability and verification, but\nobjectively studying their correctness (independent of the final answer) is\ndifficult without reliable methods for automatic evaluation. We simply do not\nknow how often the stated reasoning steps actually support the final end task\npredictions. In this work, we present ROSCOE, a suite of interpretable,\nunsupervised automatic scores that improve and extend previous text generation\nevaluation metrics. To evaluate ROSCOE against baseline metrics, we design a\ntypology of reasoning errors and collect synthetic and human evaluation scores\non commonly used reasoning datasets. In contrast with existing metrics, ROSCOE\ncan measure semantic consistency, logicality, informativeness, fluency, and\nfactuality - among other traits - by leveraging properties of step-by-step\nrationales. 
We empirically verify the strength of our metrics on five human\nannotated and six programmatically perturbed diagnostics datasets - covering a\ndiverse set of tasks that require reasoning skills and show that ROSCOE can\nconsistently outperform baseline metrics.\n","authors":["Olga Golovneva","Moya Chen","Spencer Poff","Martin Corredor","Luke Zettlemoyer","Maryam Fazel-Zarandi","Asli Celikyilmaz"],"pdf_url":"https://arxiv.org/pdf/2212.07919v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2204.03592v3","updated":"2023-09-12T15:01:52Z","published":"2022-04-07T17:12:57Z","title":"Testing the limits of natural language models for predicting human\n language judgments","summary":" Neural network language models can serve as computational hypotheses about\nhow humans process language. We compared the model-human consistency of diverse\nlanguage models using a novel experimental approach: controversial sentence\npairs. For each controversial sentence pair, two language models disagree about\nwhich sentence is more likely to occur in natural text. Considering nine\nlanguage models (including n-gram, recurrent neural networks, and transformer\nmodels), we created hundreds of such controversial sentence pairs by either\nselecting sentences from a corpus or synthetically optimizing sentence pairs to\nbe highly controversial. Human subjects then provided judgments indicating for\neach pair which of the two sentences is more likely. Controversial sentence\npairs proved highly effective at revealing model failures and identifying\nmodels that aligned most closely with human judgments. The most\nhuman-consistent model tested was GPT-2, although experiments also revealed\nsignificant shortcomings of its alignment with human perception.\n","authors":["Tal Golan","Matthew Siegelman","Nikolaus Kriegeskorte","Christopher Baldassano"],"pdf_url":"https://arxiv.org/pdf/2204.03592v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.06275v1","updated":"2023-09-12T14:36:23Z","published":"2023-09-12T14:36:23Z","title":"Re-Reading Improves Reasoning in Language Models","summary":" Reasoning presents a significant and challenging issue for Large Language\nModels (LLMs). The predominant focus of research has revolved around developing\ndiverse prompting strategies to guide and structure the reasoning processes of\nLLMs. However, these approaches based on decoder-only causal language models\noften operate the input question in a single forward pass, potentially missing\nthe rich, back-and-forth interactions inherent in human reasoning. Scant\nattention has been paid to a critical dimension, i.e., the input question\nitself embedded within the prompts. In response, we introduce a deceptively\nsimple yet highly effective prompting strategy, termed question \"re-reading\".\nDrawing inspiration from human learning and problem-solving, re-reading entails\nrevisiting the question information embedded within input prompts. This\napproach aligns seamlessly with the cognitive principle of reinforcement,\nenabling LLMs to extract deeper insights, identify intricate patterns,\nestablish more nuanced connections, and ultimately enhance their reasoning\ncapabilities across various tasks. Experiments conducted on a series of\nreasoning benchmarks serve to underscore the effectiveness and generality of\nour method. 
Moreover, our findings demonstrate that our approach seamlessly\nintegrates with various language models, though-eliciting prompting methods,\nand ensemble techniques, further underscoring its versatility and compatibility\nin the realm of LLMs.\n","authors":["Xiaohan Xu","Chongyang Tao","Tao Shen","Can Xu","Hongbo Xu","Guodong Long","Jian-guang Lou"],"pdf_url":"https://arxiv.org/pdf/2309.06275v1.pdf","comment":"25 pages"},{"id":"http://arxiv.org/abs/2309.04663v2","updated":"2023-09-12T14:34:03Z","published":"2023-09-09T02:43:48Z","title":"FIAT: Fusing learning paradigms with Instruction-Accelerated Tuning","summary":" Learning paradigms for large language models (LLMs) currently tend to fall\nwithin either in-context learning (ICL) or full fine-tuning. Each of these\ncomes with their own trade-offs based on available data, model size, compute\ncost, ease-of-use, and final quality with neither solution performing well\nacross-the-board. In this article, we first describe ICL and fine-tuning\nparadigms in a way that highlights their natural connections. Based on these\nconnections, we propose a new learning paradigm called FIAT that fuses the best\nof these paradigms together, enabling prompt-engineered instructions and\nchain-of-thought reasoning with the very largest models while also using\nsimilar methods to perform parameter updates on a modestly-sized LLM with\nparameter-efficient tuning. We evaluate FIAT's effectiveness on a variety of\nmultilingual tasks and observe that FIAT performs better than both ICL and\nfine-tuning at scales ranging from 100-10,000 training examples. We hope that\nFIAT provides a practical way of harnessing the full potential of LLMs without\nneeding to make a hard choice between learning paradigms.\n","authors":["Xinyi Wang","John Wieting","Jonathan H. Clark"],"pdf_url":"https://arxiv.org/pdf/2309.04663v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.03378v2","updated":"2023-09-12T14:07:54Z","published":"2023-09-06T21:56:24Z","title":"RoDia: A New Dataset for Romanian Dialect Identification from Speech","summary":" Dialect identification is a critical task in speech processing and language\ntechnology, enhancing various applications such as speech recognition, speaker\nverification, and many others. While most research studies have been dedicated\nto dialect identification in widely spoken languages, limited attention has\nbeen given to dialect identification in low-resource languages, such as\nRomanian. To address this research gap, we introduce RoDia, the first dataset\nfor Romanian dialect identification from speech. The RoDia dataset includes a\nvaried compilation of speech samples from five distinct regions of Romania,\ncovering both urban and rural environments, totaling 2 hours of manually\nannotated speech data. Along with our dataset, we introduce a set of\ncompetitive models to be used as baselines for future research. The top scoring\nmodel achieves a macro F1 score of 59.83% and a micro F1 score of 62.08%,\nindicating that the task is challenging. We thus believe that RoDia is a\nvaluable resource that will stimulate research aiming to address the challenges\nof Romanian dialect identification. 
We publicly release our dataset and code at\nhttps://github.com/codrut2/RoDia.\n","authors":["Codrut Rotaru","Nicolae-Catalin Ristea","Radu Tudor Ionescu"],"pdf_url":"https://arxiv.org/pdf/2309.03378v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.06236v1","updated":"2023-09-12T13:51:29Z","published":"2023-09-12T13:51:29Z","title":"The first step is the hardest: Pitfalls of Representing and Tokenizing\n Temporal Data for Large Language Models","summary":" Large Language Models (LLMs) have demonstrated remarkable generalization\nacross diverse tasks, leading individuals to increasingly use them as personal\nassistants and universal computing engines. Nevertheless, a notable obstacle\nemerges when feeding numerical/temporal data into these models, such as data\nsourced from wearables or electronic health records. LLMs employ tokenizers in\ntheir input that break down text into smaller units. However, tokenizers are\nnot designed to represent numerical values and might struggle to understand\nrepetitive patterns and context, treating consecutive values as separate tokens\nand disregarding their temporal relationships. Here, we discuss recent works\nthat employ LLMs for human-centric tasks such as in mobile health sensing and\npresent a case study showing that popular LLMs tokenize temporal data\nincorrectly. To address that, we highlight potential solutions such as prompt\ntuning with lightweight embedding layers as well as multimodal adapters, that\ncan help bridge this \"modality gap\". While the capability of language models to\ngeneralize to other modalities with minimal or no finetuning is exciting, this\npaper underscores the fact that their outputs cannot be meaningful if they\nstumble over input nuances.\n","authors":["Dimitris Spathis","Fahim Kawsar"],"pdf_url":"https://arxiv.org/pdf/2309.06236v1.pdf","comment":"Accepted at the Generative AI for Pervasive Computing Symposium\n (GenAI4PC) at UbiComp 2023"},{"id":"http://arxiv.org/abs/2309.04198v2","updated":"2023-09-12T13:51:14Z","published":"2023-09-08T08:20:46Z","title":"The CALLA Dataset: Probing LLMs' Interactive Knowledge Acquisition from\n Chinese Medical Literature","summary":" The application of Large Language Models (LLMs) to the medical domain has\nstimulated the interest of researchers. Recent studies have focused on\nconstructing Instruction Fine-Tuning (IFT) data through medical knowledge\ngraphs to enrich the interactive medical knowledge of LLMs. However, the\nmedical literature serving as a rich source of medical knowledge remains\nunexplored. Our work introduces the CALLA dataset to probe LLMs' interactive\nknowledge acquisition from Chinese medical literature. It assesses the\nproficiency of LLMs in mastering medical knowledge through a free-dialogue\nfact-checking task. We identify a phenomenon called the ``fact-following\nresponse``, where LLMs tend to affirm facts mentioned in questions and display\na reluctance to challenge them. To eliminate the inaccurate evaluation caused\nby this phenomenon, for the golden fact, we artificially construct test data\nfrom two perspectives: one consistent with the fact and one inconsistent with\nthe fact. Drawing from the probing experiment on the CALLA dataset, we conclude\nthat IFT data highly correlated with the medical literature corpus serves as a\npotent catalyst for LLMs, enabling themselves to skillfully employ the medical\nknowledge acquired during the pre-training phase within interactive scenarios,\nenhancing accuracy. 
Furthermore, we design a framework for automatically\nconstructing IFT data based on medical literature and discuss some real-world\napplications.\n","authors":["Yanrui Du","Sendong Zhao","Muzhen Cai","Jianyu Chen","Haochun Wang","Yuhan Chen","Haoqiang Guo","Bing Qin"],"pdf_url":"https://arxiv.org/pdf/2309.04198v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.06219v1","updated":"2023-09-12T13:38:44Z","published":"2023-09-12T13:38:44Z","title":"Human Action Co-occurrence in Lifestyle Vlogs using Graph Link\n Prediction","summary":" We introduce the task of automatic human action co-occurrence identification,\ni.e., determine whether two human actions can co-occur in the same interval of\ntime. We create and make publicly available the ACE (Action Co-occurrencE)\ndataset, consisting of a large graph of ~12k co-occurring pairs of visual\nactions and their corresponding video clips. We describe graph link prediction\nmodels that leverage visual and textual information to automatically infer if\ntwo actions are co-occurring. We show that graphs are particularly well suited\nto capture relations between human actions, and the learned graph\nrepresentations are effective for our task and capture novel and relevant\ninformation across different data domains. The ACE dataset and the code\nintroduced in this paper are publicly available at\nhttps://github.com/MichiganNLP/vlog_action_co-occurrence.\n","authors":["Oana Ignat","Santiago Castro","Weiji Li","Rada Mihalcea"],"pdf_url":"https://arxiv.org/pdf/2309.06219v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.02654v2","updated":"2023-09-12T13:34:00Z","published":"2023-09-06T01:57:36Z","title":"Zero-Resource Hallucination Prevention for Large Language Models","summary":" The prevalent use of large language models (LLMs) in various domains has\ndrawn attention to the issue of \"hallucination,\" which refers to instances\nwhere LLMs generate factually inaccurate or ungrounded information. Existing\ntechniques for hallucination detection in language assistants rely on intricate\nfuzzy, specific free-language-based chain of thought (CoT) techniques or\nparameter-based methods that suffer from interpretability issues. Additionally,\nthe methods that identify hallucinations post-generation could not prevent\ntheir occurrence and suffer from inconsistent performance due to the influence\nof the instruction format and model style. In this paper, we introduce a novel\npre-detection self-evaluation technique, referred to as SELF-FAMILIARITY, which\nfocuses on evaluating the model's familiarity with the concepts present in the\ninput instruction and withholding the generation of response in case of\nunfamiliar concepts. This approach emulates the human ability to refrain from\nresponding to unfamiliar topics, thus reducing hallucinations. We validate\nSELF-FAMILIARITY across four different large language models, demonstrating\nconsistently superior performance compared to existing techniques. 
Our findings\npropose a significant shift towards preemptive strategies for hallucination\nmitigation in LLM assistants, promising improvements in reliability,\napplicability, and interpretability.\n","authors":["Junyu Luo","Cao Xiao","Fenglong Ma"],"pdf_url":"https://arxiv.org/pdf/2309.02654v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.06192v1","updated":"2023-09-12T13:01:20Z","published":"2023-09-12T13:01:20Z","title":"Improving and Evaluating the Detection of Fragmentation in News\n Recommendations with the Clustering of News Story Chains","summary":" News recommender systems play an increasingly influential role in shaping\ninformation access within democratic societies. However, tailoring\nrecommendations to users' specific interests can result in the divergence of\ninformation streams. Fragmented access to information poses challenges to the\nintegrity of the public sphere, thereby influencing democracy and public\ndiscourse. The Fragmentation metric quantifies the degree of fragmentation of\ninformation streams in news recommendations. Accurate measurement of this\nmetric requires the application of Natural Language Processing (NLP) to\nidentify distinct news events, stories, or timelines. This paper presents an\nextensive investigation of various approaches for quantifying Fragmentation in\nnews recommendations. These approaches are evaluated both intrinsically, by\nmeasuring performance on news story clustering, and extrinsically, by assessing\nthe Fragmentation scores of different simulated news recommender scenarios. Our\nfindings demonstrate that agglomerative hierarchical clustering coupled with\nSentenceBERT text representation is substantially better at detecting\nFragmentation than earlier implementations. Additionally, the analysis of\nsimulated scenarios yields valuable insights and recommendations for\nstakeholders concerning the measurement and interpretation of Fragmentation.\n","authors":["Alessandra Polimeno","Myrthe Reuver","Sanne Vrijenhoek","Antske Fokkens"],"pdf_url":"https://arxiv.org/pdf/2309.06192v1.pdf","comment":"Cite published version: Polimeno et. al., Improving and Evaluating\n the Detection of Fragmentation in News Recommendations with the Clustering of\n News Story Chains, NORMalize 2023: The First Workshop on the Normative Design\n and Evaluation of Recommender Systems, September 19, 2023, co-located with\n the ACM Conference on Recommender Systems 2023 (RecSys 2023), Singapore"},{"id":"http://arxiv.org/abs/2309.06179v1","updated":"2023-09-12T12:46:20Z","published":"2023-09-12T12:46:20Z","title":"Glancing Future for Simultaneous Machine Translation","summary":" Simultaneous machine translation (SiMT) outputs translation while reading the\nsource sentence. Unlike conventional sequence-to-sequence (seq2seq) training,\nexisting SiMT methods adopt the prefix-to-prefix (prefix2prefix) training,\nwhere the model predicts target tokens based on partial source tokens. However,\nthe prefix2prefix training diminishes the ability of the model to capture\nglobal information and introduces forced predictions due to the absence of\nessential source information. Consequently, it is crucial to bridge the gap\nbetween the prefix2prefix training and seq2seq training to enhance the\ntranslation capability of the SiMT model. In this paper, we propose a novel\nmethod that glances future in curriculum learning to achieve the transition\nfrom the seq2seq training to prefix2prefix training. 
Specifically, we gradually\nreduce the available source information from the whole sentence to the prefix\ncorresponding to that latency. Our method is applicable to a wide range of SiMT\nmethods and experiments demonstrate that our method outperforms strong\nbaselines.\n","authors":["Shoutao Guo","Shaolei Zhang","Yang Feng"],"pdf_url":"https://arxiv.org/pdf/2309.06179v1.pdf","comment":"5 pages, 4 figure, Submitted to ICASSP 2024"},{"id":"http://arxiv.org/abs/2309.06175v1","updated":"2023-09-12T12:37:37Z","published":"2023-09-12T12:37:37Z","title":"AKEM: Aligning Knowledge Base to Queries with Ensemble Model for Entity\n Recognition and Linking","summary":" This paper presents a novel approach to address the Entity Recognition and\nLinking Challenge at NLPCC 2015. The task involves extracting named entity\nmentions from short search queries and linking them to entities within a\nreference Chinese knowledge base. To tackle this problem, we first expand the\nexisting knowledge base and utilize external knowledge to identify candidate\nentities, thereby improving the recall rate. Next, we extract features from the\ncandidate entities and utilize Support Vector Regression and Multiple Additive\nRegression Tree as scoring functions to filter the results. Additionally, we\napply rules to further refine the results and enhance precision. Our method is\ncomputationally efficient and achieves an F1 score of 0.535.\n","authors":["Di Lu","Zhongping Liang","Caixia Yuan","Xiaojie Wang"],"pdf_url":"https://arxiv.org/pdf/2309.06175v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.06163v1","updated":"2023-09-12T12:18:18Z","published":"2023-09-12T12:18:18Z","title":"Overview of GUA-SPA at IberLEF 2023: Guarani-Spanish Code Switching\n Analysis","summary":" We present the first shared task for detecting and analyzing code-switching\nin Guarani and Spanish, GUA-SPA at IberLEF 2023. The challenge consisted of\nthree tasks: identifying the language of a token, NER, and a novel task of\nclassifying the way a Spanish span is used in the code-switched context. We\nannotated a corpus of 1500 texts extracted from news articles and tweets,\naround 25 thousand tokens, with the information for the tasks. Three teams took\npart in the evaluation phase, obtaining in general good results for Task 1, and\nmore mixed results for Tasks 2 and 3.\n","authors":["Luis Chiruzzo","Marvin Agüero-Torales","Gustavo Giménez-Lugo","Aldo Alvarez","Yliana Rodríguez","Santiago Góngora","Thamar Solorio"],"pdf_url":"https://arxiv.org/pdf/2309.06163v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.05557v2","updated":"2023-09-12T12:15:38Z","published":"2023-09-11T15:45:40Z","title":"An Empirical Study of NetOps Capability of Pre-Trained Large Language\n Models","summary":" Large language models (LLMs) can respond to human language queries and have\nshown powerful potential applications in network operations (NetOps). Thanks to\nthe large amount of commonsense knowledge inherent, LLMs achieve much better\ninference accuracy than traditional models and emerge with strong abilities in\ngeneralization, reasoning, and code generation. These abilities may have a\ncrucial boost to automated and intelligent NetOps. However, it remains\nunder-explored how well LLMs perform in various NetOps tasks. In this work, we\nmake a systematic assessment of the capabilities, strengths, and limitations of\nselected LLMs in the field of NetOps. 
The evaluation is conducted on a\ncollection of 5,732 questions about NetOps, encompassing 26 publicly available\ngeneral-domain LLMs, including ChatGPT, LLaMA, Falcon, etc. We also finetune\nsome of these LLMs with our collected NetOps corpus and evaluate the resulting\nmodels. The evaluation method follows the widely adopted benchmarks for\ngeneral-domain LLMs, combined with Chain-of-Thought Prompts and\nRetrieval-Augmented Generation. The results show that only GPT-4 achieves high\naccuracy equivalent to passing the NetOps certification exam for humans, while\nall the other LLMs have much lower accuracy. However, some open models like\nLLaMA 2 still demonstrate significant potential. Furthermore, we evaluate the\nimpact of factors such as model parameters, prompt engineering, instruction\nfine-tuning etc. This work shall be treated as the initial effort to systematic\nevaluation of LLMs in NetOps, and a more rigorous study is required for\nproduction use. The evaluation code and dataset will be released to benefit\nfuture research.\n","authors":["Yukai Miao","Yu Bai","Li Chen","Dan Li","Haifeng Sun","Xizheng Wang","Ziqiu Luo","Dapeng Sun","Xiuting Xu","Qi Zhang","Chao Xiang","Xinchi Li"],"pdf_url":"https://arxiv.org/pdf/2309.05557v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.06135v1","updated":"2023-09-12T11:19:36Z","published":"2023-09-12T11:19:36Z","title":"Prompting4Debugging: Red-Teaming Text-to-Image Diffusion Models by\n Finding Problematic Prompts","summary":" Text-to-image diffusion models, e.g. Stable Diffusion (SD), lately have shown\nremarkable ability in high-quality content generation, and become one of the\nrepresentatives for the recent wave of transformative AI. Nevertheless, such\nadvance comes with an intensifying concern about the misuse of this generative\ntechnology, especially for producing copyrighted or NSFW (i.e. not safe for\nwork) images. Although efforts have been made to filter inappropriate\nimages/prompts or remove undesirable concepts/styles via model fine-tuning, the\nreliability of these safety mechanisms against diversified problematic prompts\nremains largely unexplored. In this work, we propose Prompting4Debugging (P4D)\nas a debugging and red-teaming tool that automatically finds problematic\nprompts for diffusion models to test the reliability of a deployed safety\nmechanism. We demonstrate the efficacy of our P4D tool in uncovering new\nvulnerabilities of SD models with safety mechanisms. Particularly, our result\nshows that around half of prompts in existing safe prompting benchmarks which\nwere originally considered \"safe\" can actually be manipulated to bypass many\ndeployed safety mechanisms, including concept removal, negative prompt, and\nsafety guidance. Our findings suggest that, without comprehensive testing, the\nevaluations on limited safe prompting benchmarks can lead to a false sense of\nsafety for text-to-image models.\n","authors":["Zhi-Yi Chin","Chieh-Ming Jiang","Ching-Chun Huang","Pin-Yu Chen","Wei-Chen Chiu"],"pdf_url":"https://arxiv.org/pdf/2309.06135v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.06132v1","updated":"2023-09-12T11:18:29Z","published":"2023-09-12T11:18:29Z","title":"Measuring vagueness and subjectivity in texts: from symbolic to neural\n VAGO","summary":" We present a hybrid approach to the automated measurement of vagueness and\nsubjectivity in texts. We first introduce the expert system VAGO, we illustrate\nit on a small benchmark of fact vs. 
opinion sentences, and then test it on the\nlarger French press corpus FreSaDa to confirm the higher prevalence of\nsubjective markers in satirical vs. regular texts. We then build a neural clone\nof VAGO, based on a BERT-like architecture, trained on the symbolic VAGO scores\nobtained on FreSaDa. Using explainability tools (LIME), we show the interest of\nthis neural version for the enrichment of the lexicons of the symbolic version,\nand for the production of versions in other languages.\n","authors":["Benjamin Icard","Vincent Claveau","Ghislain Atemezing","Paul Égré"],"pdf_url":"https://arxiv.org/pdf/2309.06132v1.pdf","comment":"Paper to appear in the Proceedings of the 2023 IEEE International\n Conference on Web Intelligence and Intelligent Agent Technology (WI-IAT)"},{"id":"http://arxiv.org/abs/2309.06131v1","updated":"2023-09-12T11:17:42Z","published":"2023-09-12T11:17:42Z","title":"Annotating Data for Fine-Tuning a Neural Ranker? Current Active Learning\n Strategies are not Better than Random Selection","summary":" Search methods based on Pretrained Language Models (PLM) have demonstrated\ngreat effectiveness gains compared to statistical and early neural ranking\nmodels. However, fine-tuning PLM-based rankers requires a great amount of\nannotated training data. Annotating data involves a large manual effort and\nthus is expensive, especially in domain specific tasks. In this paper we\ninvestigate fine-tuning PLM-based rankers under limited training data and\nbudget. We investigate two scenarios: fine-tuning a ranker from scratch, and\ndomain adaptation starting with a ranker already fine-tuned on general data,\nand continuing fine-tuning on a target dataset. We observe a great variability\nin effectiveness when fine-tuning on different randomly selected subsets of\ntraining data. This suggests that it is possible to achieve effectiveness gains\nby actively selecting a subset of the training data that has the most positive\neffect on the rankers. This way, it would be possible to fine-tune effective\nPLM rankers at a reduced annotation budget. To investigate this, we adapt\nexisting Active Learning (AL) strategies to the task of fine-tuning PLM rankers\nand investigate their effectiveness, also considering annotation and\ncomputational costs. Our extensive analysis shows that AL strategies do not\nsignificantly outperform random selection of training subsets in terms of\neffectiveness. We further find that gains provided by AL strategies come at the\nexpense of more assessments (thus higher annotation costs) and AL strategies\nunderperform random selection when comparing effectiveness given a fixed\nannotation cost. Our results highlight that ``optimal'' subsets of training\ndata that provide high effectiveness at low annotation cost do exist, but\ncurrent mainstream AL strategies applied to PLM rankers are not capable of\nidentifying them.\n","authors":["Sophia Althammer","Guido Zuccon","Sebastian Hofstätter","Suzan Verberne","Allan Hanbury"],"pdf_url":"https://arxiv.org/pdf/2309.06131v1.pdf","comment":"Accepted at SIGIR-AP 2023"},{"id":"http://arxiv.org/abs/2309.06126v1","updated":"2023-09-12T11:02:27Z","published":"2023-09-12T11:02:27Z","title":"AstroLLaMA: Towards Specialized Foundation Models in Astronomy","summary":" Large language models excel in many human-language tasks but often falter in\nhighly specialized domains like scholarly astronomy. 
To bridge this gap, we\nintroduce AstroLLaMA, a 7-billion-parameter model fine-tuned from LLaMA-2 using\nover 300,000 astronomy abstracts from arXiv. Optimized for traditional causal\nlanguage modeling, AstroLLaMA achieves a 30% lower perplexity than Llama-2,\nshowing marked domain adaptation. Our model generates more insightful and\nscientifically relevant text completions and embedding extraction than\nstate-of-the-arts foundation models despite having significantly fewer\nparameters. AstroLLaMA serves as a robust, domain-specific model with broad\nfine-tuning potential. Its public release aims to spur astronomy-focused\nresearch, including automatic paper summarization and conversational agent\ndevelopment.\n","authors":["Tuan Dung Nguyen","Yuan-Sen Ting","Ioana Ciucă","Charlie O'Neill","Ze-Chang Sun","Maja Jabłońska","Sandor Kruk","Ernest Perkowski","Jack Miller","Jason Li","Josh Peek","Kartheik Iyer","Tomasz Różański","Pranav Khetarpal","Sharaf Zaman","David Brodrick","Sergio J. Rodríguez Méndez","Thang Bui","Alyssa Goodman","Alberto Accomazzi","Jill Naiman","Jesse Cranney","Kevin Schawinski"," UniverseTBD"],"pdf_url":"https://arxiv.org/pdf/2309.06126v1.pdf","comment":"6 pages, 3 figures, submitted to IJCNLP-AACL 2023. Comments are\n welcome. The model can be found on Hugging Face -\n https://huggingface.co/universeTBD/astrollama"},{"id":"http://arxiv.org/abs/2309.03241v2","updated":"2023-09-12T11:01:25Z","published":"2023-09-06T06:18:16Z","title":"GPT Can Solve Mathematical Problems Without a Calculator","summary":" Previous studies have typically assumed that large language models are unable\nto accurately perform arithmetic operations, particularly multiplication of >8\ndigits, and operations involving decimals and fractions, without the use of\ncalculator tools. This paper aims to challenge this misconception. With\nsufficient training data, a 2 billion-parameter language model can accurately\nperform multi-digit arithmetic operations with almost 100% accuracy without\ndata leakage, significantly surpassing GPT-4 (whose multi-digit multiplication\naccuracy is only 4.3%). We also demonstrate that our MathGLM, fine-tuned from\nGLM-10B on a dataset with additional multi-step arithmetic operations and math\nproblems described in text, achieves similar performance to GPT-4 on a\n5,000-samples Chinese math problem test set. Our code and data are public at\nhttps://github.com/THUDM/MathGLM.\n","authors":["Zhen Yang","Ming Ding","Qingsong Lv","Zhihuan Jiang","Zehai He","Yuyi Guo","Jinfeng Bai","Jie Tang"],"pdf_url":"https://arxiv.org/pdf/2309.03241v2.pdf","comment":"26pages,14figures"},{"id":"http://arxiv.org/abs/2309.06112v1","updated":"2023-09-12T10:27:39Z","published":"2023-09-12T10:27:39Z","title":"Characterizing Latent Perspectives of Media Houses Towards Public\n Figures","summary":" Media houses reporting on public figures, often come with their own biases\nstemming from their respective worldviews. A characterization of these\nunderlying patterns helps us in better understanding and interpreting news\nstories. For this, we need diverse or subjective summarizations, which may not\nbe amenable for classifying into predefined class labels. This work proposes a\nzero-shot approach for non-extractive or generative characterizations of person\nentities from a corpus using GPT-2. We use well-articulated articles from\nseveral well-known news media houses as a corpus to build a sound argument for\nthis approach. 
First, we fine-tune a GPT-2 pre-trained language model with a\ncorpus where specific person entities are characterized. Second, we further\nfine-tune this with demonstrations of person entity characterizations, created\nfrom a corpus of programmatically constructed characterizations. This twice\nfine-tuned model is primed with manual prompts consisting of entity names that\nwere not previously encountered in the second fine-tuning, to generate a simple\nsentence about the entity. The results were encouraging, when compared against\nactual characterizations from the corpus.\n","authors":["Sharath Srivatsa","Srinath Srinivasa"],"pdf_url":"https://arxiv.org/pdf/2309.06112v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.06105v1","updated":"2023-09-12T10:17:28Z","published":"2023-09-12T10:17:28Z","title":"Towards Visual Taxonomy Expansion","summary":" Taxonomy expansion task is essential in organizing the ever-increasing volume\nof new concepts into existing taxonomies. Most existing methods focus\nexclusively on using textual semantics, leading to an inability to generalize\nto unseen terms and the \"Prototypical Hypernym Problem.\" In this paper, we\npropose Visual Taxonomy Expansion (VTE), introducing visual features into the\ntaxonomy expansion task. We propose a textual hypernymy learning task and a\nvisual prototype learning task to cluster textual and visual semantics. In\naddition to the tasks on respective modalities, we introduce a hyper-proto\nconstraint that integrates textual and visual semantics to produce fine-grained\nvisual semantics. Our method is evaluated on two datasets, where we obtain\ncompelling results. Specifically, on the Chinese taxonomy dataset, our method\nsignificantly improves accuracy by 8.75 %. Additionally, our approach performs\nbetter than ChatGPT on the Chinese taxonomy dataset.\n","authors":["Tinghui Zhu","Jingping Liu","Jiaqing Liang","Haiyun Jiang","Yanghua Xiao","Zongyu Wang","Rui Xie","Yunsen Xian"],"pdf_url":"https://arxiv.org/pdf/2309.06105v1.pdf","comment":"ACMMM accepted paper"},{"id":"http://arxiv.org/abs/2309.06089v1","updated":"2023-09-12T09:37:08Z","published":"2023-09-12T09:37:08Z","title":"Measuring Catastrophic Forgetting in Cross-Lingual Transfer Paradigms:\n Exploring Tuning Strategies","summary":" The cross-lingual transfer is a promising technique to solve tasks in\nless-resourced languages. In this empirical study, we compare two fine-tuning\napproaches combined with zero-shot and full-shot learning approaches for large\nlanguage models in a cross-lingual setting. As fine-tuning strategies, we\ncompare parameter-efficient adapter methods with fine-tuning of all parameters.\nAs cross-lingual transfer strategies, we compare the intermediate-training\n(\\textit{IT}) that uses each language sequentially and cross-lingual validation\n(\\textit{CLV}) that uses a target language already in the validation phase of\nfine-tuning. We assess the success of transfer and the extent of catastrophic\nforgetting in a source language due to cross-lingual transfer, i.e., how much\npreviously acquired knowledge is lost when we learn new information in a\ndifferent language. The results on two different classification problems, hate\nspeech detection and product reviews, each containing datasets in several\nlanguages, show that the \\textit{IT} cross-lingual strategy outperforms\n\\textit{CLV} for the target language. 
Our findings indicate that, in the\nmajority of cases, the \\textit{CLV} strategy demonstrates superior retention of\nknowledge in the base language (English) compared to the \\textit{IT} strategy,\nwhen evaluating catastrophic forgetting in multiple cross-lingual transfers.\n","authors":["Boshko Koloski","Blaž Škrlj","Marko Robnik-Šikonja","Senja Pollak"],"pdf_url":"https://arxiv.org/pdf/2309.06089v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.06085v1","updated":"2023-09-12T09:31:25Z","published":"2023-09-12T09:31:25Z","title":"BHASA: A Holistic Southeast Asian Linguistic and Cultural Evaluation\n Suite for Large Language Models","summary":" The rapid development of Large Language Models (LLMs) and the emergence of\nnovel abilities with scale have necessitated the construction of holistic,\ndiverse and challenging benchmarks such as HELM and BIG-bench. However, at the\nmoment, most of these benchmarks focus only on performance in English and\nevaluations that include Southeast Asian (SEA) languages are few in number. We\ntherefore propose BHASA, a holistic linguistic and cultural evaluation suite\nfor LLMs in SEA languages. It comprises three components: (1) a NLP benchmark\ncovering eight tasks across Natural Language Understanding (NLU), Generation\n(NLG) and Reasoning (NLR) tasks, (2) LINDSEA, a linguistic diagnostic toolkit\nthat spans the gamut of linguistic phenomena including syntax, semantics and\npragmatics, and (3) a cultural diagnostics dataset that probes for both\ncultural representation and sensitivity. For this preliminary effort, we\nimplement the NLP benchmark only for Indonesian, Vietnamese, Thai and Tamil,\nand we only include Indonesian and Tamil for LINDSEA and the cultural\ndiagnostics dataset. As GPT-4 is purportedly one of the best-performing\nmultilingual LLMs at the moment, we use it as a yardstick to gauge the\ncapabilities of LLMs in the context of SEA languages. Our initial experiments\non GPT-4 with BHASA find it lacking in various aspects of linguistic\ncapabilities, cultural representation and sensitivity in the targeted SEA\nlanguages. BHASA is a work in progress and will continue to be improved and\nexpanded in the future.\n","authors":["Wei Qi Leong","Jian Gang Ngui","Yosephine Susanto","Hamsawardhini Rengarajan","Kengatharaiyer Sarveswaran","William Chandra Tjhi"],"pdf_url":"https://arxiv.org/pdf/2309.06085v1.pdf","comment":"86 pages, 7 figures"},{"id":"http://arxiv.org/abs/2309.06057v1","updated":"2023-09-12T08:52:56Z","published":"2023-09-12T08:52:56Z","title":"RAP-Gen: Retrieval-Augmented Patch Generation with CodeT5 for Automatic\n Program Repair","summary":" Automatic program repair (APR) is crucial to reduce manual debugging efforts\nfor developers and improve software reliability. While conventional\nsearch-based techniques typically rely on heuristic rules or a redundancy\nassumption to mine fix patterns, recent years have witnessed the surge of deep\nlearning (DL) based approaches to automate the program repair process in a\ndata-driven manner. However, their performance is often limited by a fixed set\nof parameters to model the highly complex search space of APR. To ease such\nburden on the parametric models, in this work, we propose a novel\nRetrieval-Augmented Patch Generation framework (RAP-Gen) by explicitly\nleveraging relevant fix patterns retrieved from a codebase of previous bug-fix\npairs. 
Specifically, we build a hybrid patch retriever to account for both\nlexical and semantic matching based on the raw source code in a\nlanguage-agnostic manner, which does not rely on any code-specific features. In\naddition, we adapt a code-aware language model CodeT5 as our foundation model\nto facilitate both patch retrieval and generation tasks in a unified manner. We\nadopt a stage-wise approach where the patch retriever first retrieves a\nrelevant external bug-fix pair to augment the buggy input for the CodeT5 patch\ngenerator, which synthesizes a ranked list of repair patch candidates. Notably,\nRAP-Gen is a generic APR framework that can flexibly integrate different patch\nretrievers and generators to repair various types of bugs. We thoroughly\nevaluate RAP-Gen on three benchmarks in two programming languages, including\nthe TFix benchmark in JavaScript, and Code Refinement and Defects4J benchmarks\nin Java, where the bug localization information may or may not be provided.\nExperimental results show that RAP-Gen significantly outperforms previous\nstate-of-the-art approaches on all benchmarks, e.g., repairing 15 more bugs on\n818 Defects4J bugs.\n","authors":["Weishi Wang","Yue Wang","Shafiq Joty","Steven C. H. Hoi"],"pdf_url":"https://arxiv.org/pdf/2309.06057v1.pdf","comment":"FSE 2023, Long paper"},{"id":"http://arxiv.org/abs/2309.06054v1","updated":"2023-09-12T08:45:25Z","published":"2023-09-12T08:45:25Z","title":"How does representation impact in-context learning: A exploration on a\n synthetic task","summary":" In-context learning, i.e., learning from in-context samples, is an impressive\nability of Transformer. However, the mechanism driving the in-context learning\nis not yet fully understood. In this study, we aim to investigate from an\nunderexplored perspective of representation learning. The representation is\nmore complex for in-context learning senario, where the representation can be\nimpacted by both model weights and in-context samples. We refer the above two\nconceptually aspects of representation as in-weight component and in-context\ncomponent, respectively. To study how the two components affect in-context\nlearning capabilities, we construct a novel synthetic task, making it possible\nto device two probes, in-weights probe and in-context probe, to evaluate the\ntwo components, respectively. We demonstrate that the goodness of in-context\ncomponent is highly related to the in-context learning performance, which\nindicates the entanglement between in-context learning and representation\nlearning. Furthermore, we find that a good in-weights component can actually\nbenefit the learning of the in-context component, indicating that in-weights\nlearning should be the foundation of in-context learning. To further understand\nthe the in-context learning mechanism and importance of the in-weights\ncomponent, we proof by construction that a simple Transformer, which uses\npattern matching and copy-past mechanism to perform in-context learning, can\nmatch the in-context learning performance with more complex, best tuned\nTransformer under the perfect in-weights component assumption. 
In short, those\ndiscoveries from representation learning perspective shed light on new\napproaches to improve the in-context capacity.\n","authors":["Jingwen Fu","Tao Yang","Yuwang Wang","Yan Lu","Nanning Zheng"],"pdf_url":"https://arxiv.org/pdf/2309.06054v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.04213v2","updated":"2023-09-12T07:19:22Z","published":"2023-09-08T08:54:55Z","title":"UQ at #SMM4H 2023: ALEX for Public Health Analysis with Social Media","summary":" As social media becomes increasingly popular, more and more activities\nrelated to public health emerge. Current techniques for public health analysis\ninvolve popular models such as BERT and large language models (LLMs). However,\nthe costs of training in-domain LLMs for public health are especially\nexpensive. Furthermore, such kinds of in-domain datasets from social media are\ngenerally imbalanced. To tackle these challenges, the data imbalance issue can\nbe overcome by data augmentation and balanced training. Moreover, the ability\nof the LLMs can be effectively utilized by prompting the model properly. In\nthis paper, a novel ALEX framework is proposed to improve the performance of\npublic health analysis on social media by adopting an LLMs explanation\nmechanism. Results show that our ALEX model got the best performance among all\nsubmissions in both Task 2 and Task 4 with a high score in Task 1 in Social\nMedia Mining for Health 2023 (SMM4H)[1]. Our code has been released at https://\ngithub.com/YanJiangJerry/ALEX.\n","authors":["Yan Jiang","Ruihong Qiu","Yi Zhang","Zi Huang"],"pdf_url":"https://arxiv.org/pdf/2309.04213v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.06009v1","updated":"2023-09-12T07:08:22Z","published":"2023-09-12T07:08:22Z","title":"Content Reduction, Surprisal and Information Density Estimation for Long\n Documents","summary":" Many computational linguistic methods have been proposed to study the\ninformation content of languages. We consider two interesting research\nquestions: 1) how is information distributed over long documents, and 2) how\ndoes content reduction, such as token selection and text summarization, affect\nthe information density in long documents. We present four criteria for\ninformation density estimation for long documents, including surprisal,\nentropy, uniform information density, and lexical density. Among those\ncriteria, the first three adopt the measures from information theory. We\npropose an attention-based word selection method for clinical notes and study\nmachine summarization for multiple-domain documents. Our findings reveal the\nsystematic difference in information density of long text in various domains.\nEmpirical results on automated medical coding from long clinical notes show the\neffectiveness of the attention-based word selection method.\n","authors":["Shaoxiong Ji","Wei Sun","Pekka Marttinen"],"pdf_url":"https://arxiv.org/pdf/2309.06009v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2305.10652v3","updated":"2023-09-12T06:47:52Z","published":"2023-05-18T02:19:05Z","title":"Speech Separation based on Contrastive Learning and Deep Modularization","summary":" The current monaural state of the art tools for speech separation relies on\nsupervised learning. This means that they must deal with permutation problem,\nthey are impacted by the mismatch on the number of speakers used in training\nand inference. Moreover, their performance heavily relies on the presence of\nhigh-quality labelled data. 
These problems can be effectively addressed by\nemploying a fully unsupervised technique for speech separation. In this paper,\nwe use contrastive learning to establish the representations of frames then use\nthe learned representations in the downstream deep modularization task.\nConcretely, we demonstrate experimentally that in speech separation, different\nframes of a speaker can be viewed as augmentations of a given hidden standard\nframe of that speaker. The frames of a speaker contain enough prosodic\ninformation overlap which is key in speech separation. Based on this, we\nimplement a self-supervised learning to learn to minimize the distance between\nframes belonging to a given speaker. The learned representations are used in a\ndownstream deep modularization task to cluster frames based on speaker\nidentity. Evaluation of the developed technique on WSJ0-2mix and WSJ0-3mix\nshows that the technique attains SI-SNRi and SDRi of 20.8 and 21.0 respectively\nin WSJ0-2mix. In WSJ0-3mix, it attains SI-SNRi and SDRi of 20.7 and 20.7\nrespectively in WSJ0-2mix. Its greatest strength being that as the number of\nspeakers increase, its performance does not degrade significantly.\n","authors":["Peter Ochieng"],"pdf_url":"https://arxiv.org/pdf/2305.10652v3.pdf","comment":"arXiv admin note: substantial text overlap with arXiv:2212.00369"},{"id":"http://arxiv.org/abs/2309.05973v1","updated":"2023-09-12T05:51:56Z","published":"2023-09-12T05:51:56Z","title":"Circuit Breaking: Removing Model Behaviors with Targeted Ablation","summary":" Language models often exhibit behaviors that improve performance on a\npre-training objective but harm performance on downstream tasks. We propose a\nnovel approach to removing undesirable behaviors by ablating a small number of\ncausal pathways between model components, with the intention of disabling the\ncomputational circuit responsible for the bad behavior. Given a small dataset\nof inputs where the model behaves poorly, we learn to ablate a small number of\nimportant causal pathways. In the setting of reducing GPT-2 toxic language\ngeneration, we find ablating just 12 of the 11.6K causal edges mitigates toxic\ngeneration with minimal degradation of performance on other inputs.\n","authors":["Maximilian Li","Xander Davies","Max Nadeau"],"pdf_url":"https://arxiv.org/pdf/2309.05973v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.05961v1","updated":"2023-09-12T05:03:28Z","published":"2023-09-12T05:03:28Z","title":"Evaluating the Ebb and Flow: An In-depth Analysis of Question-Answering\n Trends across Diverse Platforms","summary":" Community Question Answering (CQA) platforms steadily gain popularity as they\nprovide users with fast responses to their queries. The swiftness of these\nresponses is contingent on a mixture of query-specific and user-related\nelements. This paper scrutinizes these contributing factors within the context\nof six highly popular CQA platforms, identified through their standout\nanswering speed. 
Our investigation reveals a correlation between the time taken\nto yield the first response to a question and several variables: the metadata,\nthe formulation of the questions, and the level of interaction among users.\nAdditionally, by employing conventional machine learning models to analyze\nthese metadata and patterns of user interaction, we endeavor to predict which\nqueries will receive their initial responses promptly.\n","authors":["Rima Hazra","Agnik Saha","Somnath Banerjee","Animesh Mukherjee"],"pdf_url":"https://arxiv.org/pdf/2309.05961v1.pdf","comment":"Under review"},{"id":"http://arxiv.org/abs/2309.05958v1","updated":"2023-09-12T04:49:39Z","published":"2023-09-12T04:49:39Z","title":"The Moral Machine Experiment on Large Language Models","summary":" As large language models (LLMs) become more deeply integrated into various\nsectors, understanding how they make moral judgments has become crucial,\nparticularly in the realm of autonomous driving. This study utilized the Moral\nMachine framework to investigate the ethical decision-making tendencies of\nprominent LLMs, including GPT-3.5, GPT-4, PaLM 2, and Llama 2, comparing their\nresponses to human preferences. While LLMs' and humans' preferences such as\nprioritizing humans over pets and favoring saving more lives are broadly\naligned, PaLM 2 and Llama 2, especially, evidence distinct deviations.\nAdditionally, despite the qualitative similarities between the LLM and human\npreferences, there are significant quantitative disparities, suggesting that\nLLMs might lean toward more uncompromising decisions, compared to the milder\ninclinations of humans. These insights elucidate the ethical frameworks of LLMs\nand their potential implications for autonomous driving.\n","authors":["Kazuhiro Takemoto"],"pdf_url":"https://arxiv.org/pdf/2309.05958v1.pdf","comment":"12 pages, 2 Figures"},{"id":"http://arxiv.org/abs/2308.16349v2","updated":"2023-09-12T04:37:37Z","published":"2023-08-30T22:50:32Z","title":"Affective Visual Dialog: A Large-Scale Benchmark for Emotional Reasoning\n Based on Visually Grounded Conversations","summary":" We introduce Affective Visual Dialog, an emotion explanation and reasoning\ntask as a testbed for research on understanding the formation of emotions in\nvisually grounded conversations. The task involves three skills: (1)\nDialog-based Question Answering (2) Dialog-based Emotion Prediction and (3)\nAffective emotion explanation generation based on the dialog. Our key\ncontribution is the collection of a large-scale dataset, dubbed AffectVisDial,\nconsisting of 50K 10-turn visually grounded dialogs as well as concluding\nemotion attributions and dialog-informed textual emotion explanations,\nresulting in a total of 27,180 working hours. We explain our design decisions\nin collecting the dataset and introduce the questioner and answerer tasks that\nare associated with the participants in the conversation. We train and\ndemonstrate solid Affective Visual Dialog baselines adapted from\nstate-of-the-art models. Remarkably, the responses generated by our models show\npromising emotional reasoning abilities in response to visually grounded\nconversations. 
Our project page is available at\nhttps://affective-visual-dialog.github.io.\n","authors":["Kilichbek Haydarov","Xiaoqian Shen","Avinash Madasu","Mahmoud Salem","Li-Jia Li","Gamaleldin Elsayed","Mohamed Elhoseiny"],"pdf_url":"https://arxiv.org/pdf/2308.16349v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.04951v2","updated":"2023-09-12T04:19:49Z","published":"2023-09-10T07:43:42Z","title":"Multi-document Summarization: A Comparative Evaluation","summary":" This paper is aimed at evaluating state-of-the-art models for Multi-document\nSummarization (MDS) on different types of datasets in various domains and\ninvestigating the limitations of existing models to determine future research\ndirections. To address this gap, we conducted an extensive literature review to\nidentify state-of-the-art models and datasets. We analyzed the performance of\nPRIMERA and PEGASUS models on BigSurvey-MDS and MS$^2$ datasets, which posed\nunique challenges due to their varied domains. Our findings show that the\nGeneral-Purpose Pre-trained Model LED outperforms PRIMERA and PEGASUS on the\nMS$^2$ dataset. We used the ROUGE score as a performance metric to evaluate the\nidentified models on different datasets. Our study provides valuable insights\ninto the models' strengths and weaknesses, as well as their applicability in\ndifferent domains. This work serves as a reference for future MDS research and\ncontributes to the development of accurate and robust models which can be\nutilized on demanding datasets with academically and/or scientifically complex\ndata as well as generalized, relatively simple datasets.\n","authors":["Kushan Hewapathirana","Nisansa de Silva","C. D. Athuraliya"],"pdf_url":"https://arxiv.org/pdf/2309.04951v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.05951v1","updated":"2023-09-12T04:15:34Z","published":"2023-09-12T04:15:34Z","title":"Balanced and Explainable Social Media Analysis for Public Health with\n Large Language Models","summary":" As social media becomes increasingly popular, more and more public health\nactivities emerge, which is worth noting for pandemic monitoring and government\ndecision-making. Current techniques for public health analysis involve popular\nmodels such as BERT and large language models (LLMs). Although recent progress\nin LLMs has shown a strong ability to comprehend knowledge by being fine-tuned\non specific domain datasets, the costs of training an in-domain LLM for every\nspecific public health task are especially expensive. Furthermore, such kinds\nof in-domain datasets from social media are generally highly imbalanced, which\nwill hinder the efficiency of LLMs tuning. To tackle these challenges, the data\nimbalance issue can be overcome by sophisticated data augmentation methods for\nsocial media datasets. In addition, the ability of the LLMs can be effectively\nutilised by prompting the model properly. In light of the above discussion, in\nthis paper, a novel ALEX framework is proposed for social media analysis on\npublic health. Specifically, an augmentation pipeline is developed to resolve\nthe data imbalance issue. Furthermore, an LLMs explanation mechanism is\nproposed by prompting an LLM with the predicted results from BERT models.\nExtensive experiments conducted on three tasks at the Social Media Mining for\nHealth 2023 (SMM4H) competition with the first ranking in two tasks demonstrate\nthe superior performance of the proposed ALEX method. 
Our code has been\nreleased in https://github.com/YanJiangJerry/ALEX.\n","authors":["Yan Jiang","Ruihong Qiu","Yi Zhang","Peng-Fei Zhang"],"pdf_url":"https://arxiv.org/pdf/2309.05951v1.pdf","comment":"arXiv admin note: text overlap with arXiv:2309.04213"},{"id":"http://arxiv.org/abs/2309.05950v1","updated":"2023-09-12T04:03:41Z","published":"2023-09-12T04:03:41Z","title":"Language Models as Black-Box Optimizers for Vision-Language Models","summary":" Vision-language models (VLMs) pre-trained on web-scale datasets have\ndemonstrated remarkable capabilities across a variety of vision and multimodal\ntasks. Currently, fine-tuning methods for VLMs mainly operate in a white-box\nsetting, requiring access to model parameters for backpropagation. However,\nmany VLMs rely on proprietary data and are not open-source, which restricts the\nuse of white-box approaches for fine-tuning. Given that popular private large\nlanguage models (LLMs) like ChatGPT still offer a language-based user\ninterface, we aim to develop a novel fine-tuning approach for VLMs through\nnatural language prompts, thereby avoiding the need to access model parameters,\nfeature embeddings, or output logits. In this setup, we propose employing\nchat-based LLMs as black-box optimizers to search for the best text prompt on\nthe illustrative task of few-shot image classification using CLIP.\nSpecifically, we adopt an automatic \"hill-climbing\" procedure that converges on\nan effective prompt by evaluating the accuracy of current prompts and asking\nLLMs to refine them based on textual feedback, all within a conversational\nprocess without human-in-the-loop. In a challenging 1-shot learning setup, our\nsimple approach surpasses the white-box continuous prompting method CoOp by an\naverage of 1.5% across 11 datasets including ImageNet. Our approach also\noutperforms OpenAI's manually crafted prompts and is more efficient than other\nblack-box methods like iterative APE. Additionally, we highlight the advantage\nof conversational feedback incorporating both positive and negative prompts,\nsuggesting that LLMs can utilize the implicit \"gradient\" direction in textual\nfeedback for a more efficient search. Lastly, we find that the text prompts\ngenerated through our strategy are not only more interpretable but also\ntransfer well across different CLIP architectures in a black-box manner.\n","authors":["Samuel Yu","Shihong Liu","Zhiqiu Lin","Deepak Pathak","Deva Ramanan"],"pdf_url":"https://arxiv.org/pdf/2309.05950v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.15930v2","updated":"2023-09-12T03:41:35Z","published":"2023-08-30T10:12:39Z","title":"LLaSM: Large Language and Speech Model","summary":" Multi-modal large language models have garnered significant interest\nrecently. Though, most of the works focus on vision-language multi-modal models\nproviding strong capabilities in following vision-and-language instructions.\nHowever, we claim that speech is also an important modality through which\nhumans interact with the world. Hence, it is crucial for a general-purpose\nassistant to be able to follow multi-modal speech-and-language instructions. In\nthis work, we propose Large Language and Speech Model (LLaSM). LLaSM is an\nend-to-end trained large multi-modal speech-language model with cross-modal\nconversational abilities, capable of following speech-and-language\ninstructions. 
Our early experiments show that LLaSM demonstrates a more\nconvenient and natural way for humans to interact with artificial intelligence.\nSpecifically, we also release a large Speech Instruction Following dataset\nLLaSM-Audio-Instructions. Code and demo are available at\nhttps://github.com/LinkSoul-AI/LLaSM and\nhttps://huggingface.co/spaces/LinkSoul/LLaSM. The LLaSM-Audio-Instructions\ndataset is available at\nhttps://huggingface.co/datasets/LinkSoul/LLaSM-Audio-Instructions.\n","authors":["Yu Shu","Siwei Dong","Guangyao Chen","Wenhao Huang","Ruihua Zhang","Daochen Shi","Qiqi Xiang","Yemin Shi"],"pdf_url":"https://arxiv.org/pdf/2308.15930v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.05938v1","updated":"2023-09-12T03:27:08Z","published":"2023-09-12T03:27:08Z","title":"Answering Subjective Induction Questions on Products by Summarizing\n Multi-sources Multi-viewpoints Knowledge","summary":" This paper proposes a new task in the field of Answering Subjective Induction\nQuestion on Products (SUBJPQA). The answer to this kind of question is\nnon-unique, but can be interpreted from many perspectives. For example, the\nanswer to 'whether the phone is heavy' has a variety of different viewpoints. A\nsatisfied answer should be able to summarize these subjective opinions from\nmultiple sources and provide objective knowledge, such as the weight of a\nphone. That is quite different from the traditional QA task, in which the\nanswer to a factoid question is unique and can be found from a single data\nsource. To address this new task, we propose a three-steps method. We first\nretrieve all answer-related clues from multiple knowledge sources on facts and\nopinions. The implicit commonsense facts are also collected to supplement the\nnecessary but missing contexts. We then capture their relevance with the\nquestions by interactive attention. Next, we design a reinforcement-based\nsummarizer to aggregate all these knowledgeable clues. Based on a\ntemplate-controlled decoder, we can output a comprehensive and\nmulti-perspective answer. Due to the lack of a relevant evaluated benchmark set\nfor the new task, we construct a large-scale dataset, named SupQA, consisting\nof 48,352 samples across 15 product domains. Evaluation results show the\neffectiveness of our approach.\n","authors":["Yufeng Zhang","Meng-xiang Wang","Jianxing Yu"],"pdf_url":"https://arxiv.org/pdf/2309.05938v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.05936v1","updated":"2023-09-12T03:20:50Z","published":"2023-09-12T03:20:50Z","title":"Do PLMs Know and Understand Ontological Knowledge?","summary":" Ontological knowledge, which comprises classes and properties and their\nrelationships, is integral to world knowledge. It is significant to explore\nwhether Pretrained Language Models (PLMs) know and understand such knowledge.\nHowever, existing PLM-probing studies focus mainly on factual knowledge,\nlacking a systematic probing of ontological knowledge. In this paper, we focus\non probing whether PLMs store ontological knowledge and have a semantic\nunderstanding of the knowledge rather than rote memorization of the surface\nform. 
To probe whether PLMs know ontological knowledge, we investigate how well\nPLMs memorize: (1) types of entities; (2) hierarchical relationships among\nclasses and properties, e.g., Person is a subclass of Animal and Member of\nSports Team is a subproperty of Member of ; (3) domain and range constraints of\nproperties, e.g., the subject of Member of Sports Team should be a Person and\nthe object should be a Sports Team. To further probe whether PLMs truly\nunderstand ontological knowledge beyond memorization, we comprehensively study\nwhether they can reliably perform logical reasoning with given knowledge\naccording to ontological entailment rules. Our probing results show that PLMs\ncan memorize certain ontological knowledge and utilize implicit knowledge in\nreasoning. However, both the memorizing and reasoning performances are less\nthan perfect, indicating incomplete knowledge and understanding.\n","authors":["Weiqi Wu","Chengyue Jiang","Yong Jiang","Pengjun Xie","Kewei Tu"],"pdf_url":"https://arxiv.org/pdf/2309.05936v1.pdf","comment":"Accepted by ACL 2023 (Outstanding Paper Award)"},{"id":"http://arxiv.org/abs/2308.00264v2","updated":"2023-09-12T02:40:08Z","published":"2023-08-01T03:54:27Z","title":"Multi-Modality Multi-Loss Fusion Network","summary":" In this work we investigate the optimal selection and fusion of features\nacross multiple modalities and combine these in a neural network to improve\nemotion detection. We compare different fusion methods and examine the impact\nof multi-loss training within the multi-modality fusion network, identifying\nuseful findings relating to subnet performance. Our best model achieves\nstate-of-the-art performance for three datasets (CMU-MOSI, CMU-MOSEI and\nCH-SIMS), and outperforms the other methods in most metrics. We have found that\ntraining on multimodal features improves single modality testing and designing\nfusion methods based on dataset annotation schema enhances model performance.\nThese results suggest a roadmap towards an optimized feature selection and\nfusion approach for enhancing emotion detection in neural networks.\n","authors":["Zehui Wu","Ziwei Gong","Jaywon Koo","Julia Hirschberg"],"pdf_url":"https://arxiv.org/pdf/2308.00264v2.pdf","comment":"First two authors contributed equally to the paper"},{"id":"http://arxiv.org/abs/2309.05922v1","updated":"2023-09-12T02:34:06Z","published":"2023-09-12T02:34:06Z","title":"A Survey of Hallucination in Large Foundation Models","summary":" Hallucination in a foundation model (FM) refers to the generation of content\nthat strays from factual reality or includes fabricated information. This\nsurvey paper provides an extensive overview of recent efforts that aim to\nidentify, elucidate, and tackle the problem of hallucination, with a particular\nfocus on ``Large'' Foundation Models (LFMs). The paper classifies various types\nof hallucination phenomena that are specific to LFMs and establishes evaluation\ncriteria for assessing the extent of hallucination. It also examines existing\nstrategies for mitigating hallucination in LFMs and discusses potential\ndirections for future research in this area. 
Essentially, the paper offers a\ncomprehensive examination of the challenges and solutions related to\nhallucination in LFMs.\n","authors":["Vipula Rawte","Amit Sheth","Amitava Das"],"pdf_url":"https://arxiv.org/pdf/2309.05922v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.05920v1","updated":"2023-09-12T02:24:16Z","published":"2023-09-12T02:24:16Z","title":"SAGE: Structured Attribute Value Generation for Billion-Scale Product\n Catalogs","summary":" We introduce SAGE; a Generative LLM for inferring attribute values for\nproducts across world-wide e-Commerce catalogs. We introduce a novel\nformulation of the attribute-value prediction problem as a Seq2Seq\nsummarization task, across languages, product types and target attributes. Our\nnovel modeling approach lifts the restriction of predicting attribute values\nwithin a pre-specified set of choices, as well as, the requirement that the\nsought attribute values need to be explicitly mentioned in the text. SAGE can\ninfer attribute values even when such values are mentioned implicitly using\nperiphrastic language, or not-at-all-as is the case for common-sense defaults.\nAdditionally, SAGE is capable of predicting whether an attribute is\ninapplicable for the product at hand, or non-obtainable from the available\ninformation. SAGE is the first method able to tackle all aspects of the\nattribute-value-prediction task as they arise in practical settings in\ne-Commerce catalogs. A comprehensive set of experiments demonstrates the\neffectiveness of the proposed approach, as well as, its superiority against\nstate-of-the-art competing alternatives. Moreover, our experiments highlight\nSAGE's ability to tackle the task of predicting attribute values in zero-shot\nsetting; thereby, opening up opportunities for significantly reducing the\noverall number of labeled examples required for training.\n","authors":["Athanasios N. Nikolakopoulos","Swati Kaul","Siva Karthik Gade","Bella Dubrov","Umit Batur","Suleiman Ali Khan"],"pdf_url":"https://arxiv.org/pdf/2309.05920v1.pdf","comment":"(17 pages)"},{"id":"http://arxiv.org/abs/2309.05918v1","updated":"2023-09-12T02:14:05Z","published":"2023-09-12T02:14:05Z","title":"Stochastic LLMs do not Understand Language: Towards Symbolic,\n Explainable and Ontologically Based LLMs","summary":" In our opinion the exuberance surrounding the relative success of data-driven\nlarge language models (LLMs) is slightly misguided and for several reasons (i)\nLLMs cannot be relied upon for factual information since for LLMs all ingested\ntext (factual or non-factual) was created equal; (ii) due to their subsymbolic\nna-ture, whatever 'knowledge' these models acquire about language will always\nbe buried in billions of microfeatures (weights), none of which is meaningful\non its own; and (iii) LLMs will often fail to make the correct inferences in\nseveral linguistic contexts (e.g., nominal compounds, copredication, quantifier\nscope ambi-guities, intensional contexts. Since we believe the relative success\nof data-driven large language models (LLMs) is not a reflection on the symbolic\nvs. subsymbol-ic debate but a reflection on applying the successful strategy of\na bottom-up reverse engineering of language at scale, we suggest in this paper\napplying the effective bottom-up strategy in a symbolic setting resulting in\nsymbolic, explainable, and ontologically grounded language models.\n","authors":["Walid S. 
Saba"],"pdf_url":"https://arxiv.org/pdf/2309.05918v1.pdf","comment":"17 pages"},{"id":"http://arxiv.org/abs/2304.05406v2","updated":"2023-09-12T00:42:21Z","published":"2023-04-12T03:02:20Z","title":"Galactic ChitChat: Using Large Language Models to Converse with\n Astronomy Literature","summary":" We demonstrate the potential of the state-of-the-art OpenAI GPT-4 large\nlanguage model to engage in meaningful interactions with Astronomy papers using\nin-context prompting. To optimize for efficiency, we employ a distillation\ntechnique that effectively reduces the size of the original input paper by\n50\\%, while maintaining the paragraph structure and overall semantic integrity.\nWe then explore the model's responses using a multi-document context (ten\ndistilled documents). Our findings indicate that GPT-4 excels in the\nmulti-document domain, providing detailed answers contextualized within the\nframework of related research findings. Our results showcase the potential of\nlarge language models for the astronomical community, offering a promising\navenue for further exploration, particularly the possibility of utilizing the\nmodels for hypothesis generation.\n","authors":["Ioana Ciucă","Yuan-Sen Ting"],"pdf_url":"https://arxiv.org/pdf/2304.05406v2.pdf","comment":"3 pages, published in RNAAS"},{"id":"http://arxiv.org/abs/2212.00301v3","updated":"2023-09-12T00:29:12Z","published":"2022-12-01T06:14:57Z","title":"Learning to Select from Multiple Options","summary":" Many NLP tasks can be regarded as a selection problem from a set of options,\nsuch as classification tasks, multi-choice question answering, etc. Textual\nentailment (TE) has been shown as the state-of-the-art (SOTA) approach to\ndealing with those selection problems. TE treats input texts as premises (P),\noptions as hypotheses (H), then handles the selection problem by modeling (P,\nH) pairwise. Two limitations: first, the pairwise modeling is unaware of other\noptions, which is less intuitive since humans often determine the best options\nby comparing competing candidates; second, the inference process of pairwise TE\nis time-consuming, especially when the option space is large. To deal with the\ntwo issues, this work first proposes a contextualized TE model (Context-TE) by\nappending other k options as the context of the current (P, H) modeling.\nContext-TE is able to learn more reliable decision for the H since it considers\nvarious context. Second, we speed up Context-TE by coming up with Parallel-TE,\nwhich learns the decisions of multiple options simultaneously. Parallel-TE\nsignificantly improves the inference speed while keeping comparable performance\nwith Context-TE. Our methods are evaluated on three tasks (ultra-fine entity\ntyping, intent detection and multi-choice QA) that are typical selection\nproblems with different sizes of options. Experiments show our models set new\nSOTA performance; particularly, Parallel-TE is faster than the pairwise TE by k\ntimes in inference. Our code is publicly available at\nhttps://github.com/jiangshdd/LearningToSelect.\n","authors":["Jiangshu Du","Wenpeng Yin","Congying Xia","Philip S. 
Yu"],"pdf_url":"https://arxiv.org/pdf/2212.00301v3.pdf","comment":"Accepted by AAAI 2023"},{"id":"http://arxiv.org/abs/2309.06619v1","updated":"2023-09-12T22:22:10Z","published":"2023-09-12T22:22:10Z","title":"RT-LM: Uncertainty-Aware Resource Management for Real-Time Inference of\n Language Models","summary":" Recent advancements in language models (LMs) have gained substantial\nattentions on their capability to generate human-like responses. Though\nexhibiting a promising future for various applications such as conversation AI,\nthese LMs face deployment challenges on various devices due to their extreme\ncomputational cost and unpredictable inference latency. Such varied inference\nlatency, identified as a consequence of uncertainty intrinsic to the nature of\nlanguage, can lead to computational inefficiency and degrade the overall\nperformance of LMs, especially under high-traffic workloads. Unfortunately, the\nbandwidth of these uncertainty sources is extensive, complicating the\nprediction of latency and the effects emanating from such uncertainties. To\nunderstand and mitigate the impact of uncertainty on real-time\nresponse-demanding systems, we take the first step to comprehend, quantify and\noptimize these uncertainty-induced latency performance variations in LMs.\nSpecifically, we present RT-LM, an uncertainty-aware resource management\necosystem for real-time inference of LMs. RT-LM innovatively quantifies how\nspecific input uncertainties, adversely affect latency, often leading to an\nincreased output length. Exploiting these insights, we devise a lightweight yet\neffective method to dynamically correlate input text uncertainties with output\nlength at runtime. Utilizing this quantification as a latency heuristic, we\nintegrate the uncertainty information into a system-level scheduler which\nexplores several uncertainty-induced optimization opportunities, including\nuncertainty-aware prioritization, dynamic consolidation, and strategic CPU\noffloading. Quantitative experiments across five state-of-the-art LMs on two\nhardware platforms demonstrates that RT-LM can significantly reduce the average\nresponse time and improve throughput while incurring a rather small runtime\noverhead.\n","authors":["Yufei Li","Zexin Li","Wei Yang","Cong Liu"],"pdf_url":"https://arxiv.org/pdf/2309.06619v1.pdf","comment":"Accepted by RTSS 2023"},{"id":"http://arxiv.org/abs/2304.11520v3","updated":"2023-09-12T21:15:12Z","published":"2023-04-23T03:01:39Z","title":"Processing Natural Language on Embedded Devices: How Well Do Modern\n Models Perform?","summary":" Voice-controlled systems are becoming ubiquitous in many IoT-specific\napplications such as home/industrial automation, automotive infotainment, and\nhealthcare. While cloud-based voice services (\\eg Alexa, Siri) can leverage\nhigh-performance computing servers, some use cases (\\eg robotics, automotive\ninfotainment) may require to execute the natural language processing (NLP)\ntasks offline, often on resource-constrained embedded devices. Large language\nmodels such as BERT and its variants are primarily developed with compute-heavy\nservers in mind. Despite the great performance of BERT models across various\nNLP tasks, their large size and numerous parameters pose substantial obstacles\nto offline computation on embedded systems. Lighter replacement of such\nlanguage models (\\eg DistilBERT and TinyBERT) often sacrifice accuracy,\nparticularly for complex NLP tasks. 
Until now, it is still unclear \\ca whether\nthe state-of-the-art language models, \\viz BERT and its variants are deployable\non embedded systems with a limited processor, memory, and battery power and \\cb\nif they do, what are the ``right'' set of configurations and parameters to\nchoose for a given NLP task. This paper presents an \\textit{exploratory study\nof modern language models} under different resource constraints and accuracy\nbudgets to derive empirical observations about these resource/accuracy\ntrade-offs. In particular, we study how the four most commonly used BERT-based\nlanguage models (\\eg BERT, RoBERTa, DistilBERT, and TinyBERT) perform on\nembedded systems. We tested them on a Raspberry Pi-based robotic platform with\nthree hardware configurations and four datasets running various NLP tasks. Our\nfindings can help designers to understand the deployability and performance of\nmodern language models, especially those based on BERT architectures, thus\nsaving a lot of time wasted in trial-and-error efforts.\n","authors":["Souvika Sarkar","Mohammad Fakhruddin Babar","Md Mahadi Hassan","Monowar Hasan","Shubhra Kanti Karmaker Santu"],"pdf_url":"https://arxiv.org/pdf/2304.11520v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.06589v1","updated":"2023-09-12T20:25:22Z","published":"2023-09-12T20:25:22Z","title":"Do Generative Large Language Models need billions of parameters?","summary":" This paper presents novel systems and methodologies for the development of\nefficient large language models (LLMs). It explores the trade-offs between\nmodel size, performance, and computational resources, with the aim of\nmaximizing the efficiency of these AI systems. The research explores novel\nmethods that allow different parts of the model to share parameters, reducing\nthe total number of unique parameters required. This approach ensures that the\nmodel remains compact without sacrificing its ability to learn and represent\ncomplex language structures. This study provides valuable insights and tools\nfor creating more efficient and effective LLMs, contributing to a more\nsustainable and accessible future for AI language modeling.\n","authors":["Sia Gholami","Marwan Omar"],"pdf_url":"https://arxiv.org/pdf/2309.06589v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.05605v2","updated":"2023-09-12T20:18:20Z","published":"2023-09-11T16:39:30Z","title":"Memory Injections: Correcting Multi-Hop Reasoning Failures during\n Inference in Transformer-Based Language Models","summary":" Answering multi-hop reasoning questions requires retrieving and synthesizing\ninformation from diverse sources. Large Language Models (LLMs) struggle to\nperform such reasoning consistently. Here we propose an approach to pinpoint\nand rectify multi-hop reasoning failures through targeted memory injections on\nLLM attention heads. First, we analyze the per-layer activations of GPT-2\nmodels in response to single and multi-hop prompts. We then propose a mechanism\nthat allows users to inject pertinent prompt-specific information, which we\nrefer to as \"memories,\" at critical LLM locations during inference. By thus\nenabling the LLM to incorporate additional relevant information during\ninference, we enhance the quality of multi-hop prompt completions. 
We show\nempirically that a simple, efficient, and targeted memory injection into a key\nattention layer can often increase the probability of the desired next token in\nmulti-hop tasks, by up to 424%.\n","authors":["Mansi Sakarvadia","Aswathy Ajith","Arham Khan","Daniel Grzenda","Nathaniel Hudson","André Bauer","Kyle Chard","Ian Foster"],"pdf_url":"https://arxiv.org/pdf/2309.05605v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.06541v1","updated":"2023-09-12T19:32:45Z","published":"2023-09-12T19:32:45Z","title":"Text Encoders Lack Knowledge: Leveraging Generative LLMs for\n Domain-Specific Semantic Textual Similarity","summary":" Amidst the sharp rise in the evaluation of large language models (LLMs) on\nvarious tasks, we find that semantic textual similarity (STS) has been\nunder-explored. In this study, we show that STS can be cast as a text\ngeneration problem while maintaining strong performance on multiple STS\nbenchmarks. Additionally, we show generative LLMs significantly outperform\nexisting encoder-based STS models when characterizing the semantic similarity\nbetween two texts with complex semantic relationships dependent on world\nknowledge. We validate this claim by evaluating both generative LLMs and\nexisting encoder-based STS models on three newly collected STS challenge sets\nwhich require world knowledge in the domains of Health, Politics, and Sports.\nAll newly collected data is sourced from social media content posted after May\n2023 to ensure the performance of closed-source models like ChatGPT cannot be\ncredited to memorization. Our results show that, on average, generative LLMs\noutperform the best encoder-only baselines by an average of 22.3% on STS tasks\nrequiring world knowledge. Our results suggest generative language models with\nSTS-specific prompting strategies achieve state-of-the-art performance in\ncomplex, domain-specific STS tasks.\n","authors":["Joseph Gatto","Omar Sharif","Parker Seegmiller","Philip Bohlman","Sarah Masud Preum"],"pdf_url":"https://arxiv.org/pdf/2309.06541v1.pdf","comment":"Under review GEM@EMNLP-2023, 12 pages"},{"id":"http://arxiv.org/abs/2309.06520v1","updated":"2023-09-12T18:51:10Z","published":"2023-09-12T18:51:10Z","title":"Minimum Bayes' Risk Decoding for System Combination of Grammatical Error\n Correction Systems","summary":" For sequence-to-sequence tasks it is challenging to combine individual system\noutputs. Further, there is also often a mismatch between the decoding criterion\nand the one used for assessment. Minimum Bayes' Risk (MBR) decoding can be used\nto combine system outputs in a manner that encourages better alignment with the\nfinal assessment criterion. This paper examines MBR decoding for Grammatical\nError Correction (GEC) systems, where performance is usually evaluated in terms\nof edits and an associated F-score. Hence, we propose a novel MBR loss function\ndirectly linked to this form of criterion. Furthermore, an approach to expand\nthe possible set of candidate sentences is described. This builds on a current\nmax-voting combination scheme, as well as individual edit-level selection.\nExperiments on three popular GEC datasets and with state-of-the-art GEC systems\ndemonstrate the efficacy of the proposed MBR approach. 
Additionally, the paper\nhighlights how varying reward metrics within the MBR decoding framework can\nprovide control over precision, recall, and the F-score in combined GEC\nsystems.\n","authors":["Vyas Raina","Mark Gales"],"pdf_url":"https://arxiv.org/pdf/2309.06520v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.10475v2","updated":"2023-09-12T18:51:05Z","published":"2023-07-19T22:14:49Z","title":"Findings of Factify 2: Multimodal Fake News Detection","summary":" With social media usage growing exponentially in the past few years, fake\nnews has also become extremely prevalent. The detrimental impact of fake news\nemphasizes the need for research focused on automating the detection of false\ninformation and verifying its accuracy. In this work, we present the outcome of\nthe Factify 2 shared task, which provides a multi-modal fact verification and\nsatire news dataset, as part of the DeFactify 2 workshop at AAAI'23. The data\ncalls for a comparison based approach to the task by pairing social media\nclaims with supporting documents, with both text and image, divided into 5\nclasses based on multi-modal relations. In the second iteration of this task we\nhad over 60 participants and 9 final test-set submissions. The best\nperformances came from the use of DeBERTa for text and Swinv2 and CLIP for\nimage. The highest F1 score averaged for all five classes was 81.82%.\n","authors":["S Suryavardan","Shreyash Mishra","Megha Chakraborty","Parth Patwa","Anku Rani","Aman Chadha","Aishwarya Reganti","Amitava Das","Amit Sheth","Manoj Chinnakotla","Asif Ekbal","Srijan Kumar"],"pdf_url":"https://arxiv.org/pdf/2307.10475v2.pdf","comment":"Defactify2 @AAAI 2023"},{"id":"http://arxiv.org/abs/2309.06517v1","updated":"2023-09-12T18:47:29Z","published":"2023-09-12T18:47:29Z","title":"Overview of Memotion 3: Sentiment and Emotion Analysis of Codemixed\n Hinglish Memes","summary":" Analyzing memes on the internet has emerged as a crucial endeavor due to the\nimpact this multi-modal form of content wields in shaping online discourse.\nMemes have become a powerful tool for expressing emotions and sentiments,\npossibly even spreading hate and misinformation, through humor and sarcasm. In\nthis paper, we present the overview of the Memotion 3 shared task, as part of\nthe DeFactify 2 workshop at AAAI-23. The task released an annotated dataset of\nHindi-English code-mixed memes based on their Sentiment (Task A), Emotion (Task\nB), and Emotion intensity (Task C). Each of these is defined as an individual\ntask and the participants are ranked separately for each task. Over 50 teams\nregistered for the shared task and 5 made final submissions to the test set of\nthe Memotion 3 dataset. CLIP, BERT modifications, ViT etc. were the most\npopular models among the participants along with approaches such as\nStudent-Teacher model, Fusion, and Ensembling. 
The best final F1 score for Task\nA is 34.41, Task B is 79.77 and Task C is 59.82.\n","authors":["Shreyash Mishra","S Suryavardan","Megha Chakraborty","Parth Patwa","Anku Rani","Aman Chadha","Aishwarya Reganti","Amitava Das","Amit Sheth","Manoj Chinnakotla","Asif Ekbal","Srijan Kumar"],"pdf_url":"https://arxiv.org/pdf/2309.06517v1.pdf","comment":"Defactify2 @AAAI 2023"},{"id":"http://arxiv.org/abs/2309.06503v1","updated":"2023-09-12T18:18:23Z","published":"2023-09-12T18:18:23Z","title":"Leveraging Large Language Models and Weak Supervision for Social Media\n data annotation: an evaluation using COVID-19 self-reported vaccination\n tweets","summary":" The COVID-19 pandemic has presented significant challenges to the healthcare\nindustry and society as a whole. With the rapid development of COVID-19\nvaccines, social media platforms have become a popular medium for discussions\non vaccine-related topics. Identifying vaccine-related tweets and analyzing\nthem can provide valuable insights for public health research-ers and\npolicymakers. However, manual annotation of a large number of tweets is\ntime-consuming and expensive. In this study, we evaluate the usage of Large\nLanguage Models, in this case GPT-4 (March 23 version), and weak supervision,\nto identify COVID-19 vaccine-related tweets, with the purpose of comparing\nperformance against human annotators. We leveraged a manu-ally curated\ngold-standard dataset and used GPT-4 to provide labels without any additional\nfine-tuning or instructing, in a single-shot mode (no additional prompting).\n","authors":["Ramya Tekumalla","Juan M. Banda"],"pdf_url":"https://arxiv.org/pdf/2309.06503v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.06490v1","updated":"2023-09-12T18:03:55Z","published":"2023-09-12T18:03:55Z","title":"Leveraging Large Language Models for Automated Dialogue Analysis","summary":" Developing high-performing dialogue systems benefits from the automatic\nidentification of undesirable behaviors in system responses. However, detecting\nsuch behaviors remains challenging, as it draws on a breadth of general\nknowledge and understanding of conversational practices. Although recent\nresearch has focused on building specialized classifiers for detecting specific\ndialogue behaviors, the behavior coverage is still incomplete and there is a\nlack of testing on real-world human-bot interactions. This paper investigates\nthe ability of a state-of-the-art large language model (LLM), ChatGPT-3.5, to\nperform dialogue behavior detection for nine categories in real human-bot\ndialogues. We aim to assess whether ChatGPT can match specialized models and\napproximate human performance, thereby reducing the cost of behavior detection\ntasks. Our findings reveal that neither specialized models nor ChatGPT have yet\nachieved satisfactory results for this task, falling short of human\nperformance. Nevertheless, ChatGPT shows promising potential and often\noutperforms specialized detection models. We conclude with an in-depth\nexamination of the prevalent shortcomings of ChatGPT, offering guidance for\nfuture research to enhance LLM capabilities.\n","authors":["Sarah E. Finch","Ellie S. Paek","Jinho D. 
Choi"],"pdf_url":"https://arxiv.org/pdf/2309.06490v1.pdf","comment":"Accepted to SIGDIAL 2023"},{"id":"http://arxiv.org/abs/2309.06460v1","updated":"2023-09-12T17:44:40Z","published":"2023-09-12T17:44:40Z","title":"Widely Interpretable Semantic Representation: Frameless Meaning\n Representation for Broader Applicability","summary":" This paper presents a novel semantic representation, WISeR, that overcomes\nchallenges for Abstract Meaning Representation (AMR). Despite its strengths,\nAMR is not easily applied to languages or domains without predefined semantic\nframes, and its use of numbered arguments results in semantic role labels,\nwhich are not directly interpretable and are semantically overloaded for\nparsers. We examine the numbered arguments of predicates in AMR and convert\nthem to thematic roles that do not require reference to semantic frames. We\ncreate a new corpus of 1K English dialogue sentences annotated in both WISeR\nand AMR. WISeR shows stronger inter-annotator agreement for beginner and\nexperienced annotators, with beginners becoming proficient in WISeR annotation\nmore quickly. Finally, we train a state-of-the-art parser on the AMR 3.0 corpus\nand a WISeR corpus converted from AMR 3.0. The parser is evaluated on these\ncorpora and our dialogue corpus. The WISeR model exhibits higher accuracy than\nits AMR counterpart across the board, demonstrating that WISeR is easier for\nparsers to learn.\n","authors":["Lydia Feng","Gregor Williamson","Han He","Jinho D. Choi"],"pdf_url":"https://arxiv.org/pdf/2309.06460v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.06453v1","updated":"2023-09-12T08:16:58Z","published":"2023-09-12T08:16:58Z","title":"Narrowing the Gap between Supervised and Unsupervised Sentence\n Representation Learning with Large Language Model","summary":" Sentence Representation Learning (SRL) is a fundamental task in Natural\nLanguage Processing (NLP), with Contrastive learning of Sentence Embeddings\n(CSE) as the mainstream technique due to its superior performance. An\nintriguing phenomenon in CSE is the significant performance gap between\nsupervised and unsupervised methods, even when their sentence encoder and loss\nfunction are the same. Previous works attribute this performance gap to\ndifferences in two representation properties (alignment and uniformity).\nHowever, alignment and uniformity only measure the results, which means they\ncannot answer \"What happens during the training process that leads to the\nperformance gap?\" and \"How can the performance gap be narrowed?\". In this\npaper, we conduct empirical experiments to answer these \"What\" and \"How\"\nquestions. We first answer the \"What\" question by thoroughly comparing the\nbehavior of supervised and unsupervised CSE during their respective training\nprocesses. From the comparison, We observe a significant difference in fitting\ndifficulty. Thus, we introduce a metric, called Fitting Difficulty Increment\n(FDI), to measure the fitting difficulty gap between the evaluation dataset and\nthe held-out training dataset, and use the metric to answer the \"What\"\nquestion. Then, based on the insights gained from the \"What\" question, we\ntackle the \"How\" question by increasing the fitting difficulty of the training\ndataset. We achieve this by leveraging the In-Context Learning (ICL) capability\nof the Large Language Model (LLM) to generate data that simulates complex\npatterns. 
By utilizing the hierarchical patterns in the LLM-generated data, we\neffectively narrow the gap between supervised and unsupervised CSE.\n","authors":["Mingxin Li","Richong Zhang","Zhijie Nie","Yongyi Mao"],"pdf_url":"https://arxiv.org/pdf/2309.06453v1.pdf","comment":"work in progress"},{"id":"http://arxiv.org/abs/2309.07172v1","updated":"2023-09-12T17:01:02Z","published":"2023-09-12T17:01:02Z","title":"Exploring Large Language Models for Ontology Alignment","summary":" This work investigates the applicability of recent generative Large Language\nModels (LLMs), such as the GPT series and Flan-T5, to ontology alignment for\nidentifying concept equivalence mappings across ontologies. To test the\nzero-shot performance of Flan-T5-XXL and GPT-3.5-turbo, we leverage challenging\nsubsets from two equivalence matching datasets of the OAEI Bio-ML track, taking\ninto account concept labels and structural contexts. Preliminary findings\nsuggest that LLMs have the potential to outperform existing ontology alignment\nsystems like BERTMap, given careful framework and prompt design.\n","authors":["Yuan He","Jiaoyan Chen","Hang Dong","Ian Horrocks"],"pdf_url":"https://arxiv.org/pdf/2309.07172v1.pdf","comment":"Accepted at ISWC 2023 (Posters and Demos)"}],"Computer Vision and Pattern Recognition":[{"id":"http://arxiv.org/abs/2309.06441v1","updated":"2023-09-12T17:59:36Z","published":"2023-09-12T17:59:36Z","title":"Learning Disentangled Avatars with Hybrid 3D Representations","summary":" Tremendous efforts have been made to learn animatable and photorealistic\nhuman avatars. Towards this end, both explicit and implicit 3D representations\nare heavily studied for a holistic modeling and capture of the whole human\n(e.g., body, clothing, face and hair), but neither representation is an optimal\nchoice in terms of representation efficacy since different parts of the human\navatar have different modeling desiderata. For example, meshes are generally\nnot suitable for modeling clothing and hair. Motivated by this, we present\nDisentangled Avatars~(DELTA), which models humans with hybrid explicit-implicit\n3D representations. DELTA takes a monocular RGB video as input, and produces a\nhuman avatar with separate body and clothing/hair layers. Specifically, we\ndemonstrate two important applications for DELTA. For the first one, we\nconsider the disentanglement of the human body and clothing and in the second,\nwe disentangle the face and hair. To do so, DELTA represents the body or face\nwith an explicit mesh-based parametric 3D model and the clothing or hair with\nan implicit neural radiance field. To make this possible, we design an\nend-to-end differentiable renderer that integrates meshes into volumetric\nrendering, enabling DELTA to learn directly from monocular videos without any\n3D supervision. Finally, we show that how these two applications can be easily\ncombined to model full-body avatars, such that the hair, face, body and\nclothing can be fully disentangled yet jointly rendered. Such a disentanglement\nenables hair and clothing transfer to arbitrary body shapes. We empirically\nvalidate the effectiveness of DELTA's disentanglement by demonstrating its\npromising performance on disentangled reconstruction, virtual clothing try-on\nand hairstyle transfer. To facilitate future research, we also release an\nopen-sourced pipeline for the study of hybrid human avatar modeling.\n","authors":["Yao Feng","Weiyang Liu","Timo Bolkart","Jinlong Yang","Marc Pollefeys","Michael J. 
Black"],"pdf_url":"https://arxiv.org/pdf/2309.06441v1.pdf","comment":"home page: https://yfeng95.github.io/delta. arXiv admin note: text\n overlap with arXiv:2210.01868"},{"id":"http://arxiv.org/abs/2309.06440v1","updated":"2023-09-12T17:59:20Z","published":"2023-09-12T17:59:20Z","title":"LEAP Hand: Low-Cost, Efficient, and Anthropomorphic Hand for Robot\n Learning","summary":" Dexterous manipulation has been a long-standing challenge in robotics. While\nmachine learning techniques have shown some promise, results have largely been\ncurrently limited to simulation. This can be mostly attributed to the lack of\nsuitable hardware. In this paper, we present LEAP Hand, a low-cost dexterous\nand anthropomorphic hand for machine learning research. In contrast to previous\nhands, LEAP Hand has a novel kinematic structure that allows maximal dexterity\nregardless of finger pose. LEAP Hand is low-cost and can be assembled in 4\nhours at a cost of 2000 USD from readily available parts. It is capable of\nconsistently exerting large torques over long durations of time. We show that\nLEAP Hand can be used to perform several manipulation tasks in the real world\n-- from visual teleoperation to learning from passive video data and sim2real.\nLEAP Hand significantly outperforms its closest competitor Allegro Hand in all\nour experiments while being 1/8th of the cost. We release detailed assembly\ninstructions, the Sim2Real pipeline and a development platform with useful APIs\non our website at https://leap-hand.github.io/\n","authors":["Kenneth Shaw","Ananye Agarwal","Deepak Pathak"],"pdf_url":"https://arxiv.org/pdf/2309.06440v1.pdf","comment":"Website at https://leap-hand.github.io/"},{"id":"http://arxiv.org/abs/2309.06439v1","updated":"2023-09-12T17:59:10Z","published":"2023-09-12T17:59:10Z","title":"Attention De-sparsification Matters: Inducing Diversity in Digital\n Pathology Representation Learning","summary":" We propose DiRL, a Diversity-inducing Representation Learning technique for\nhistopathology imaging. Self-supervised learning techniques, such as\ncontrastive and non-contrastive approaches, have been shown to learn rich and\neffective representations of digitized tissue samples with limited pathologist\nsupervision. Our analysis of vanilla SSL-pretrained models' attention\ndistribution reveals an insightful observation: sparsity in attention, i.e,\nmodels tends to localize most of their attention to some prominent patterns in\nthe image. Although attention sparsity can be beneficial in natural images due\nto these prominent patterns being the object of interest itself, this can be\nsub-optimal in digital pathology; this is because, unlike natural images,\ndigital pathology scans are not object-centric, but rather a complex phenotype\nof various spatially intermixed biological components. Inadequate\ndiversification of attention in these complex images could result in crucial\ninformation loss. To address this, we leverage cell segmentation to densely\nextract multiple histopathology-specific representations, and then propose a\nprior-guided dense pretext task for SSL, designed to match the multiple\ncorresponding representations between the views. 
Through this, the model learns\nto attend to various components more closely and evenly, thus inducing adequate\ndiversification in attention for capturing context rich representations.\nThrough quantitative and qualitative analysis on multiple tasks across cancer\ntypes, we demonstrate the efficacy of our method and observe that the attention\nis more globally distributed.\n","authors":["Saarthak Kapse","Srijan Das","Jingwei Zhang","Rajarsi R. Gupta","Joel Saltz","Dimitris Samaras","Prateek Prasanna"],"pdf_url":"https://arxiv.org/pdf/2309.06439v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.06438v1","updated":"2023-09-12T17:58:06Z","published":"2023-09-12T17:58:06Z","title":"Exploring Non-additive Randomness on ViT against Query-Based Black-Box\n Attacks","summary":" Deep Neural Networks can be easily fooled by small and imperceptible\nperturbations. The query-based black-box attack (QBBA) is able to create the\nperturbations using model output probabilities of image queries requiring no\naccess to the underlying models. QBBA poses realistic threats to real-world\napplications. Recently, various types of robustness have been explored to\ndefend against QBBA. In this work, we first taxonomize the stochastic defense\nstrategies against QBBA. Following our taxonomy, we propose to explore\nnon-additive randomness in models to defend against QBBA. Specifically, we\nfocus on underexplored Vision Transformers based on their flexible\narchitectures. Extensive experiments show that the proposed defense approach\nachieves effective defense, without much sacrifice in performance.\n","authors":["Jindong Gu","Fangyun Wei","Philip Torr","Han Hu"],"pdf_url":"https://arxiv.org/pdf/2309.06438v1.pdf","comment":"Accepted to BMVC2023"},{"id":"http://arxiv.org/abs/2309.06421v1","updated":"2023-09-12T17:37:56Z","published":"2023-09-12T17:37:56Z","title":"AGMDT: Virtual Staining of Renal Histology Images with Adjacency-Guided\n Multi-Domain Transfer","summary":" Renal pathology, as the gold standard of kidney disease diagnosis, requires\ndoctors to analyze a serial of tissue slices stained by H\\&E staining and\nspecial staining like Masson, PASM, and PAS, respectively. These special\nstaining methods are costly, time-consuming, and hard to standardize for wide\nuse especially in primary hospitals. Advances of supervised learning methods\ncan virtually convert H\\&E images into special staining images, but the\npixel-to-pixel alignment is hard to achieve for training. As contrast,\nunsupervised learning methods regarding different stains as different style\ntransferring domains can use unpaired data, but they ignore the spatial\ninter-domain correlations and thus decrease the trustworthiness of structural\ndetails for diagnosis. In this paper, we propose a novel virtual staining\nframework AGMDT to translate images into other domains by avoiding pixel-level\nalignment and meanwhile utilizing the correlations among adjacent tissue\nslices. We first build a high-quality multi-domain renal histological dataset\nwhere each specimen case comprises a series of slices stained in various ways.\nBased on it, the proposed framework AGMDT discovers patch-level aligned pairs\nacross the serial slices of multi-domains through glomerulus detection and\nbipartite graph matching, and utilizes such correlations to supervise the\nend-to-end model for multi-domain staining transformation. 
Experimental results\nshow that the proposed AGMDT achieves a good balance between the precise\npixel-level alignment and unpaired domain transfer by exploiting correlations\nacross multi-domain serial pathological slices, and outperforms the\nstate-of-the-art methods in both quantitative measure and morphological\ndetails.\n","authors":["Tao Ma","Chao Zhang","Min Lu","Lin Luo"],"pdf_url":"https://arxiv.org/pdf/2309.06421v1.pdf","comment":"Accepted at BMVC 2023"},{"id":"http://arxiv.org/abs/2203.01881v5","updated":"2023-09-12T16:57:06Z","published":"2022-03-03T17:48:23Z","title":"Measuring Self-Supervised Representation Quality for Downstream\n Classification using Discriminative Features","summary":" Self-supervised learning (SSL) has shown impressive results in downstream\nclassification tasks. However, there is limited work in understanding their\nfailure modes and interpreting their learned representations. In this paper, we\nstudy the representation space of state-of-the-art self-supervised models\nincluding SimCLR, SwaV, MoCo, BYOL, DINO, SimSiam, VICReg and Barlow Twins.\nWithout the use of class label information, we discover discriminative features\nthat correspond to unique physical attributes in images, present mostly in\ncorrectly-classified representations. Using these features, we can compress the\nrepresentation space by up to 40% without significantly affecting linear\nclassification performance. We then propose Self-Supervised Representation\nQuality Score (or Q-Score), an unsupervised score that can reliably predict if\na given sample is likely to be mis-classified during linear evaluation,\nachieving AUPRC of 91.45 on ImageNet-100 and 78.78 on ImageNet-1K. Q-Score can\nalso be used as a regularization term on pre-trained encoders to remedy\nlow-quality representations. Fine-tuning with Q-Score regularization can boost\nthe linear probing accuracy of SSL models by up to 5.8% on ImageNet-100 and\n3.7% on ImageNet-1K compared to their baselines. Finally, using gradient\nheatmaps and Salient ImageNet masks, we define a metric to quantify the\ninterpretability of each representation. We show that discriminative features\nare strongly correlated to core attributes and, enhancing these features\nthrough Q-score regularization makes SSL representations more interpretable.\n","authors":["Neha Kalibhat","Kanika Narang","Hamed Firooz","Maziar Sanjabi","Soheil Feizi"],"pdf_url":"https://arxiv.org/pdf/2203.01881v5.pdf","comment":null},{"id":"http://arxiv.org/abs/2304.04278v3","updated":"2023-09-12T16:55:30Z","published":"2023-04-09T16:48:26Z","title":"Point-SLAM: Dense Neural Point Cloud-based SLAM","summary":" We propose a dense neural simultaneous localization and mapping (SLAM)\napproach for monocular RGBD input which anchors the features of a neural scene\nrepresentation in a point cloud that is iteratively generated in an\ninput-dependent data-driven manner. We demonstrate that both tracking and\nmapping can be performed with the same point-based neural scene representation\nby minimizing an RGBD-based re-rendering loss. In contrast to recent dense\nneural SLAM methods which anchor the scene features in a sparse grid, our\npoint-based approach allows dynamically adapting the anchor point density to\nthe information density of the input. This strategy reduces runtime and memory\nusage in regions with fewer details and dedicates higher point density to\nresolve fine details. 
Our approach performs either better or competitive to\nexisting dense neural RGBD SLAM methods in tracking, mapping and rendering\naccuracy on the Replica, TUM-RGBD and ScanNet datasets. The source code is\navailable at https://github.com/eriksandstroem/Point-SLAM.\n","authors":["Erik Sandström","Yue Li","Luc Van Gool","Martin R. Oswald"],"pdf_url":"https://arxiv.org/pdf/2304.04278v3.pdf","comment":"ICCV 2023. 18 Pages, 12 Figures"},{"id":"http://arxiv.org/abs/2211.09302v2","updated":"2023-09-12T16:49:56Z","published":"2022-11-17T02:28:58Z","title":"You Only Label Once: 3D Box Adaptation from Point Cloud to Image via\n Semi-Supervised Learning","summary":" The image-based 3D object detection task expects that the predicted 3D\nbounding box has a ``tightness'' projection (also referred to as cuboid), which\nfits the object contour well on the image while still keeping the geometric\nattribute on the 3D space, e.g., physical dimension, pairwise orthogonal, etc.\nThese requirements bring significant challenges to the annotation. Simply\nprojecting the Lidar-labeled 3D boxes to the image leads to non-trivial\nmisalignment, while directly drawing a cuboid on the image cannot access the\noriginal 3D information. In this work, we propose a learning-based 3D box\nadaptation approach that automatically adjusts minimum parameters of the\n360$^{\\circ}$ Lidar 3D bounding box to perfectly fit the image appearance of\npanoramic cameras. With only a few 2D boxes annotation as guidance during the\ntraining phase, our network can produce accurate image-level cuboid annotations\nwith 3D properties from Lidar boxes. We call our method ``you only label\nonce'', which means labeling on the point cloud once and automatically adapting\nto all surrounding cameras. As far as we know, we are the first to focus on\nimage-level cuboid refinement, which balances the accuracy and efficiency well\nand dramatically reduces the labeling effort for accurate cuboid annotation.\nExtensive experiments on the public Waymo and NuScenes datasets show that our\nmethod can produce human-level cuboid annotation on the image without needing\nmanual adjustment.\n","authors":["Jieqi Shi","Peiliang Li","Xiaozhi Chen","Shaojie Shen"],"pdf_url":"https://arxiv.org/pdf/2211.09302v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.06380v1","updated":"2023-09-12T16:42:09Z","published":"2023-09-12T16:42:09Z","title":"InstaFlow: One Step is Enough for High-Quality Diffusion-Based\n Text-to-Image Generation","summary":" Diffusion models have revolutionized text-to-image generation with its\nexceptional quality and creativity. However, its multi-step sampling process is\nknown to be slow, often requiring tens of inference steps to obtain\nsatisfactory results. Previous attempts to improve its sampling speed and\nreduce computational costs through distillation have been unsuccessful in\nachieving a functional one-step model. In this paper, we explore a recent\nmethod called Rectified Flow, which, thus far, has only been applied to small\ndatasets. The core of Rectified Flow lies in its \\emph{reflow} procedure, which\nstraightens the trajectories of probability flows, refines the coupling between\nnoises and images, and facilitates the distillation process with student\nmodels. We propose a novel text-conditioned pipeline to turn Stable Diffusion\n(SD) into an ultra-fast one-step model, in which we find reflow plays a\ncritical role in improving the assignment between noise and images. 
Leveraging\nour new pipeline, we create, to the best of our knowledge, the first one-step\ndiffusion-based text-to-image generator with SD-level image quality, achieving\nan FID (Frechet Inception Distance) of $23.3$ on MS COCO 2017-5k, surpassing\nthe previous state-of-the-art technique, progressive distillation, by a\nsignificant margin ($37.2$ $\\rightarrow$ $23.3$ in FID). By utilizing an\nexpanded network with 1.7B parameters, we further improve the FID to $22.4$. We\ncall our one-step models \\emph{InstaFlow}. On MS COCO 2014-30k, InstaFlow\nyields an FID of $13.1$ in just $0.09$ second, the best in $\\leq 0.1$ second\nregime, outperforming the recent StyleGAN-T ($13.9$ in $0.1$ second). Notably,\nthe training of InstaFlow only costs 199 A100 GPU days. Project\npage:~\\url{https://github.com/gnobitab/InstaFlow}.\n","authors":["Xingchao Liu","Xiwen Zhang","Jianzhu Ma","Jian Peng","Qiang Liu"],"pdf_url":"https://arxiv.org/pdf/2309.06380v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.06370v1","updated":"2023-09-12T16:36:12Z","published":"2023-09-12T16:36:12Z","title":"Padding-free Convolution based on Preservation of Differential\n Characteristics of Kernels","summary":" Convolution is a fundamental operation in image processing and machine\nlearning. Aimed primarily at maintaining image size, padding is a key\ningredient of convolution, which, however, can introduce undesirable boundary\neffects. We present a non-padding-based method for size-keeping convolution\nbased on the preservation of differential characteristics of kernels. The main\nidea is to make convolution over an incomplete sliding window \"collapse\" to a\nlinear differential operator evaluated locally at its central pixel, which no\nlonger requires information from the neighbouring missing pixels. While the\nunderlying theory is rigorous, our final formula turns out to be simple: the\nconvolution over an incomplete window is achieved by convolving its nearest\ncomplete window with a transformed kernel. This formula is computationally\nlightweight, involving neither interpolation or extrapolation nor restrictions\non image and kernel sizes. Our method favours data with smooth boundaries, such\nas high-resolution images and fields from physics. Our experiments include: i)\nfiltering analytical and non-analytical fields from computational physics and,\nii) training convolutional neural networks (CNNs) for the tasks of image\nclassification, semantic segmentation and super-resolution reconstruction. In\nall these experiments, our method has exhibited visible superiority over the\ncompared ones.\n","authors":["Kuangdai Leng","Jeyan Thiyagalingam"],"pdf_url":"https://arxiv.org/pdf/2309.06370v1.pdf","comment":"8 pages, 3 figures, 1 table, ICLMA 2023"},{"id":"http://arxiv.org/abs/2306.05422v2","updated":"2023-09-12T16:32:52Z","published":"2023-06-08T17:59:29Z","title":"Tracking Everything Everywhere All at Once","summary":" We present a new test-time optimization method for estimating dense and\nlong-range motion from a video sequence. Prior optical flow or particle video\ntracking algorithms typically operate within limited temporal windows,\nstruggling to track through occlusions and maintain global consistency of\nestimated motion trajectories. We propose a complete and globally consistent\nmotion representation, dubbed OmniMotion, that allows for accurate, full-length\nmotion estimation of every pixel in a video. 
OmniMotion represents a video\nusing a quasi-3D canonical volume and performs pixel-wise tracking via\nbijections between local and canonical space. This representation allows us to\nensure global consistency, track through occlusions, and model any combination\nof camera and object motion. Extensive evaluations on the TAP-Vid benchmark and\nreal-world footage show that our approach outperforms prior state-of-the-art\nmethods by a large margin both quantitatively and qualitatively. See our\nproject page for more results: http://omnimotion.github.io/\n","authors":["Qianqian Wang","Yen-Yu Chang","Ruojin Cai","Zhengqi Li","Bharath Hariharan","Aleksander Holynski","Noah Snavely"],"pdf_url":"https://arxiv.org/pdf/2306.05422v2.pdf","comment":"ICCV 2023"},{"id":"http://arxiv.org/abs/2308.00773v3","updated":"2023-09-12T16:23:09Z","published":"2023-08-01T18:26:55Z","title":"High-Fidelity Eye Animatable Neural Radiance Fields for Human Face","summary":" Face rendering using neural radiance fields (NeRF) is a rapidly developing\nresearch area in computer vision. While recent methods primarily focus on\ncontrolling facial attributes such as identity and expression, they often\noverlook the crucial aspect of modeling eyeball rotation, which holds\nimportance for various downstream tasks. In this paper, we aim to learn a face\nNeRF model that is sensitive to eye movements from multi-view images. We\naddress two key challenges in eye-aware face NeRF learning: how to effectively\ncapture eyeball rotation for training and how to construct a manifold for\nrepresenting eyeball rotation. To accomplish this, we first fit FLAME, a\nwell-established parametric face model, to the multi-view images considering\nmulti-view consistency. Subsequently, we introduce a new Dynamic Eye-aware NeRF\n(DeNeRF). DeNeRF transforms 3D points from different views into a canonical\nspace to learn a unified face NeRF model. We design an eye deformation field\nfor the transformation, including rigid transformation, e.g., eyeball rotation,\nand non-rigid transformation. Through experiments conducted on the ETH-XGaze\ndataset, we demonstrate that our model is capable of generating high-fidelity\nimages with accurate eyeball rotation and non-rigid periocular deformation,\neven under novel viewing angles. Furthermore, we show that utilizing the\nrendered images can effectively enhance gaze estimation performance.\n","authors":["Hengfei Wang","Zhongqun Zhang","Yihua Cheng","Hyung Jin Chang"],"pdf_url":"https://arxiv.org/pdf/2308.00773v3.pdf","comment":"BMVC2023 Oral"},{"id":"http://arxiv.org/abs/2303.09063v2","updated":"2023-09-12T16:14:03Z","published":"2023-03-16T03:43:10Z","title":"Plant Disease Detection using Region-Based Convolutional Neural Network","summary":" Agriculture plays an important role in the food and economy of Bangladesh.\nThe rapid growth of population over the years also has increased the demand for\nfood production. One of the major reasons behind low crop production is\nnumerous bacteria, virus and fungal plant diseases. Early detection of plant\ndiseases and proper usage of pesticides and fertilizers are vital for\npreventing the diseases and boost the yield. Most of the farmers use\ngeneralized pesticides and fertilizers in the entire fields without\nspecifically knowing the condition of the plants. Thus the production cost\noftentimes increases, and, not only that, sometimes this becomes detrimental to\nthe yield. 
Deep Learning models are found to be very effective to automatically\ndetect plant diseases from images of plants, thereby reducing the need for\nhuman specialists. This paper aims at building a lightweight deep learning\nmodel for predicting leaf disease in tomato plants. By modifying the\nregion-based convolutional neural network, we design an efficient and effective\nmodel that demonstrates satisfactory empirical performance on a benchmark\ndataset. Our proposed model can easily be deployed in a larger system where\ndrones take images of leaves and these images will be fed into our model to\nknow the health condition.\n","authors":["Hasin Rehana","Muhammad Ibrahim","Md. Haider Ali"],"pdf_url":"https://arxiv.org/pdf/2303.09063v2.pdf","comment":"23 pages"},{"id":"http://arxiv.org/abs/2303.01592v3","updated":"2023-09-12T15:55:23Z","published":"2023-03-02T21:31:35Z","title":"JOSA: Joint surface-based registration with atlas construction enables\n accurate alignment of the brain geometry and function","summary":" Surface-based cortical registration is an important topic in medical image\nanalysis and facilitates many downstream applications. Current approaches for\ncortical registration are mainly driven by geometric features, such as sulcal\ndepth and curvature, and often assume that registration of folding patterns\nleads to alignment of brain function. However, functional variability of\nanatomically corresponding areas across subjects has been widely reported,\nparticularly in higher-order cognitive areas. In this work, we present JOSA, a\nnovel cortical registration framework that jointly models the mismatch between\ngeometry and function while simultaneously learning an unbiased\npopulation-specific atlas. Using a semi-supervised training strategy, JOSA\nachieves superior registration performance in both geometry and function\nwithout requiring functional data at inference. This learning framework can be\nextended to any auxiliary data to guide spherical registration that is\navailable during training but is difficult or impossible to obtain during\ninference, such as parcellations, architectonic identity, transcriptomic\ninformation, and molecular profiles.\n","authors":["Jian Li","Greta Tuckute","Evelina Fedorenko","Brian L. Edlow","Adrian V. Dalca","Bruce Fischl"],"pdf_url":"https://arxiv.org/pdf/2303.01592v3.pdf","comment":"A. V. Dalca and B. Fischl are co-senior authors with equal\n contribution"},{"id":"http://arxiv.org/abs/2309.06337v1","updated":"2023-09-12T15:55:14Z","published":"2023-09-12T15:55:14Z","title":"Exploring Flat Minima for Domain Generalization with Large Learning\n Rates","summary":" Domain Generalization (DG) aims to generalize to arbitrary unseen domains. A\npromising approach to improve model generalization in DG is the identification\nof flat minima. One typical method for this task is SWAD, which involves\naveraging weights along the training trajectory. However, the success of weight\naveraging depends on the diversity of weights, which is limited when training\nwith a small learning rate. Instead, we observe that leveraging a large\nlearning rate can simultaneously promote weight diversity and facilitate the\nidentification of flat regions in the loss landscape. However, employing a\nlarge learning rate suffers from the convergence problem, which cannot be\nresolved by simply averaging the training weights. 
To address this issue, we\nintroduce a training strategy called Lookahead which involves the weight\ninterpolation, instead of average, between fast and slow weights. The fast\nweight explores the weight space with a large learning rate, which is not\nconverged while the slow weight interpolates with it to ensure the convergence.\nBesides, weight interpolation also helps identify flat minima by implicitly\noptimizing the local entropy loss that measures flatness. To further prevent\noverfitting during training, we propose two variants to regularize the training\nweight with weighted averaged weight or with accumulated history weight. Taking\nadvantage of this new perspective, our methods achieve state-of-the-art\nperformance on both classification and semantic segmentation domain\ngeneralization benchmarks. The code is available at\nhttps://github.com/koncle/DG-with-Large-LR.\n","authors":["Jian Zhang","Lei Qi","Yinghuan Shi","Yang Gao"],"pdf_url":"https://arxiv.org/pdf/2309.06337v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.06335v1","updated":"2023-09-12T15:52:08Z","published":"2023-09-12T15:52:08Z","title":"Grounded Language Acquisition From Object and Action Imagery","summary":" Deep learning approaches to natural language processing have made great\nstrides in recent years. While these models produce symbols that convey vast\namounts of diverse knowledge, it is unclear how such symbols are grounded in\ndata from the world. In this paper, we explore the development of a private\nlanguage for visual data representation by training emergent language (EL)\nencoders/decoders in both i) a traditional referential game environment and ii)\na contrastive learning environment utilizing a within-class matching training\nparadigm. An additional classification layer utilizing neural machine\ntranslation and random forest classification was used to transform symbolic\nrepresentations (sequences of integer symbols) to class labels. These methods\nwere applied in two experiments focusing on object recognition and action\nrecognition. For object recognition, a set of sketches produced by human\nparticipants from real imagery was used (Sketchy dataset) and for action\nrecognition, 2D trajectories were generated from 3D motion capture systems\n(MOVI dataset). In order to interpret the symbols produced for data in each\nexperiment, gradient-weighted class activation mapping (Grad-CAM) methods were\nused to identify pixel regions indicating semantic features which contribute\nevidence towards symbols in learned languages. Additionally, a t-distributed\nstochastic neighbor embedding (t-SNE) method was used to investigate embeddings\nlearned by CNN feature extractors.\n","authors":["James Robert Kubricht","Zhaoyuan Yang","Jianwei Qiu","Peter Henry Tu"],"pdf_url":"https://arxiv.org/pdf/2309.06335v1.pdf","comment":"9 pages, 7 figures, conference"},{"id":"http://arxiv.org/abs/2307.00398v2","updated":"2023-09-12T15:46:23Z","published":"2023-07-01T18:16:06Z","title":"ProbVLM: Probabilistic Adapter for Frozen Vison-Language Models","summary":" Large-scale vision-language models (VLMs) like CLIP successfully find\ncorrespondences between images and text. Through the standard deterministic\nmapping process, an image or a text sample is mapped to a single vector in the\nembedding space. This is problematic: as multiple samples (images or text) can\nabstract the same concept in the physical world, deterministic embeddings do\nnot reflect the inherent ambiguity in the embedding space. 
We propose ProbVLM,\na probabilistic adapter that estimates probability distributions for the\nembeddings of pre-trained VLMs via inter/intra-modal alignment in a post-hoc\nmanner without needing large-scale datasets or computing. On four challenging\ndatasets, i.e., COCO, Flickr, CUB, and Oxford-flowers, we estimate the\nmulti-modal embedding uncertainties for two VLMs, i.e., CLIP and BLIP, quantify\nthe calibration of embedding uncertainties in retrieval tasks and show that\nProbVLM outperforms other methods. Furthermore, we propose active learning and\nmodel selection as two real-world downstream tasks for VLMs and show that the\nestimated uncertainty aids both tasks. Lastly, we present a novel technique for\nvisualizing the embedding distributions using a large-scale pre-trained latent\ndiffusion model. Code is available at https://github.com/ExplainableML/ProbVLM.\n","authors":["Uddeshya Upadhyay","Shyamgopal Karthik","Massimiliano Mancini","Zeynep Akata"],"pdf_url":"https://arxiv.org/pdf/2307.00398v2.pdf","comment":"ICCV 2023"},{"id":"http://arxiv.org/abs/2309.05406v2","updated":"2023-09-12T15:45:53Z","published":"2023-09-11T12:12:52Z","title":"Treatment-aware Diffusion Probabilistic Model for Longitudinal MRI\n Generation and Diffuse Glioma Growth Prediction","summary":" Diffuse gliomas are malignant brain tumors that grow widespread through the\nbrain. The complex interactions between neoplastic cells and normal tissue, as\nwell as the treatment-induced changes often encountered, make glioma tumor\ngrowth modeling challenging. In this paper, we present a novel end-to-end\nnetwork capable of generating future tumor masks and realistic MRIs of how the\ntumor will look at any future time points for different treatment plans. Our\nmodel is built upon cutting-edge diffusion probabilistic models and\ndeep-segmentation neural networks. We extended a diffusion model to include\nsequential multi-parametric MRI and treatment information as conditioning input\nto guide the generative diffusion process. This allows us to estimate tumor\ngrowth at any given time point. We trained the model using real-world\npostoperative longitudinal MRI data with glioma tumor growth trajectories\nrepresented as tumor segmentation maps over time. The model has demonstrated\npromising performance across a range of tasks, including the generation of\nhigh-quality synthetic MRIs with tumor masks, time-series tumor segmentations,\nand uncertainty estimation. Combined with the treatment-aware generated MRIs,\nthe tumor growth predictions with uncertainty estimates can provide useful\ninformation for clinical decision-making.\n","authors":["Qinghui Liu","Elies Fuster-Garcia","Ivar Thokle Hovden","Donatas Sederevicius","Karoline Skogen","Bradley J MacIntosh","Edvard Grødem","Till Schellhorn","Petter Brandal","Atle Bjørnerud","Kyrre Eeg Emblem"],"pdf_url":"https://arxiv.org/pdf/2309.05406v2.pdf","comment":"13 pages, 10 figures, 2 tables, 2 agls, preprints in the IEEE trans.\n format for submission to IEEE-TMI"},{"id":"http://arxiv.org/abs/2309.05073v2","updated":"2023-09-12T15:39:30Z","published":"2023-09-10T16:42:11Z","title":"FreeMan: Towards Benchmarking 3D Human Pose Estimation in the Wild","summary":" Estimating the 3D structure of the human body from natural scenes is a\nfundamental aspect of visual perception. This task carries great importance for\nfields like AIGC and human-robot interaction. In practice, 3D human pose\nestimation in real-world settings is a critical initial step in solving this\nproblem. 
However, the current datasets, often collected under controlled\nlaboratory conditions using complex motion capture equipment and unvarying\nbackgrounds, are insufficient. The absence of real-world datasets is stalling\nthe progress of this crucial task. To facilitate the development of 3D pose\nestimation, we present FreeMan, the first large-scale, real-world multi-view\ndataset. FreeMan was captured by synchronizing 8 smartphones across diverse\nscenarios. It comprises 11M frames from 8000 sequences, viewed from different\nperspectives. These sequences cover 40 subjects across 10 different scenarios,\neach with varying lighting conditions. We have also established an automated,\nprecise labeling pipeline that allows for large-scale processing efficiently.\nWe provide comprehensive evaluation baselines for a range of tasks, underlining\nthe significant challenges posed by FreeMan. Further evaluations of standard\nindoor/outdoor human sensing datasets reveal that FreeMan offers robust\nrepresentation transferability in real and complex scenes. FreeMan is now\npublicly available at https://wangjiongw.github.io/freeman.\n","authors":["Jiong Wang","Fengyu Yang","Wenbo Gou","Bingliang Li","Danqi Yan","Ailing Zeng","Yijun Gao","Junle Wang","Ruimao Zhang"],"pdf_url":"https://arxiv.org/pdf/2309.05073v2.pdf","comment":"18 pages, 9 figures. Project page:\n https://wangjiongw.github.io/freeman/ ; API:\n https://github.com/wangjiongw/FreeMan_API"},{"id":"http://arxiv.org/abs/2309.06323v1","updated":"2023-09-12T15:33:09Z","published":"2023-09-12T15:33:09Z","title":"SAMPLING: Scene-adaptive Hierarchical Multiplane Images Representation\n for Novel View Synthesis from a Single Image","summary":" Recent novel view synthesis methods obtain promising results for relatively\nsmall scenes, e.g., indoor environments and scenes with a few objects, but tend\nto fail for unbounded outdoor scenes with a single image as input. In this\npaper, we introduce SAMPLING, a Scene-adaptive Hierarchical Multiplane Images\nRepresentation for Novel View Synthesis from a Single Image based on improved\nmultiplane images (MPI). Observing that depth distribution varies significantly\nfor unbounded outdoor scenes, we employ an adaptive-bins strategy for MPI to\narrange planes in accordance with each scene image. To represent intricate\ngeometry and multi-scale details, we further introduce a hierarchical\nrefinement branch, which results in high-quality synthesized novel views. Our\nmethod demonstrates considerable performance gains in synthesizing large-scale\nunbounded outdoor scenes using a single image on the KITTI dataset and\ngeneralizes well to the unseen Tanks and Temples dataset. The code and models\nwill be made public.\n","authors":["Xiaoyu Zhou","Zhiwei Lin","Xiaojun Shan","Yongtao Wang","Deqing Sun","Ming-Hsuan Yang"],"pdf_url":"https://arxiv.org/pdf/2309.06323v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.06313v1","updated":"2023-09-12T15:24:26Z","published":"2023-09-12T15:24:26Z","title":"Semantic and Articulated Pedestrian Sensing Onboard a Moving Vehicle","summary":" It is difficult to perform 3D reconstruction from on-vehicle gathered video\ndue to the large forward motion of the vehicle. Even object detection and human\nsensing models perform significantly worse on onboard videos when compared to\nstandard benchmarks because objects often appear far away from the camera\ncompared to the standard object detection benchmarks, image quality is often\ndecreased by motion blur and occlusions occur often. 
This has led to the\npopularisation of traffic data-specific benchmarks. Recently Light Detection\nAnd Ranging (LiDAR) sensors have become popular to directly estimate depths\nwithout the need to perform 3D reconstructions. However, LiDAR-based methods\nstill lack in articulated human detection at a distance when compared to\nimage-based methods. We hypothesize that benchmarks targeted at articulated\nhuman sensing from LiDAR data could bring about increased research in human\nsensing and prediction in traffic and could lead to improved traffic safety for\npedestrians.\n","authors":["Maria Priisalu"],"pdf_url":"https://arxiv.org/pdf/2309.06313v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.06308v1","updated":"2023-09-12T15:19:36Z","published":"2023-09-12T15:19:36Z","title":"AI4Food-NutritionFW: A Novel Framework for the Automatic Synthesis and\n Analysis of Eating Behaviours","summary":" Nowadays millions of images are shared on social media and web platforms. In\nparticular, many of them are food images taken from a smartphone over time,\nproviding information related to the individual's diet. On the other hand,\neating behaviours are directly related to some of the most prevalent diseases\nin the world. Exploiting recent advances in image processing and Artificial\nIntelligence (AI), this scenario represents an excellent opportunity to: i)\ncreate new methods that analyse the individuals' health from what they eat, and\nii) develop personalised recommendations to improve nutrition and diet under\nspecific circumstances (e.g., obesity or COVID). Having tunable tools for\ncreating food image datasets that facilitate research in both lines is very\nmuch needed.\n This paper proposes AI4Food-NutritionFW, a framework for the creation of food\nimage datasets according to configurable eating behaviours. AI4Food-NutritionFW\nsimulates a user-friendly and widespread scenario where images are taken using\na smartphone. In addition to the framework, we also provide and describe a\nunique food image dataset that includes 4,800 different weekly eating\nbehaviours from 15 different profiles and 1,200 subjects. Specifically, we\nconsider profiles that comply with actual lifestyles from healthy eating\nbehaviours (according to established knowledge), variable profiles (e.g.,\neating out, holidays), to unhealthy ones (e.g., excess of fast food or sweets).\nFinally, we automatically evaluate a healthy index of the subject's eating\nbehaviours using multidimensional metrics based on guidelines for healthy diets\nproposed by international organisations, achieving promising results (99.53%\nand 99.60% accuracy and sensitivity, respectively). We also release to the\nresearch community a software implementation of our proposed\nAI4Food-NutritionFW and the mentioned food image dataset created with it.\n","authors":["Sergio Romero-Tapiador","Ruben Tolosana","Aythami Morales","Isabel Espinosa-Salinas","Gala Freixer","Julian Fierrez","Ruben Vera-Rodriguez","Enrique Carrillo de Santa Pau","Ana Ramírez de Molina","Javier Ortega-Garcia"],"pdf_url":"https://arxiv.org/pdf/2309.06308v1.pdf","comment":"10 pages, 5 figures, 4 tables"},{"id":"http://arxiv.org/abs/2309.06302v1","updated":"2023-09-12T15:10:23Z","published":"2023-09-12T15:10:23Z","title":"Towards High-Quality Specular Highlight Removal by Leveraging\n Large-Scale Synthetic Data","summary":" This paper aims to remove specular highlights from a single object-level\nimage. 
Although previous methods have made some progresses, their performance\nremains somewhat limited, particularly for real images with complex specular\nhighlights. To this end, we propose a three-stage network to address them.\nSpecifically, given an input image, we first decompose it into the albedo,\nshading, and specular residue components to estimate a coarse specular-free\nimage. Then, we further refine the coarse result to alleviate its visual\nartifacts such as color distortion. Finally, we adjust the tone of the refined\nresult to match that of the input as closely as possible. In addition, to\nfacilitate network training and quantitative evaluation, we present a\nlarge-scale synthetic dataset of object-level images, covering diverse objects\nand illumination conditions. Extensive experiments illustrate that our network\nis able to generalize well to unseen real object-level images, and even produce\ngood results for scene-level images with multiple background objects and\ncomplex lighting.\n","authors":["Gang Fu","Qing Zhang","Lei Zhu","Chunxia Xiao","Ping Li"],"pdf_url":"https://arxiv.org/pdf/2309.06302v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.04801v2","updated":"2023-09-12T15:00:36Z","published":"2023-09-09T14:00:39Z","title":"TMComposites: Plug-and-Play Collaboration Between Specialized Tsetlin\n Machines","summary":" Tsetlin Machines (TMs) provide a fundamental shift from arithmetic-based to\nlogic-based machine learning. Supporting convolution, they deal successfully\nwith image classification datasets like MNIST, Fashion-MNIST, and CIFAR-2.\nHowever, the TM struggles with getting state-of-the-art performance on CIFAR-10\nand CIFAR-100, representing more complex tasks. This paper introduces\nplug-and-play collaboration between specialized TMs, referred to as TM\nComposites. The collaboration relies on a TM's ability to specialize during\nlearning and to assess its competence during inference. When teaming up, the\nmost confident TMs make the decisions, relieving the uncertain ones. In this\nmanner, a TM Composite becomes more competent than its members, benefiting from\ntheir specializations. The collaboration is plug-and-play in that members can\nbe combined in any way, at any time, without fine-tuning. We implement three TM\nspecializations in our empirical evaluation: Histogram of Gradients, Adaptive\nGaussian Thresholding, and Color Thermometers. The resulting TM Composite\nincreases accuracy on Fashion-MNIST by two percentage points, CIFAR-10 by\ntwelve points, and CIFAR-100 by nine points, yielding new state-of-the-art\nresults for TMs. Overall, we envision that TM Composites will enable an\nultra-low energy and transparent alternative to state-of-the-art deep learning\non more tasks and datasets.\n","authors":["Ole-Christoffer Granmo"],"pdf_url":"https://arxiv.org/pdf/2309.04801v2.pdf","comment":"8 pages, 6 figures"},{"id":"http://arxiv.org/abs/2203.02928v4","updated":"2023-09-12T15:00:10Z","published":"2022-03-06T10:14:09Z","title":"Fidelity of Interpretability Methods and Perturbation Artifacts in\n Neural Networks","summary":" Despite excellent performance of deep neural networks (DNNs) in image\nclassification, detection, and prediction, characterizing how DNNs make a given\ndecision remains an open problem, resulting in a number of interpretability\nmethods. Post-hoc interpretability methods primarily aim to quantify the\nimportance of input features with respect to the class probabilities. 
However,\ndue to the lack of ground truth and the existence of interpretability methods\nwith diverse operating characteristics, evaluating these methods is a crucial\nchallenge. A popular approach to evaluate interpretability methods is to\nperturb input features deemed important for a given prediction and observe the\ndecrease in accuracy. However, perturbation itself may introduce artifacts. We\npropose a method for estimating the impact of such artifacts on the fidelity\nestimation by utilizing model accuracy curves from perturbing input features\naccording to the Most Import First (MIF) and Least Import First (LIF) orders.\nUsing the ResNet-50 trained on the ImageNet, we demonstrate the proposed\nfidelity estimation of four popular post-hoc interpretability methods.\n","authors":["Lennart Brocki","Neo Christopher Chung"],"pdf_url":"https://arxiv.org/pdf/2203.02928v4.pdf","comment":"11 pages, 5 figures"},{"id":"http://arxiv.org/abs/2309.06288v1","updated":"2023-09-12T14:50:14Z","published":"2023-09-12T14:50:14Z","title":"Self-Training and Multi-Task Learning for Limited Data: Evaluation Study\n on Object Detection","summary":" Self-training allows a network to learn from the predictions of a more\ncomplicated model, thus often requires well-trained teacher models and mixture\nof teacher-student data while multi-task learning jointly optimizes different\ntargets to learn salient interrelationship and requires multi-task annotations\nfor each training example. These frameworks, despite being particularly data\ndemanding have potentials for data exploitation if such assumptions can be\nrelaxed. In this paper, we compare self-training object detection under the\ndeficiency of teacher training data where students are trained on unseen\nexamples by the teacher, and multi-task learning with partially annotated data,\ni.e. single-task annotation per training example. Both scenarios have their own\nlimitation but potentially helpful with limited annotated data. Experimental\nresults show the improvement of performance when using a weak teacher with\nunseen data for training a multi-task student. Despite the limited setup we\nbelieve the experimental results show the potential of multi-task knowledge\ndistillation and self-training, which could be beneficial for future study.\nSource code is at https://lhoangan.github.io/multas.\n","authors":["Hoàng-Ân Lê","Minh-Tan Pham"],"pdf_url":"https://arxiv.org/pdf/2309.06288v1.pdf","comment":"Accepted for International Conference in Computer Vision workshop\n (ICCVW) 2023"},{"id":"http://arxiv.org/abs/2309.06286v1","updated":"2023-09-12T14:46:56Z","published":"2023-09-12T14:46:56Z","title":"Transferability analysis of data-driven additive manufacturing\n knowledge: a case study between powder bed fusion and directed energy\n deposition","summary":" Data-driven research in Additive Manufacturing (AM) has gained significant\nsuccess in recent years. This has led to a plethora of scientific literature to\nemerge. The knowledge in these works consists of AM and Artificial Intelligence\n(AI) contexts that have not been mined and formalized in an integrated way.\nMoreover, no tools or guidelines exist to support data-driven knowledge\ntransfer from one context to another. As a result, data-driven solutions using\nspecific AI techniques are being developed and validated only for specific AM\nprocess technologies. 
There is a potential to exploit the inherent similarities\nacross various AM technologies and adapt the existing solutions from one\nprocess or problem to another using AI, such as Transfer Learning. We propose a\nthree-step knowledge transferability analysis framework in AM to support\ndata-driven AM knowledge transfer. As a prerequisite to transferability\nanalysis, AM knowledge is featurized into identified knowledge components. The\nframework consists of pre-transfer, transfer, and post-transfer steps to\naccomplish knowledge transfer. A case study is conducted between flagship metal\nAM processes. Laser Powder Bed Fusion (LPBF) is the source of knowledge\nmotivated by its relative matureness in applying AI over Directed Energy\nDeposition (DED), which drives the need for knowledge transfer as the less\nexplored target process. We show successful transfer at different levels of the\ndata-driven solution, including data representation, model architecture, and\nmodel parameters. The pipeline of AM knowledge transfer can be automated in the\nfuture to allow efficient cross-context or cross-process knowledge exchange.\n","authors":["Mutahar Safdar","Jiarui Xie","Hyunwoong Ko","Yan Lu","Guy Lamouche","Yaoyao Fiona Zhao"],"pdf_url":"https://arxiv.org/pdf/2309.06286v1.pdf","comment":"11 pages, 7 figures. This paper has been accepted to be published in\n the proceedings of IDETC-CIE 2023"},{"id":"http://arxiv.org/abs/2309.06285v1","updated":"2023-09-12T14:43:50Z","published":"2023-09-12T14:43:50Z","title":"Jersey Number Recognition using Keyframe Identification from\n Low-Resolution Broadcast Videos","summary":" Player identification is a crucial component in vision-driven soccer\nanalytics, enabling various downstream tasks such as player assessment, in-game\nanalysis, and broadcast production. However, automatically detecting jersey\nnumbers from player tracklets in videos presents challenges due to motion blur,\nlow resolution, distortions, and occlusions. Existing methods, utilizing\nSpatial Transformer Networks, CNNs, and Vision Transformers, have shown success\nin image data but struggle with real-world video data, where jersey numbers are\nnot visible in most of the frames. Hence, identifying frames that contain the\njersey number is a key sub-problem to tackle. To address these issues, we\npropose a robust keyframe identification module that extracts frames containing\nessential high-level information about the jersey number. A spatio-temporal\nnetwork is then employed to model spatial and temporal context and predict the\nprobabilities of jersey numbers in the video. Additionally, we adopt a\nmulti-task loss function to predict the probability distribution of each digit\nseparately. Extensive evaluations on the SoccerNet dataset demonstrate that\nincorporating our proposed keyframe identification module results in a\nsignificant 37.81% and 37.70% increase in the accuracies of 2 different test\nsets with domain gaps. 
These results highlight the effectiveness and importance\nof our approach in tackling the challenges of automatic jersey number detection\nin sports videos.\n","authors":["Bavesh Balaji","Jerrin Bright","Harish Prakash","Yuhao Chen","David A Clausi","John Zelek"],"pdf_url":"https://arxiv.org/pdf/2309.06285v1.pdf","comment":"Accepted in the 6th International Workshop on Multimedia Content\n Analysis in Sports (MMSports'23) @ ACM Multimedia"},{"id":"http://arxiv.org/abs/2309.06284v1","updated":"2023-09-12T14:43:47Z","published":"2023-09-12T14:43:47Z","title":"Fg-T2M: Fine-Grained Text-Driven Human Motion Generation via Diffusion\n Model","summary":" Text-driven human motion generation in computer vision is both significant\nand challenging. However, current methods are limited to producing either\ndeterministic or imprecise motion sequences, failing to effectively control the\ntemporal and spatial relationships required to conform to a given text\ndescription. In this work, we propose a fine-grained method for generating\nhigh-quality, conditional human motion sequences supporting precise text\ndescription. Our approach consists of two key components: 1) a\nlinguistics-structure assisted module that constructs accurate and complete\nlanguage feature to fully utilize text information; and 2) a context-aware\nprogressive reasoning module that learns neighborhood and overall semantic\nlinguistics features from shallow and deep graph neural networks to achieve a\nmulti-step inference. Experiments show that our approach outperforms\ntext-driven motion generation methods on HumanML3D and KIT test sets and\ngenerates better visually confirmed motion to the text conditions.\n","authors":["Yin Wang","Zhiying Leng","Frederick W. B. Li","Shun-Cheng Wu","Xiaohui Liang"],"pdf_url":"https://arxiv.org/pdf/2309.06284v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.06282v1","updated":"2023-09-12T14:42:22Z","published":"2023-09-12T14:42:22Z","title":"IBAFormer: Intra-batch Attention Transformer for Domain Generalized\n Semantic Segmentation","summary":" Domain generalized semantic segmentation (DGSS) is a critical yet challenging\ntask, where the model is trained only on source data without access to any\ntarget data. Despite the proposal of numerous DGSS strategies, the\ngeneralization capability remains limited in CNN architectures. Though some\nTransformer-based segmentation models show promising performance, they\nprimarily focus on capturing intra-sample attentive relationships, disregarding\ninter-sample correlations which can potentially benefit DGSS. To this end, we\nenhance the attention modules in Transformer networks for improving DGSS by\nincorporating information from other independent samples in the same batch,\nenriching contextual information, and diversifying the training data for each\nattention block. 
Specifically, we propose two alternative intra-batch attention\nmechanisms, namely mean-based intra-batch attention (MIBA) and element-wise\nintra-batch attention (EIBA), to capture correlations between different\nsamples, enhancing feature representation and generalization capabilities.\nBuilding upon intra-batch attention, we introduce IBAFormer, which integrates\nself-attention modules with the proposed intra-batch attention for DGSS.\nExtensive experiments demonstrate that IBAFormer achieves SOTA performance in\nDGSS, and ablation studies further confirm the effectiveness of each introduced\ncomponent.\n","authors":["Qiyu Sun","Huilin Chen","Meng Zheng","Ziyan Wu","Michael Felsberg","Yang Tang"],"pdf_url":"https://arxiv.org/pdf/2309.06282v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.04342v2","updated":"2023-09-12T14:39:25Z","published":"2023-09-08T14:12:03Z","title":"Revealing the preference for correcting separated aberrations in joint\n optic-image design","summary":" The joint design of the optical system and the downstream algorithm is a\nchallenging and promising task. Due to the demand for balancing the global\noptimal of imaging systems and the computational cost of physical simulation,\nexisting methods cannot achieve efficient joint design of complex systems such\nas smartphones and drones. In this work, starting from the perspective of the\noptical design, we characterize the optics with separated aberrations.\nAdditionally, to bridge the hardware and software without gradients, an image\nsimulation system is presented to reproduce the genuine imaging procedure of\nlenses with large field-of-views. As for aberration correction, we propose a\nnetwork to perceive and correct the spatially varying aberrations and validate\nits superiority over state-of-the-art methods. Comprehensive experiments reveal\nthat the preference for correcting separated aberrations in joint design is as\nfollows: longitudinal chromatic aberration, lateral chromatic aberration,\nspherical aberration, field curvature, and coma, with astigmatism coming last.\nDrawing from the preference, a 10% reduction in the total track length of the\nconsumer-level mobile phone lens module is accomplished. Moreover, this\nprocedure spares more space for manufacturing deviations, realizing\nextreme-quality enhancement of computational photography. The optimization\nparadigm provides innovative insight into the practical joint design of\nsophisticated optical systems and post-processing algorithms.\n","authors":["Jingwen Zhou","Shiqi Chen","Zheng Ren","Wenguan Zhang","Jiapu Yan","Huajun Feng","Qi Li","Yueting Chen"],"pdf_url":"https://arxiv.org/pdf/2309.04342v2.pdf","comment":"submitted to Optica"},{"id":"http://arxiv.org/abs/2309.06276v1","updated":"2023-09-12T14:37:41Z","published":"2023-09-12T14:37:41Z","title":"OTAS: Unsupervised Boundary Detection for Object-Centric Temporal Action\n Segmentation","summary":" Temporal action segmentation is typically achieved by discovering the\ndramatic variances in global visual descriptors. In this paper, we explore the\nmerits of local features by proposing the unsupervised framework of\nObject-centric Temporal Action Segmentation (OTAS). Broadly speaking, OTAS\nconsists of self-supervised global and local feature extraction modules as well\nas a boundary selection module that fuses the features and detects salient\nboundaries for action segmentation. 
As a second contribution, we discuss the\npros and cons of existing frame-level and boundary-level evaluation metrics.\nThrough extensive experiments, we find OTAS is superior to the previous\nstate-of-the-art method by $41\\%$ on average in terms of our recommended F1\nscore. Surprisingly, OTAS even outperforms the ground-truth human annotations\nin the user study. Moreover, OTAS is efficient enough to allow real-time\ninference.\n","authors":["Yuerong Li","Zhengrong Xue","Huazhe Xu"],"pdf_url":"https://arxiv.org/pdf/2309.06276v1.pdf","comment":"Accepted to WACV 2024"},{"id":"http://arxiv.org/abs/2205.10456v3","updated":"2023-09-12T14:22:36Z","published":"2022-05-20T22:47:19Z","title":"PSO-Convolutional Neural Networks with Heterogeneous Learning Rate","summary":" Convolutional Neural Networks (ConvNets or CNNs) have been candidly deployed\nin the scope of computer vision and related fields. Nevertheless, the dynamics\nof training of these neural networks lie still elusive: it is hard and\ncomputationally expensive to train them. A myriad of architectures and training\nstrategies have been proposed to overcome this challenge and address several\nproblems in image processing such as speech, image and action recognition as\nwell as object detection. In this article, we propose a novel Particle Swarm\nOptimization (PSO) based training for ConvNets. In such framework, the vector\nof weights of each ConvNet is typically cast as the position of a particle in\nphase space whereby PSO collaborative dynamics intertwines with Stochastic\nGradient Descent (SGD) in order to boost training performance and\ngeneralization. Our approach goes as follows: i) [regular phase] each ConvNet\nis trained independently via SGD; ii) [collaborative phase] ConvNets share\namong themselves their current vector of weights (or particle-position) along\nwith their gradient estimates of the Loss function. Distinct step sizes are\ncoined by distinct ConvNets. By properly blending ConvNets with large (possibly\nrandom) step-sizes along with more conservative ones, we propose an algorithm\nwith competitive performance with respect to other PSO-based approaches on\nCifar-10 and Cifar-100 (accuracy of 98.31% and 87.48%). These accuracy levels\nare obtained by resorting to only four ConvNets -- such results are expected to\nscale with the number of collaborative ConvNets accordingly. We make our source\ncodes available for download https://github.com/leonlha/PSO-ConvNet-Dynamics.\n","authors":["Nguyen Huu Phong","Augusto Santos","Bernardete Ribeiro"],"pdf_url":"https://arxiv.org/pdf/2205.10456v3.pdf","comment":"20 pages"},{"id":"http://arxiv.org/abs/2309.06262v1","updated":"2023-09-12T14:22:22Z","published":"2023-09-12T14:22:22Z","title":"Modality Unifying Network for Visible-Infrared Person Re-Identification","summary":" Visible-infrared person re-identification (VI-ReID) is a challenging task due\nto large cross-modality discrepancies and intra-class variations. Existing\nmethods mainly focus on learning modality-shared representations by embedding\ndifferent modalities into the same feature space. As a result, the learned\nfeature emphasizes the common patterns across modalities while suppressing\nmodality-specific and identity-aware information that is valuable for Re-ID. To\naddress these issues, we propose a novel Modality Unifying Network (MUN) to\nexplore a robust auxiliary modality for VI-ReID. 
First, the auxiliary modality\nis generated by combining the proposed cross-modality learner and\nintra-modality learner, which can dynamically model the modality-specific and\nmodality-shared representations to alleviate both cross-modality and\nintra-modality variations. Second, by aligning identity centres across the\nthree modalities, an identity alignment loss function is proposed to discover\nthe discriminative feature representations. Third, a modality alignment loss is\nintroduced to consistently reduce the distribution distance of visible and\ninfrared images by modality prototype modeling. Extensive experiments on\nmultiple public datasets demonstrate that the proposed method surpasses the\ncurrent state-of-the-art methods by a significant margin.\n","authors":["Hao Yu","Xu Cheng","Wei Peng","Weihao Liu","Guoying Zhao"],"pdf_url":"https://arxiv.org/pdf/2309.06262v1.pdf","comment":"11 pages, 5 figures. Accepted as the poster paper in ICCV2023"},{"id":"http://arxiv.org/abs/2309.06255v1","updated":"2023-09-12T14:16:34Z","published":"2023-09-12T14:16:34Z","title":"Enhancing Multi-modal Cooperation via Fine-grained Modality Valuation","summary":" One primary topic of multi-modal learning is to jointly incorporate\nheterogeneous information from different modalities. However, most models often\nsuffer from unsatisfactory multi-modal cooperation, which could not jointly\nutilize all modalities well. Some methods are proposed to identify and enhance\nthe worse learnt modality, but are often hard to provide the fine-grained\nobservation of multi-modal cooperation at sample-level with theoretical\nsupport. Hence, it is essential to reasonably observe and improve the\nfine-grained cooperation between modalities, especially when facing realistic\nscenarios where the modality discrepancy could vary across different samples.\nTo this end, we introduce a fine-grained modality valuation metric to evaluate\nthe contribution of each modality at sample-level. Via modality valuation, we\nregretfully observe that the multi-modal model tends to rely on one specific\nmodality, resulting in other modalities being low-contributing. We further\nanalyze this issue and improve cooperation between modalities by enhancing the\ndiscriminative ability of low-contributing modalities in a targeted manner.\nOverall, our methods reasonably observe the fine-grained uni-modal contribution\nat sample-level and achieve considerable improvement on different multi-modal\nmodels.\n","authors":["Yake Wei","Ruoxuan Feng","Zihe Wang","Di Hu"],"pdf_url":"https://arxiv.org/pdf/2309.06255v1.pdf","comment":"7 pages"},{"id":"http://arxiv.org/abs/2306.05980v3","updated":"2023-09-12T14:15:12Z","published":"2023-06-09T15:46:42Z","title":"Federated Learning for Medical Image Analysis: A Survey","summary":" Machine learning in medical imaging often faces a fundamental dilemma, namely\nthe small sample size problem. Many recent studies suggest using multi-domain\ndata pooled from different acquisition sites/datasets to improve statistical\npower. However, medical images from different sites cannot be easily shared to\nbuild large datasets for model training due to privacy protection reasons. As a\npromising solution, federated learning, which enables collaborative training of\nmachine learning models based on data from different sites without cross-site\ndata sharing, has attracted considerable attention recently. 
In this paper, we\nconduct a comprehensive survey of the recent development of federated learning\nmethods in medical image analysis. We first introduce the background and\nmotivation of federated learning for dealing with privacy protection and\ncollaborative learning issues in medical imaging. We then present a\ncomprehensive review of recent advances in federated learning methods for\nmedical image analysis. Specifically, existing methods are categorized based on\nthree critical aspects of a federated learning system, including client end,\nserver end, and communication techniques. In each category, we summarize the\nexisting federated learning methods according to specific research problems in\nmedical image analysis and also provide insights into the motivations of\ndifferent approaches. In addition, we provide a review of existing benchmark\nmedical imaging datasets and software platforms for current federated learning\nresearch. We also conduct an experimental study to empirically evaluate typical\nfederated learning methods for medical image analysis. This survey can help to\nbetter understand the current research status, challenges and potential\nresearch opportunities in this promising research field.\n","authors":["Hao Guan","Pew-Thian Yap","Andrea Bozoki","Mingxia Liu"],"pdf_url":"https://arxiv.org/pdf/2306.05980v3.pdf","comment":"17 pages, 6 figures"},{"id":"http://arxiv.org/abs/2211.07440v2","updated":"2023-09-12T14:07:13Z","published":"2022-11-14T15:14:50Z","title":"Leveraging Automatic Personalised Nutrition: Food Image Recognition\n Benchmark and Dataset based on Nutrition Taxonomy","summary":" Leading a healthy lifestyle has become one of the most challenging goals in\ntoday's society due to our sedentary lifestyle and poor eating habits. As a\nresult, national and international organisms have made numerous efforts to\npromote healthier food diets and physical activity habits. However, these\nrecommendations are sometimes difficult to follow in our daily life and they\nare also based on a general population. As a consequence, a new area of\nresearch, personalised nutrition, has been conceived focusing on individual\nsolutions through smart devices and Artificial Intelligence (AI) methods.\n This study presents the AI4Food-NutritionDB database, the first nutrition\ndatabase that considers food images and a nutrition taxonomy based on\nrecommendations by national and international organisms. In addition, four\ndifferent categorisation levels are considered following nutrition experts: 6\nnutritional levels, 19 main categories (e.g., \"Meat\"), 73 subcategories (e.g.,\n\"White Meat\"), and 893 final food products (e.g., \"Chicken\"). The\nAI4Food-NutritionDB opens the doors to new food computing approaches in terms\nof food intake frequency, quality, and categorisation. Also, in addition to the\ndatabase, we propose a standard experimental protocol and benchmark including\nthree tasks based on the nutrition taxonomy (i.e., category, subcategory, and\nfinal product) to be used for the research community. 
Finally, we also release\nour Deep Learning models trained with the AI4Food-NutritionDB, which can be\nused as pre-trained models, achieving accurate recognition results with\nchallenging food image databases.\n","authors":["Sergio Romero-Tapiador","Ruben Tolosana","Aythami Morales","Isabel Espinosa-Salinas","Gala Freixer","Julian Fierrez","Ruben Vera-Rodriguez","Enrique Carrillo de Santa Pau","Ana Ramírez de Molina","Javier Ortega-Garcia"],"pdf_url":"https://arxiv.org/pdf/2211.07440v2.pdf","comment":"10 pages, 3 figures, 4 tables"},{"id":"http://arxiv.org/abs/2308.05911v2","updated":"2023-09-12T14:01:07Z","published":"2023-08-11T02:25:58Z","title":"Collaborative Tracking Learning for Frame-Rate-Insensitive Multi-Object\n Tracking","summary":" Multi-object tracking (MOT) at low frame rates can reduce computational,\nstorage and power overhead to better meet the constraints of edge devices. Many\nexisting MOT methods suffer from significant performance degradation in\nlow-frame-rate videos due to significant location and appearance changes\nbetween adjacent frames. To this end, we propose to explore collaborative\ntracking learning (ColTrack) for frame-rate-insensitive MOT in a query-based\nend-to-end manner. Multiple historical queries of the same target jointly track\nit with richer temporal descriptions. Meanwhile, we insert an information\nrefinement module between every two temporal blocking decoders to better fuse\ntemporal clues and refine features. Moreover, a tracking object consistency\nloss is proposed to guide the interaction between historical queries. Extensive\nexperimental results demonstrate that in high-frame-rate videos, ColTrack\nobtains higher performance than state-of-the-art methods on large-scale\ndatasets Dancetrack and BDD100K, and outperforms the existing end-to-end\nmethods on MOT17. More importantly, ColTrack has a significant advantage over\nstate-of-the-art methods in low-frame-rate videos, which allows it to obtain\nfaster processing speeds by reducing frame-rate requirements while maintaining\nhigher performance. Code will be released at\nhttps://github.com/yolomax/ColTrack\n","authors":["Yiheng Liu","Junta Wu","Yi Fu"],"pdf_url":"https://arxiv.org/pdf/2308.05911v2.pdf","comment":"Accepted by ICCV 2023"},{"id":"http://arxiv.org/abs/2307.15615v2","updated":"2023-09-12T13:56:09Z","published":"2023-07-28T15:22:34Z","title":"A survey on deep learning in medical image registration: new\n technologies, uncertainty, evaluation metrics, and beyond","summary":" Deep learning technologies have dramatically reshaped the field of medical\nimage registration over the past decade. The initial developments, such as\nResNet-based and U-Net-based networks, established the foundation for deep\nlearning in image registration. Subsequent progress has been made in various\naspects of deep learning-based registration, including similarity measures,\ndeformation regularizations, and uncertainty estimation. These advancements\nhave not only enriched the field of image registration but have also\nfacilitated its application in a wide range of tasks, including atlas\nconstruction, multi-atlas segmentation, motion estimation, and 2D-3D\nregistration. In this paper, we present a comprehensive overview of the most\nrecent advancements in deep learning-based image registration. We begin with a\nconcise introduction to the core concepts of deep learning-based image\nregistration. 
Then, we delve into innovative network architectures, loss\nfunctions specific to registration, and methods for estimating registration\nuncertainty. Additionally, this paper explores appropriate evaluation metrics\nfor assessing the performance of deep learning models in registration tasks.\nFinally, we highlight the practical applications of these novel techniques in\nmedical imaging and discuss the future prospects of deep learning-based image\nregistration.\n","authors":["Junyu Chen","Yihao Liu","Shuwen Wei","Zhangxing Bian","Shalini Subramanian","Aaron Carass","Jerry L. Prince","Yong Du"],"pdf_url":"https://arxiv.org/pdf/2307.15615v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2302.02070v2","updated":"2023-09-12T13:43:38Z","published":"2023-02-04T02:47:41Z","title":"Semantic-Guided Generative Image Augmentation Method with Diffusion\n Models for Image Classification","summary":" Existing image augmentation methods consist of two categories:\nperturbation-based methods and generative methods. Perturbation-based methods\napply pre-defined perturbations to augment an original image, but only locally\nvary the image, thus lacking image diversity. In contrast, generative methods\nbring more image diversity in the augmented images but may not preserve\nsemantic consistency, thus incorrectly changing the essential semantics of the\noriginal image. To balance image diversity and semantic consistency in\naugmented images, we propose SGID, a Semantic-guided Generative Image\naugmentation method with Diffusion models for image classification.\nSpecifically, SGID employs diffusion models to generate augmented images with\ngood image diversity. More importantly, SGID takes image labels and captions as\nguidance to maintain semantic consistency between the augmented and original\nimages. Experimental results show that SGID outperforms the best augmentation\nbaseline by 1.72% on ResNet-50 (from scratch), 0.33% on ViT (ImageNet-21k), and\n0.14% on CLIP-ViT (LAION-2B). Moreover, SGID can be combined with other image\naugmentation baselines and further improves the overall performance. We\ndemonstrate the semantic consistency and image diversity of SGID through\nquantitative human and automated evaluations, as well as qualitative case\nstudies.\n","authors":["Bohan Li","Xiao Xu","Xinghao Wang","Yutai Hou","Yunlong Feng","Feng Wang","Xuanliang Zhang","Qingfu Zhu","Wanxiang Che"],"pdf_url":"https://arxiv.org/pdf/2302.02070v2.pdf","comment":"17 pages, 13 figures, 8 tables"},{"id":"http://arxiv.org/abs/2309.06221v1","updated":"2023-09-12T13:41:59Z","published":"2023-09-12T13:41:59Z","title":"Use neural networks to recognize students' handwritten letters and\n incorrect symbols","summary":" Correcting students' multiple-choice answers is a repetitive and mechanical\ntask that can be considered an image multi-classification task. Assuming\npossible options are 'abcd' and the correct option is one of the four, some\nstudents may write incorrect symbols or options that do not exist. In this\npaper, five classifications were set up - four for possible correct options and\none for other incorrect writing. 
This approach takes into account the\npossibility of non-standard writing options.\n","authors":["JiaJun Zhu","Zichuan Yang","Binjie Hong","Jiacheng Song","Jiwei Wang","Tianhao Chen","Shuilan Yang","Zixun Lan","Fei Ma"],"pdf_url":"https://arxiv.org/pdf/2309.06221v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.06219v1","updated":"2023-09-12T13:38:44Z","published":"2023-09-12T13:38:44Z","title":"Human Action Co-occurrence in Lifestyle Vlogs using Graph Link\n Prediction","summary":" We introduce the task of automatic human action co-occurrence identification,\ni.e., determine whether two human actions can co-occur in the same interval of\ntime. We create and make publicly available the ACE (Action Co-occurrencE)\ndataset, consisting of a large graph of ~12k co-occurring pairs of visual\nactions and their corresponding video clips. We describe graph link prediction\nmodels that leverage visual and textual information to automatically infer if\ntwo actions are co-occurring. We show that graphs are particularly well suited\nto capture relations between human actions, and the learned graph\nrepresentations are effective for our task and capture novel and relevant\ninformation across different data domains. The ACE dataset and the code\nintroduced in this paper are publicly available at\nhttps://github.com/MichiganNLP/vlog_action_co-occurrence.\n","authors":["Oana Ignat","Santiago Castro","Weiji Li","Rada Mihalcea"],"pdf_url":"https://arxiv.org/pdf/2309.06219v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.06207v1","updated":"2023-09-12T13:21:12Z","published":"2023-09-12T13:21:12Z","title":"SGFeat: Salient Geometric Feature for Point Cloud Registration","summary":" Point Cloud Registration (PCR) is a critical and challenging task in computer\nvision. One of the primary difficulties in PCR is identifying salient and\nmeaningful points that exhibit consistent semantic and geometric properties\nacross different scans. Previous methods have encountered challenges with\nambiguous matching due to the similarity among patch blocks throughout the\nentire point cloud and the lack of consideration for efficient global geometric\nconsistency. To address these issues, we propose a new framework that includes\nseveral novel techniques. Firstly, we introduce a semantic-aware geometric\nencoder that combines object-level and patch-level semantic information. This\nencoder significantly improves registration recall by reducing ambiguity in\npatch-level superpoint matching. Additionally, we incorporate a prior knowledge\napproach that utilizes an intrinsic shape signature to identify salient points.\nThis enables us to extract the most salient super points and meaningful dense\npoints in the scene. Secondly, we introduce an innovative transformer that\nencodes High-Order (HO) geometric features. These features are crucial for\nidentifying salient points within initial overlap regions while considering\nglobal high-order geometric consistency. To optimize this high-order\ntransformer further, we introduce an anchor node selection strategy. By\nencoding inter-frame triangle or polyhedron consistency features based on these\nanchor nodes, we can effectively learn high-order geometric features of salient\nsuper points. These high-order features are then propagated to dense points and\nutilized by a Sinkhorn matching module to identify key correspondences for\nsuccessful registration. 
In our experiments conducted on well-known datasets\nsuch as 3DMatch/3DLoMatch and KITTI, our approach has shown promising results,\nhighlighting the effectiveness of our novel method.\n","authors":["Qianliang Wu","Yaqing Ding","Lei Luo","Chuanwei Zhou","Jin Xie","Jian Yang"],"pdf_url":"https://arxiv.org/pdf/2309.06207v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.02203v4","updated":"2023-09-12T13:13:29Z","published":"2023-07-05T10:54:50Z","title":"Neural Fields for Interactive Visualization of Statistical Dependencies\n in 3D Simulation Ensembles","summary":" We present the first neural network that has learned to compactly represent\nand can efficiently reconstruct the statistical dependencies between the values\nof physical variables at different spatial locations in large 3D simulation\nensembles. Going beyond linear dependencies, we consider mutual information as\na measure of non-linear dependence. We demonstrate learning and reconstruction\nwith a large weather forecast ensemble comprising 1000 members, each storing\nmultiple physical variables at a 250 x 352 x 20 simulation grid. By\ncircumventing compute-intensive statistical estimators at runtime, we\ndemonstrate significantly reduced memory and computation requirements for\nreconstructing the major dependence structures. This enables embedding the\nestimator into a GPU-accelerated direct volume renderer and interactively\nvisualizing all mutual dependencies for a selected domain point.\n","authors":["Fatemeh Farokhmanesh","Kevin Höhlein","Christoph Neuhauser","Rüdiger Westermann"],"pdf_url":"https://arxiv.org/pdf/2307.02203v4.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.06202v1","updated":"2023-09-12T13:10:06Z","published":"2023-09-12T13:10:06Z","title":"Fast Sparse PCA via Positive Semidefinite Projection for Unsupervised\n Feature Selection","summary":" In the field of unsupervised feature selection, sparse principal component\nanalysis (SPCA) methods have attracted more and more attention recently.\nCompared to spectral-based methods, SPCA methods don't rely on the construction\nof a similarity matrix and show better feature selection ability on real-world\ndata. The original SPCA formulates a nonconvex optimization problem. Existing\nconvex SPCA methods reformulate SPCA as a convex model by regarding the\nreconstruction matrix as an optimization variable. However, they are lack of\nconstraints equivalent to the orthogonality restriction in SPCA, leading to\nlarger solution space. In this paper, it's proved that the optimal solution to\na convex SPCA model falls onto the Positive Semidefinite (PSD) cone. A standard\nconvex SPCA-based model with PSD constraint for unsupervised feature selection\nis proposed. Further, a two-step fast optimization algorithm via PSD projection\nis presented to solve the proposed model. Two other existing convex SPCA-based\nmodels are also proven to have their solutions optimized on the PSD cone in\nthis paper. Therefore, the PSD versions of these two models are proposed to\naccelerate their convergence as well. We also provide a regularization\nparameter setting strategy for our proposed method. 
Experiments on synthetic\nand real-world datasets demonstrate the effectiveness and efficiency of the\nproposed methods.\n","authors":["Junjing Zheng","Xinyu Zhang","Yongxiang Liu","Weidong Jiang","Kai Huo","Li Liu"],"pdf_url":"https://arxiv.org/pdf/2309.06202v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2303.10042v2","updated":"2023-09-12T13:08:53Z","published":"2023-03-17T15:12:25Z","title":"ShaRPy: Shape Reconstruction and Hand Pose Estimation from RGB-D with\n Uncertainty","summary":" Despite their potential, markerless hand tracking technologies are not yet\napplied in practice to the diagnosis or monitoring of the activity in\ninflammatory musculoskeletal diseases. One reason is that the focus of most\nmethods lies in the reconstruction of coarse, plausible poses, whereas in the\nclinical context, accurate, interpretable, and reliable results are required.\nTherefore, we propose ShaRPy, the first RGB-D Shape Reconstruction and hand\nPose tracking system, which provides uncertainty estimates of the computed\npose, e.g., when a finger is hidden or its estimate is inconsistent with the\nobservations in the input, to guide clinical decision-making. Besides pose,\nShaRPy approximates a personalized hand shape, promoting a more realistic and\nintuitive understanding of its digital twin. Our method requires only a\nlight-weight setup with a single consumer-level RGB-D camera yet it is able to\ndistinguish similar poses with only small joint angle deviations in a\nmetrically accurate space. This is achieved by combining a data-driven dense\ncorrespondence predictor with traditional energy minimization. To bridge the\ngap between interactive visualization and biomedical simulation we leverage a\nparametric hand model in which we incorporate biomedical constraints and\noptimize for both, its pose and hand shape. We evaluate ShaRPy on a keypoint\ndetection benchmark and show qualitative results of hand function assessments\nfor activity monitoring of musculoskeletal diseases.\n","authors":["Vanessa Wirth","Anna-Maria Liphardt","Birte Coppers","Johanna Bräunig","Simon Heinrich","Sigrid Leyendecker","Arnd Kleyer","Georg Schett","Martin Vossiek","Bernhard Egger","Marc Stamminger"],"pdf_url":"https://arxiv.org/pdf/2303.10042v2.pdf","comment":"Accepted at ICCVW (CVAMD) 2023"},{"id":"http://arxiv.org/abs/2309.06199v1","updated":"2023-09-12T13:08:46Z","published":"2023-09-12T13:08:46Z","title":"SCP: Scene Completion Pre-training for 3D Object Detection","summary":" 3D object detection using LiDAR point clouds is a fundamental task in the\nfields of computer vision, robotics, and autonomous driving. However, existing\n3D detectors heavily rely on annotated datasets, which are both time-consuming\nand prone to errors during the process of labeling 3D bounding boxes. In this\npaper, we propose a Scene Completion Pre-training (SCP) method to enhance the\nperformance of 3D object detectors with less labeled data. SCP offers three key\nadvantages: (1) Improved initialization of the point cloud model. By completing\nthe scene point clouds, SCP effectively captures the spatial and semantic\nrelationships among objects within urban environments. (2) Elimination of the\nneed for additional datasets. SCP serves as a valuable auxiliary network that\ndoes not impose any additional efforts or data requirements on the 3D\ndetectors. (3) Reduction of the amount of labeled data for detection. 
With the\nhelp of SCP, the existing state-of-the-art 3D detectors can achieve comparable\nperformance while only relying on 20% labeled data.\n","authors":["Yiming Shan","Yan Xia","Yuhong Chen","Daniel Cremers"],"pdf_url":"https://arxiv.org/pdf/2309.06199v1.pdf","comment":"Wins the best paper award at ISPRS Geospatial Week 2023"},{"id":"http://arxiv.org/abs/2309.06197v1","updated":"2023-09-12T13:04:41Z","published":"2023-09-12T13:04:41Z","title":"360$^\\circ$ from a Single Camera: A Few-Shot Approach for LiDAR\n Segmentation","summary":" Deep learning applications on LiDAR data suffer from a strong domain gap when\napplied to different sensors or tasks. In order for these methods to obtain\nsimilar accuracy on different data in comparison to values reported on public\nbenchmarks, a large scale annotated dataset is necessary. However, in practical\napplications labeled data is costly and time consuming to obtain. Such factors\nhave triggered various research in label-efficient methods, but a large gap\nremains to their fully-supervised counterparts. Thus, we propose ImageTo360, an\neffective and streamlined few-shot approach to label-efficient LiDAR\nsegmentation. Our method utilizes an image teacher network to generate semantic\npredictions for LiDAR data within a single camera view. The teacher is used to\npretrain the LiDAR segmentation student network, prior to optional fine-tuning\non 360$^\\circ$ data. Our method is implemented in a modular manner on the point\nlevel and as such is generalizable to different architectures. We improve over\nthe current state-of-the-art results for label-efficient methods and even\nsurpass some traditional fully-supervised segmentation networks.\n","authors":["Laurenz Reichardt","Nikolas Ebert","Oliver Wasenmüller"],"pdf_url":"https://arxiv.org/pdf/2309.06197v1.pdf","comment":"ICCV Workshop 2023"},{"id":"http://arxiv.org/abs/2309.06194v1","updated":"2023-09-12T13:03:32Z","published":"2023-09-12T13:03:32Z","title":"A 3M-Hybrid Model for the Restoration of Unique Giant Murals: A Case\n Study on the Murals of Yongle Palace","summary":" The Yongle Palace murals, as valuable cultural heritage, have suffered\nvarying degrees of damage, making their restoration of significant importance.\nHowever, the giant size and unique data of Yongle Palace murals present\nchallenges for existing deep-learning based restoration methods: 1) The\ndistinctive style introduces domain bias in traditional transfer learning-based\nrestoration methods, while the scarcity of mural data further limits the\napplicability of these methods. 2) Additionally, the giant size of these murals\nresults in a wider range of defect types and sizes, necessitating models with\ngreater adaptability. Consequently, there is a lack of focus on deep\nlearning-based restoration methods for the unique giant murals of Yongle\nPalace. Here, a 3M-Hybrid model is proposed to address these challenges.\nFirstly, based on the characteristic that the mural data frequency is prominent\nin the distribution of low and high frequency features, high and low frequency\nfeatures are separately abstracted for complementary learning. Furthermore, we\nintegrate a pre-trained Vision Transformer model (VIT) into the CNN module,\nallowing us to leverage the benefits of a large model while mitigating domain\nbias. 
Secondly, we mitigate seam and structural distortion issues resulting\nfrom the restoration of large defects by employing a multi-scale and\nmulti-perspective strategy, including data segmentation and fusion.\nExperimental results demonstrate the efficacy of our proposed model. In\nregular-sized mural restoration, it improves SSIM and PSNR by 14.61% and 4.73%,\nrespectively, compared to the best model among four representative CNN models.\nAdditionally, it achieves favorable results in the final restoration of giant\nmurals.\n","authors":["Jing Yang","Nur Intan Raihana Ruhaiyem","Chichun Zhou"],"pdf_url":"https://arxiv.org/pdf/2309.06194v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.06188v1","updated":"2023-09-12T12:54:12Z","published":"2023-09-12T12:54:12Z","title":"Computer Vision Pipeline for Automated Antarctic Krill Analysis","summary":" British Antarctic Survey (BAS) researchers launch annual expeditions to the\nAntarctic in order to estimate Antarctic Krill biomass and assess the change\nfrom previous years. These comparisons provide insight into the effects of the\ncurrent environment on this key component of the marine food chain. In this\nwork we have developed tools for automating the data collection and analysis\nprocess, using web-based image annotation tools and deep learning image\nclassification and regression models. We achieve highly accurate krill instance\nsegmentation results with an average 77.28% AP score, as well as separate\nmaturity stage and length estimation of krill specimens with 62.99% accuracy\nand a 1.96 mm length error respectively.\n","authors":["Mazvydas Gudelis","Michal Mackiewicz","Julie Bremner","Sophie Fielding"],"pdf_url":"https://arxiv.org/pdf/2309.06188v1.pdf","comment":"Submitted to MVEO 2023 @ BMVC 2023"},{"id":"http://arxiv.org/abs/2309.06176v1","updated":"2023-09-12T12:43:50Z","published":"2023-09-12T12:43:50Z","title":"Dual-Path Temporal Map Optimization for Make-up Temporal Video Grounding","summary":" Make-up temporal video grounding (MTVG) aims to localize the target video\nsegment which is semantically related to a sentence describing a make-up\nactivity, given a long video. Compared with the general video grounding task,\nMTVG focuses on meticulous actions and changes on the face. The make-up\ninstruction step, usually involving detailed differences in products and facial\nareas, is more fine-grained than general activities (e.g, cooking activity and\nfurniture assembly). Thus, existing general approaches cannot locate the target\nactivity effectually. More specifically, existing proposal generation modules\nare not yet fully developed in providing semantic cues for the more\nfine-grained make-up semantic comprehension. To tackle this issue, we propose\nan effective proposal-based framework named Dual-Path Temporal Map Optimization\nNetwork (DPTMO) to capture fine-grained multimodal semantic details of make-up\nactivities. DPTMO extracts both query-agnostic and query-guided features to\nconstruct two proposal sets and uses specific evaluation methods for the two\nsets. Different from the commonly used single structure in previous methods,\nour dual-path structure can mine more semantic information in make-up videos\nand distinguish fine-grained actions well. These two candidate sets represent\nthe cross-modal makeup video-text similarity and multi-modal fusion\nrelationship, complementing each other. 
Each set corresponds to its respective\noptimization perspective, and their joint prediction enhances the accuracy of\nvideo timestamp prediction. Comprehensive experiments on the YouMakeup dataset\ndemonstrate our proposed dual structure excels in fine-grained semantic\ncomprehension.\n","authors":["Jiaxiu Li","Kun Li","Jia Li","Guoliang Chen","Dan Guo","Meng Wang"],"pdf_url":"https://arxiv.org/pdf/2309.06176v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.06169v1","updated":"2023-09-12T12:27:17Z","published":"2023-09-12T12:27:17Z","title":"Elucidating the solution space of extended reverse-time SDE for\n diffusion models","summary":" Diffusion models (DMs) demonstrate potent image generation capabilities in\nvarious generative modeling tasks. Nevertheless, their primary limitation lies\nin slow sampling speed, requiring hundreds or thousands of sequential function\nevaluations through large neural networks to generate high-quality images.\nSampling from DMs can be seen as solving corresponding stochastic differential\nequations (SDEs) or ordinary differential equations (ODEs). In this work, we\nformulate the sampling process as an extended reverse-time SDE (ER SDE),\nunifying prior explorations into ODEs and SDEs. Leveraging the semi-linear\nstructure of ER SDE solutions, we offer exact solutions and arbitrarily\nhigh-order approximate solutions for VP SDE and VE SDE, respectively. Based on\nthe solution space of the ER SDE, we yield mathematical insights elucidating\nthe superior performance of ODE solvers over SDE solvers in terms of fast\nsampling. Additionally, we unveil that VP SDE solvers stand on par with their\nVE SDE counterparts. Finally, we devise fast and training-free samplers, ER-SDE\nSolvers, elevating the efficiency of stochastic samplers to unprecedented\nlevels. Experimental results demonstrate achieving 3.45 FID in 20 function\nevaluations and 2.24 FID in 50 function evaluations on the ImageNet\n64$\\times$64 dataset.\n","authors":["Qinpeng Cui","Xinyi Zhang","Zongqing Lu","Qingmin Liao"],"pdf_url":"https://arxiv.org/pdf/2309.06169v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.06166v1","updated":"2023-09-12T12:23:49Z","published":"2023-09-12T12:23:49Z","title":"Certified Robust Models with Slack Control and Large Lipschitz Constants","summary":" Despite recent success, state-of-the-art learning-based models remain highly\nvulnerable to input changes such as adversarial examples. In order to obtain\ncertifiable robustness against such perturbations, recent work considers\nLipschitz-based regularizers or constraints while at the same time increasing\nprediction margin. Unfortunately, this comes at the cost of significantly\ndecreased accuracy. In this paper, we propose a Calibrated Lipschitz-Margin\nLoss (CLL) that addresses this issue and improves certified robustness by\ntackling two problems: Firstly, commonly used margin losses do not adjust the\npenalties to the shrinking output distribution; caused by minimizing the\nLipschitz constant $K$. Secondly, and most importantly, we observe that\nminimization of $K$ can lead to overly smooth decision functions. This limits\nthe model's complexity and thus reduces accuracy. Our CLL addresses these\nissues by explicitly calibrating the loss w.r.t. margin and Lipschitz constant,\nthereby establishing full control over slack and improving robustness\ncertificates even with larger Lipschitz constants. 
On CIFAR-10, CIFAR-100 and\nTiny-ImageNet, our models consistently outperform losses that leave the\nconstant unattended. On CIFAR-100 and Tiny-ImageNet, CLL improves upon\nstate-of-the-art deterministic $L_2$ robust accuracies. In contrast to current\ntrends, we unlock potential of much smaller models without $K=1$ constraints.\n","authors":["Max Losch","David Stutz","Bernt Schiele","Mario Fritz"],"pdf_url":"https://arxiv.org/pdf/2309.06166v1.pdf","comment":"To be published at GCPR 2023"},{"id":"http://arxiv.org/abs/2207.06968v5","updated":"2023-09-12T12:14:56Z","published":"2022-07-14T14:53:50Z","title":"DASS: Differentiable Architecture Search for Sparse neural networks","summary":" The deployment of Deep Neural Networks (DNNs) on edge devices is hindered by\nthe substantial gap between performance requirements and available processing\npower. While recent research has made significant strides in developing pruning\nmethods to build a sparse network for reducing the computing overhead of DNNs,\nthere remains considerable accuracy loss, especially at high pruning ratios. We\nfind that the architectures designed for dense networks by differentiable\narchitecture search methods are ineffective when pruning mechanisms are applied\nto them. The main reason is that the current method does not support sparse\narchitectures in their search space and uses a search objective that is made\nfor dense networks and does not pay any attention to sparsity. In this paper,\nwe propose a new method to search for sparsity-friendly neural architectures.\nWe do this by adding two new sparse operations to the search space and\nmodifying the search objective. We propose two novel parametric SparseConv and\nSparseLinear operations in order to expand the search space to include sparse\noperations. In particular, these operations make a flexible search space due to\nusing sparse parametric versions of linear and convolution operations. The\nproposed search objective lets us train the architecture based on the sparsity\nof the search space operations. Quantitative analyses demonstrate that our\nsearch architectures outperform those used in the stateof-the-art sparse\nnetworks on the CIFAR-10 and ImageNet datasets. In terms of performance and\nhardware effectiveness, DASS increases the accuracy of the sparse version of\nMobileNet-v2 from 73.44% to 81.35% (+7.91% improvement) with 3.87x faster\ninference time.\n","authors":["Hamid Mousavi","Mohammad Loni","Mina Alibeigi","Masoud Daneshtalab"],"pdf_url":"https://arxiv.org/pdf/2207.06968v5.pdf","comment":"18 pages with 12 figures"},{"id":"http://arxiv.org/abs/2309.06159v1","updated":"2023-09-12T12:01:40Z","published":"2023-09-12T12:01:40Z","title":"Active Label Refinement for Semantic Segmentation of Satellite Images","summary":" Remote sensing through semantic segmentation of satellite images contributes\nto the understanding and utilisation of the earth's surface. For this purpose,\nsemantic segmentation networks are typically trained on large sets of labelled\nsatellite images. However, obtaining expert labels for these images is costly.\nTherefore, we propose to rely on a low-cost approach, e.g. crowdsourcing or\npretrained networks, to label the images in the first step. Since these initial\nlabels are partially erroneous, we use active learning strategies to\ncost-efficiently refine the labels in the second step. We evaluate the active\nlearning strategies using satellite images of Bengaluru in India, labelled with\nland cover and land use labels. 
Our experimental results suggest that an active\nlabel refinement to improve the semantic segmentation network's performance is\nbeneficial.\n","authors":["Tuan Pham Minh","Jayan Wijesingha","Daniel Kottke","Marek Herde","Denis Huseljic","Bernhard Sick","Michael Wachendorf","Thomas Esch"],"pdf_url":"https://arxiv.org/pdf/2309.06159v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.01703v2","updated":"2023-09-12T11:59:21Z","published":"2023-07-04T13:18:39Z","title":"Learning to Augment: Hallucinating Data for Domain Generalized\n Segmentation","summary":" Domain generalized semantic segmentation (DGSS) is an essential but highly\nchallenging task, in which the model is trained only on source data and any\ntarget data is not available. Existing DGSS methods primarily standardize the\nfeature distribution or utilize extra domain data for augmentation. However,\nthe former sacrifices valuable information and the latter introduces domain\nbiases. Therefore, generating diverse-style source data without auxiliary data\nemerges as an attractive strategy. In light of this, we propose GAN-based\nfeature augmentation (GBFA) that hallucinates stylized feature maps while\npreserving their semantic contents with a feature generator. The impressive\ngenerative capability of GANs enables GBFA to perform inter-channel and\ntrainable feature synthesis in an end-to-end framework. To enable learning\nGBFA, we introduce random image color augmentation (RICA), which adds a diverse\nrange of variations to source images during training. These augmented images\nare then passed through a feature extractor to obtain features tailored for\nGBFA training. Both GBFA and RICA operate exclusively within the source domain,\neliminating the need for auxiliary datasets. We conduct extensive experiments,\nand the generalization results from the synthetic GTAV and SYNTHIA to the real\nCityscapes, BDDS, and Mapillary datasets show that our method achieves\nstate-of-the-art performance in DGSS.\n","authors":["Qiyu Sun","Pavlo Melnyk","Michael Felsberg","Yang Tang"],"pdf_url":"https://arxiv.org/pdf/2307.01703v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.06143v1","updated":"2023-09-12T11:29:35Z","published":"2023-09-12T11:29:35Z","title":"Improving Generalization Capability of Deep Learning-Based Nuclei\n Instance Segmentation by Non-deterministic Train Time and Deterministic Test\n Time Stain Normalization","summary":" With the advent of digital pathology and microscopic systems that can scan\nand save whole slide histological images automatically, there is a growing\ntrend to use computerized methods to analyze acquired images. Among different\nhistopathological image analysis tasks, nuclei instance segmentation plays a\nfundamental role in a wide range of clinical and research applications. While\nmany semi- and fully-automatic computerized methods have been proposed for\nnuclei instance segmentation, deep learning (DL)-based approaches have been\nshown to deliver the best performances. However, the performance of such\napproaches usually degrades when tested on unseen datasets.\n In this work, we propose a novel approach to improve the generalization\ncapability of a DL-based automatic segmentation approach. Besides utilizing one\nof the state-of-the-art DL-based models as a baseline, our method incorporates\nnon-deterministic train time and deterministic test time stain normalization.\nWe trained the model with one single training set and evaluated its\nsegmentation performance on seven test datasets. 
Our results show that the\nproposed method provides up to 5.77%, 5.36%, and 5.27% better performance in\nsegmenting nuclei based on Dice score, aggregated Jaccard index, and panoptic\nquality score, respectively, compared to the baseline segmentation model.\n","authors":["Amirreza Mahbod","Georg Dorffner","Isabella Ellinger","Ramona Woitek","Sepideh Hatamikia"],"pdf_url":"https://arxiv.org/pdf/2309.06143v1.pdf","comment":"17 pages"},{"id":"http://arxiv.org/abs/2309.06142v1","updated":"2023-09-12T11:29:12Z","published":"2023-09-12T11:29:12Z","title":"Towards Reliable Domain Generalization: A New Dataset and Evaluations","summary":" There are ubiquitous distribution shifts in the real world. However, deep\nneural networks (DNNs) are easily biased towards the training set, which causes\nsevere performance degradation when they receive out-of-distribution data. Many\nmethods are studied to train models that generalize under various distribution\nshifts in the literature of domain generalization (DG). However, the recent\nDomainBed and WILDS benchmarks challenged the effectiveness of these methods.\nAiming at the problems in the existing research, we propose a new domain\ngeneralization task for handwritten Chinese character recognition (HCCR) to\nenrich the application scenarios of DG method research. We evaluate eighteen DG\nmethods on the proposed PaHCC (Printed and Handwritten Chinese Characters)\ndataset and show that the performance of existing methods on this dataset is\nstill unsatisfactory. Besides, under a designed dynamic DG setting, we reveal\nmore properties of DG methods and argue that only the leave-one-domain-out\nprotocol is unreliable. We advocate that researchers in the DG community refer\nto dynamic performance of methods for more comprehensive and reliable\nevaluation. Our dataset and evaluations bring new perspectives to the community\nfor more substantial progress. We will make our dataset public with the article\npublished to facilitate the study of domain generalization.\n","authors":["Jiao Zhang","Xu-Yao Zhang","Cheng-Lin Liu"],"pdf_url":"https://arxiv.org/pdf/2309.06142v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.06135v1","updated":"2023-09-12T11:19:36Z","published":"2023-09-12T11:19:36Z","title":"Prompting4Debugging: Red-Teaming Text-to-Image Diffusion Models by\n Finding Problematic Prompts","summary":" Text-to-image diffusion models, e.g. Stable Diffusion (SD), lately have shown\nremarkable ability in high-quality content generation, and become one of the\nrepresentatives for the recent wave of transformative AI. Nevertheless, such\nadvance comes with an intensifying concern about the misuse of this generative\ntechnology, especially for producing copyrighted or NSFW (i.e. not safe for\nwork) images. Although efforts have been made to filter inappropriate\nimages/prompts or remove undesirable concepts/styles via model fine-tuning, the\nreliability of these safety mechanisms against diversified problematic prompts\nremains largely unexplored. In this work, we propose Prompting4Debugging (P4D)\nas a debugging and red-teaming tool that automatically finds problematic\nprompts for diffusion models to test the reliability of a deployed safety\nmechanism. We demonstrate the efficacy of our P4D tool in uncovering new\nvulnerabilities of SD models with safety mechanisms. 
Particularly, our result\nshows that around half of prompts in existing safe prompting benchmarks which\nwere originally considered \"safe\" can actually be manipulated to bypass many\ndeployed safety mechanisms, including concept removal, negative prompt, and\nsafety guidance. Our findings suggest that, without comprehensive testing, the\nevaluations on limited safe prompting benchmarks can lead to a false sense of\nsafety for text-to-image models.\n","authors":["Zhi-Yi Chin","Chieh-Ming Jiang","Ching-Chun Huang","Pin-Yu Chen","Wei-Chen Chiu"],"pdf_url":"https://arxiv.org/pdf/2309.06135v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.06130v1","updated":"2023-09-12T11:17:25Z","published":"2023-09-12T11:17:25Z","title":"JOADAA: joint online action detection and action anticipation","summary":" Action anticipation involves forecasting future actions by connecting past\nevents to future ones. However, this reasoning ignores the real-life hierarchy\nof events which is considered to be composed of three main parts: past,\npresent, and future. We argue that considering these three main parts and their\ndependencies could improve performance. On the other hand, online action\ndetection is the task of predicting actions in a streaming manner. In this\ncase, one has access only to the past and present information. Therefore, in\nonline action detection (OAD) the existing approaches miss semantics or future\ninformation which limits their performance. To sum up, for both of these tasks,\nthe complete set of knowledge (past-present-future) is missing, which makes it\nchallenging to infer action dependencies, therefore having low performances. To\naddress this limitation, we propose to fuse both tasks into a single uniform\narchitecture. By combining action anticipation and online action detection, our\napproach can cover the missing dependencies of future information in online\naction detection. This method referred to as JOADAA, presents a uniform model\nthat jointly performs action anticipation and online action detection. We\nvalidate our proposed model on three challenging datasets: THUMOS'14, which is\na sparsely annotated dataset with one action per time step, CHARADES, and\nMulti-THUMOS, two densely annotated datasets with more complex scenarios.\nJOADAA achieves SOTA results on these benchmarks for both tasks.\n","authors":["Mohammed Guermal","Francois Bremond","Rui Dai","Abid Ali"],"pdf_url":"https://arxiv.org/pdf/2309.06130v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.06129v1","updated":"2023-09-12T11:08:14Z","published":"2023-09-12T11:08:14Z","title":"LEyes: A Lightweight Framework for Deep Learning-Based Eye Tracking\n using Synthetic Eye Images","summary":" Deep learning has bolstered gaze estimation techniques, but real-world\ndeployment has been impeded by inadequate training datasets. This problem is\nexacerbated by both hardware-induced variations in eye images and inherent\nbiological differences across the recorded participants, leading to both\nfeature and pixel-level variance that hinders the generalizability of models\ntrained on specific datasets. While synthetic datasets can be a solution, their\ncreation is both time and resource-intensive. To address this problem, we\npresent a framework called Light Eyes or \"LEyes\" which, unlike conventional\nphotorealistic methods, only models key image features required for video-based\neye tracking using simple light distributions. 
LEyes facilitates easy\nconfiguration for training neural networks across diverse gaze-estimation\ntasks. We demonstrate that models trained using LEyes outperform other\nstate-of-the-art algorithms in terms of pupil and CR localization across\nwell-known datasets. In addition, a LEyes trained model outperforms the\nindustry standard eye tracker using significantly more cost-effective hardware.\nGoing forward, we are confident that LEyes will revolutionize synthetic data\ngeneration for gaze estimation models, and lead to significant improvements of\nthe next generation video-based eye trackers.\n","authors":["sean anthony byrne","virmarie maquiling","marcus nyström","enkelejda kasneci","diederick c. niehorster"],"pdf_url":"https://arxiv.org/pdf/2309.06129v1.pdf","comment":"31 pages, 8 figures"},{"id":"http://arxiv.org/abs/2309.06123v1","updated":"2023-09-12T10:47:37Z","published":"2023-09-12T10:47:37Z","title":"Dynamic Visual Prompt Tuning for Parameter Efficient Transfer Learning","summary":" Parameter efficient transfer learning (PETL) is an emerging research spot\nthat aims to adapt large-scale pre-trained models to downstream tasks. Recent\nadvances have achieved great success in saving storage and computation costs.\nHowever, these methods do not take into account instance-specific visual clues\nfor visual tasks. In this paper, we propose a Dynamic Visual Prompt Tuning\nframework (DVPT), which can generate a dynamic instance-wise token for each\nimage. In this way, it can capture the unique visual feature of each image,\nwhich can be more suitable for downstream visual tasks. We designed a Meta-Net\nmodule that can generate learnable prompts based on each image, thereby\ncapturing dynamic instance-wise visual features. Extensive experiments on a\nwide range of downstream recognition tasks show that DVPT achieves superior\nperformance than other PETL methods. More importantly, DVPT even outperforms\nfull fine-tuning on 17 out of 19 downstream tasks while maintaining high\nparameter efficiency. Our code will be released soon.\n","authors":["Chunqing Ruan","Hongjian Wang"],"pdf_url":"https://arxiv.org/pdf/2309.06123v1.pdf","comment":"accepted by 2023 PRCV"},{"id":"http://arxiv.org/abs/2309.06118v1","updated":"2023-09-12T10:33:19Z","published":"2023-09-12T10:33:19Z","title":"C-RITNet: Set Infrared and Visible Image Fusion Free from Complementary\n Information Mining","summary":" Infrared and visible image fusion (IVIF) aims to extract and integrate the\ncomplementary information in two different modalities to generate high-quality\nfused images with salient targets and abundant texture details. However,\ncurrent image fusion methods go to great lengths to excavate complementary\nfeatures, which is generally achieved through two efforts. On the one hand, the\nfeature extraction network is expected to have excellent performance in\nextracting complementary information. On the other hand, complex fusion\nstrategies are often designed to aggregate the complementary information. In\nother words, enabling the network to perceive and extract complementary\ninformation is extremely challenging. Complicated fusion strategies, while\neffective, still run the risk of losing weak edge details. To this end, this\npaper rethinks the IVIF outside the box, proposing a complementary-redundant\ninformation transfer network (C-RITNet). It reasonably transfers complementary\ninformation into redundant one, which integrates both the shared and\ncomplementary features from two modalities. 
Hence, the proposed method is able\nto alleviate the challenges posed by the complementary information extraction\nand reduce the reliance on sophisticated fusion strategies. Specifically, to\nskillfully sidestep aggregating complementary information in IVIF, we first\ndesign the mutual information transfer (MIT) module to mutually represent\nfeatures from two modalities, roughly transferring complementary information\ninto redundant one. Then, a redundant information acquisition supervised by\nsource image (RIASSI) module is devised to further ensure the\ncomplementary-redundant information transfer after MIT. Meanwhile, we also\npropose a structure information preservation (SIP) module to guarantee that the\nedge structure information of the source images can be transferred to the\nfusion results.\n","authors":["Yafei Zhang","Keying Du","Huafeng Li","Zhengtao Yu","Yu Liu"],"pdf_url":"https://arxiv.org/pdf/2309.06118v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.06107v1","updated":"2023-09-12T10:20:38Z","published":"2023-09-12T10:20:38Z","title":"HOC-Search: Efficient CAD Model and Pose Retrieval from RGB-D Scans","summary":" We present an automated and efficient approach for retrieving high-quality\nCAD models of objects and their poses in a scene captured by a moving RGB-D\ncamera. We first investigate various objective functions to measure similarity\nbetween a candidate CAD object model and the available data, and the best\nobjective function appears to be a \"render-and-compare\" method comparing depth\nand mask rendering. We thus introduce a fast-search method that approximates an\nexhaustive search based on this objective function for simultaneously\nretrieving the object category, a CAD model, and the pose of an object given an\napproximate 3D bounding box. This method involves a search tree that organizes\nthe CAD models and object properties including object category and pose for\nfast retrieval and an algorithm inspired by Monte Carlo Tree Search, that\nefficiently searches this tree. We show that this method retrieves CAD models\nthat fit the real objects very well, with a speed-up factor of 10x to 120x\ncompared to exhaustive search.\n","authors":["Stefan Ainetter","Sinisa Stekovic","Friedrich Fraundorfer","Vincent Lepetit"],"pdf_url":"https://arxiv.org/pdf/2309.06107v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.06105v1","updated":"2023-09-12T10:17:28Z","published":"2023-09-12T10:17:28Z","title":"Towards Visual Taxonomy Expansion","summary":" Taxonomy expansion task is essential in organizing the ever-increasing volume\nof new concepts into existing taxonomies. Most existing methods focus\nexclusively on using textual semantics, leading to an inability to generalize\nto unseen terms and the \"Prototypical Hypernym Problem.\" In this paper, we\npropose Visual Taxonomy Expansion (VTE), introducing visual features into the\ntaxonomy expansion task. We propose a textual hypernymy learning task and a\nvisual prototype learning task to cluster textual and visual semantics. In\naddition to the tasks on respective modalities, we introduce a hyper-proto\nconstraint that integrates textual and visual semantics to produce fine-grained\nvisual semantics. Our method is evaluated on two datasets, where we obtain\ncompelling results. Specifically, on the Chinese taxonomy dataset, our method\nsignificantly improves accuracy by 8.75 %. 
Additionally, our approach performs\nbetter than ChatGPT on the Chinese taxonomy dataset.\n","authors":["Tinghui Zhu","Jingping Liu","Jiaqing Liang","Haiyun Jiang","Yanghua Xiao","Zongyu Wang","Rui Xie","Yunsen Xian"],"pdf_url":"https://arxiv.org/pdf/2309.06105v1.pdf","comment":"ACMMM accepted paper"},{"id":"http://arxiv.org/abs/2309.06102v1","updated":"2023-09-12T10:08:33Z","published":"2023-09-12T10:08:33Z","title":"Can we predict the Most Replayed data of video streaming platforms?","summary":" Predicting which specific parts of a video users will replay is important for\nseveral applications, including targeted advertisement placement on video\nplatforms and assisting video creators. In this work, we explore whether it is\npossible to predict the Most Replayed (MR) data from YouTube videos. To this\nend, we curate a large video benchmark, the YTMR500 dataset, which comprises\n500 YouTube videos with MR data annotations. We evaluate Deep Learning (DL)\nmodels of varying complexity on our dataset and perform an extensive ablation\nstudy. In addition, we conduct a user study to estimate the human performance\non MR data prediction. Our results show that, although by a narrow margin, all\nthe evaluated DL models outperform random predictions. Additionally, they\nexceed human-level accuracy. This suggests that predicting the MR data is a\ndifficult task that can be enhanced through the assistance of DL. Finally, we\nbelieve that DL performance on MR data prediction can be further improved, for\nexample, by using multi-modal learning. We encourage the research community to\nuse our benchmark dataset to further investigate automatic MR data prediction.\n","authors":["Alessandro Duico","Ombretta Strafforello","Jan van Gemert"],"pdf_url":"https://arxiv.org/pdf/2309.06102v1.pdf","comment":"Accepted Extended Abstract at ICCV 2023 Workshop on AI for Creative\n Video Editing and Understanding"},{"id":"http://arxiv.org/abs/2309.06095v1","updated":"2023-09-12T10:00:23Z","published":"2023-09-12T10:00:23Z","title":"Estimating exercise-induced fatigue from thermal facial images","summary":" Exercise-induced fatigue resulting from physical activity can be an early\nindicator of overtraining, illness, or other health issues. In this article, we\npresent an automated method for estimating exercise-induced fatigue levels\nthrough the use of thermal imaging and facial analysis techniques utilizing\ndeep learning models. Leveraging a novel dataset comprising over 400,000\nthermal facial images of rested and fatigued users, our results suggest that\nexercise-induced fatigue levels could be predicted with only one static thermal\nframe with an average error smaller than 15\\%. The results emphasize the\nviability of using thermal imaging in conjunction with deep learning for\nreliable exercise-induced fatigue estimation.\n","authors":["Manuel Lage Cañellas","Constantino Álvarez Casado","Le Nguyen","Miguel Bordallo López"],"pdf_url":"https://arxiv.org/pdf/2309.06095v1.pdf","comment":"5 pages"},{"id":"http://arxiv.org/abs/2308.10161v3","updated":"2023-09-12T09:45:02Z","published":"2023-08-20T04:34:30Z","title":"ThermRad: A Multi-modal Dataset for Robust 3D Object Detection under\n Challenging Conditions","summary":" Robust 3D object detection in extreme weather and illumination conditions is\na challenging task. While radars and thermal cameras are known for their\nresilience to these conditions, few studies have been conducted on\nradar-thermal fusion due to the lack of corresponding datasets. 
To address this\ngap, we first present a new multi-modal dataset called ThermRad, which includes\na 3D LiDAR, a 4D radar, an RGB camera and a thermal camera. This dataset is\nunique because it includes data from all four sensors in extreme weather\nconditions, providing a valuable resource for future research in this area. To\nvalidate the robustness of 4D radars and thermal cameras for 3D object\ndetection in challenging weather conditions, we propose a new multi-modal\nfusion method called RTDF-RCNN, which leverages the complementary strengths of\n4D radars and thermal cameras to boost object detection performance. To further\nprove the effectiveness of our proposed framework, we re-implement\nstate-of-the-art (SOTA) 3D detectors on our dataset as benchmarks for\nevaluation. Our method achieves significant enhancements in detecting cars,\npedestrians, and cyclists, with improvements of over 7.98%, 24.27%, and 27.15%,\nrespectively, while achieving comparable results to LiDAR-based approaches. Our\ncontributions in both the ThermRad dataset and the new multi-modal fusion\nmethod provide a new approach to robust 3D object detection in adverse weather\nand illumination conditions. The ThermRad dataset will be released.\n","authors":["Qiao Yan","Yihan Wang"],"pdf_url":"https://arxiv.org/pdf/2308.10161v3.pdf","comment":"At this time, we have not reached a definitive agreement regarding\n the ownership and copyright of this dataset. Due to the unresolved issue\n regarding the dataset, I am writing to formally request the withdrawal of our\n paper"},{"id":"http://arxiv.org/abs/2309.02185v3","updated":"2023-09-12T09:38:38Z","published":"2023-09-05T12:42:26Z","title":"BEVTrack: A Simple Baseline for 3D Single Object Tracking in Bird's-Eye\n View","summary":" 3D single object tracking (SOT) in point clouds is still a challenging\nproblem due to appearance variation, distractors, and high sparsity of point\nclouds. Notably, in autonomous driving scenarios, the target object typically\nmaintains spatial adjacency across consecutive frames, predominantly moving\nhorizontally. This spatial continuity offers valuable prior knowledge for\ntarget localization. However, existing trackers, which often employ point-wise\nrepresentations, struggle to efficiently utilize this knowledge owing to the\nirregular format of such representations. Consequently, they require elaborate\ndesigns and solving multiple subtasks to establish spatial correspondence. In\nthis paper, we introduce BEVTrack, a simple yet strong baseline framework for\n3D SOT. After converting consecutive point clouds into the common Bird's-Eye\nView representation, BEVTrack inherently encodes spatial proximity and adeptly\ncaptures motion cues for tracking via a simple element-wise operation and\nconvolutional layers. Additionally, to better deal with objects having diverse\nsizes and moving patterns, BEVTrack directly learns the underlying motion\ndistribution rather than making a fixed Laplacian or Gaussian assumption as in\nprevious works. Without bells and whistles, BEVTrack achieves state-of-the-art\nperformance on KITTI and NuScenes datasets while maintaining a high inference\nspeed of 122 FPS. The code will be released at\nhttps://github.com/xmm-prio/BEVTrack.\n","authors":["Yuxiang Yang","Yingqi Deng","Jiahao Nie","Jing Zhang"],"pdf_url":"https://arxiv.org/pdf/2309.02185v3.pdf","comment":"Technical report. Work in progress. Typo correction. 
The code will be\n released at https://github.com/xmm-prio/BEVTrack"},{"id":"http://arxiv.org/abs/2308.16139v3","updated":"2023-09-12T09:37:47Z","published":"2023-08-30T16:52:20Z","title":"MedShapeNet -- A Large-Scale Dataset of 3D Medical Shapes for Computer\n Vision","summary":" We present MedShapeNet, a large collection of anatomical shapes (e.g., bones,\norgans, vessels) and 3D surgical instrument models. Prior to the deep learning\nera, the broad application of statistical shape models (SSMs) in medical image\nanalysis is evidence that shapes have been commonly used to describe medical\ndata. Nowadays, however, state-of-the-art (SOTA) deep learning algorithms in\nmedical imaging are predominantly voxel-based. In computer vision, on the\ncontrary, shapes (including, voxel occupancy grids, meshes, point clouds and\nimplicit surface models) are preferred data representations in 3D, as seen from\nthe numerous shape-related publications in premier vision conferences, such as\nthe IEEE/CVF Conference on Computer Vision and Pattern Recognition (CVPR), as\nwell as the increasing popularity of ShapeNet (about 51,300 models) and\nPrinceton ModelNet (127,915 models) in computer vision research. MedShapeNet is\ncreated as an alternative to these commonly used shape benchmarks to facilitate\nthe translation of data-driven vision algorithms to medical applications, and\nit extends the opportunities to adapt SOTA vision algorithms to solve critical\nmedical problems. Besides, the majority of the medical shapes in MedShapeNet\nare modeled directly on the imaging data of real patients, and therefore it\ncomplements well existing shape benchmarks comprising of computer-aided design\n(CAD) models. MedShapeNet currently includes more than 100,000 medical shapes,\nand provides annotations in the form of paired data. It is therefore also a\nfreely available repository of 3D models for extended reality (virtual reality\n- VR, augmented reality - AR, mixed reality - MR) and medical 3D printing. This\nwhite paper describes in detail the motivations behind MedShapeNet, the shape\nacquisition procedures, the use cases, as well as the usage of the online shape\nsearch portal: https://medshapenet.ikim.nrw/\n","authors":["Jianning Li","Antonio Pepe","Christina Gsaxner","Gijs Luijten","Yuan Jin","Narmada Ambigapathy","Enrico Nasca","Naida Solak","Gian Marco Melito","Viet Duc Vu","Afaque R. Memon","Xiaojun Chen","Jan Stefan Kirschke","Ezequiel de la Rosa","Patrick Ferdinand Christ","Hongwei Bran Li","David G. Ellis","Michele R. Aizenberg","Sergios Gatidis","Thomas Küstner","Nadya Shusharina","Nicholas Heller","Vincent Andrearczyk","Adrien Depeursinge","Mathieu Hatt","Anjany Sekuboyina","Maximilian Löffler","Hans Liebl","Reuben Dorent","Tom Vercauteren","Jonathan Shapey","Aaron Kujawa","Stefan Cornelissen","Patrick Langenhuizen","Achraf Ben-Hamadou","Ahmed Rekik","Sergi Pujades","Edmond Boyer","Federico Bolelli","Costantino Grana","Luca Lumetti","Hamidreza Salehi","Jun Ma","Yao Zhang","Ramtin Gharleghi","Susann Beier","Arcot Sowmya","Eduardo A. 
Garza-Villarreal","Thania Balducci","Diego Angeles-Valdez","Roberto Souza","Leticia Rittner","Richard Frayne","Yuanfeng Ji","Soumick Chatterjee","Florian Dubost","Stefanie Schreiber","Hendrik Mattern","Oliver Speck","Daniel Haehn","Christoph John","Andreas Nürnberger","João Pedrosa","Carlos Ferreira","Guilherme Aresta","António Cunha","Aurélio Campilho","Yannick Suter","Jose Garcia","Alain Lalande","Emmanuel Audenaert","Claudia Krebs","Timo Van Leeuwen","Evie Vereecke","Rainer Röhrig","Frank Hölzle","Vahid Badeli","Kathrin Krieger","Matthias Gunzer","Jianxu Chen","Amin Dada","Miriam Balzer","Jana Fragemann","Frederic Jonske","Moritz Rempe","Stanislav Malorodov","Fin H. Bahnsen","Constantin Seibold","Alexander Jaus","Ana Sofia Santos","Mariana Lindo","André Ferreira","Victor Alves","Michael Kamp","Amr Abourayya","Felix Nensa","Fabian Hörst","Alexander Brehmer","Lukas Heine","Lars E. Podleska","Matthias A. Fink","Julius Keyl","Konstantinos Tserpes","Moon-Sung Kim","Shireen Elhabian","Hans Lamecker","Dženan Zukić","Beatriz Paniagua","Christian Wachinger","Martin Urschler","Luc Duong","Jakob Wasserthal","Peter F. Hoyer","Oliver Basu","Thomas Maal","Max J. H. Witjes","Ti-chiun Chang","Seyed-Ahmad Ahmadi","Ping Luo","Bjoern Menze","Mauricio Reyes","Christos Davatzikos","Behrus Puladi","Jens Kleesiek","Jan Egger"],"pdf_url":"https://arxiv.org/pdf/2308.16139v3.pdf","comment":"21 pages"},{"id":"http://arxiv.org/abs/2309.06086v1","updated":"2023-09-12T09:31:34Z","published":"2023-09-12T09:31:34Z","title":"Plasticity-Optimized Complementary Networks for Unsupervised Continual\n Learning","summary":" Continuous unsupervised representation learning (CURL) research has greatly\nbenefited from improvements in self-supervised learning (SSL) techniques. As a\nresult, existing CURL methods using SSL can learn high-quality representations\nwithout any labels, but with a notable performance drop when learning on a\nmany-tasks data stream. We hypothesize that this is caused by the\nregularization losses that are imposed to prevent forgetting, leading to a\nsuboptimal plasticity-stability trade-off: they either do not adapt fully to\nthe incoming data (low plasticity), or incur significant forgetting when\nallowed to fully adapt to a new SSL pretext-task (low stability). In this work,\nwe propose to train an expert network that is relieved of the duty of keeping\nthe previous knowledge and can focus on performing optimally on the new tasks\n(optimizing plasticity). 
In the second phase, we combine this new knowledge\nwith the previous network in an adaptation-retrospection phase to avoid\nforgetting and initialize a new expert with the knowledge of the old network.\nWe perform several experiments showing that our proposed approach outperforms\nother CURL exemplar-free methods in few- and many-task split settings.\nFurthermore, we show how to adapt our approach to semi-supervised continual\nlearning (Semi-SCL) and show that we surpass the accuracy of other\nexemplar-free Semi-SCL methods and reach the results of some others that use\nexemplars.\n","authors":["Alex Gomez-Villa","Bartlomiej Twardowski","Kai Wang","Joost van de Weijer"],"pdf_url":"https://arxiv.org/pdf/2309.06086v1.pdf","comment":"Accepted at WACV2024"},{"id":"http://arxiv.org/abs/2212.10229v4","updated":"2023-09-12T09:23:39Z","published":"2022-12-20T13:07:20Z","title":"StyleDomain: Efficient and Lightweight Parameterizations of StyleGAN for\n One-shot and Few-shot Domain Adaptation","summary":" Domain adaptation of GANs is a problem of fine-tuning GAN models pretrained\non a large dataset (e.g. StyleGAN) to a specific domain with few samples (e.g.\npainting faces, sketches, etc.). While there are many methods that tackle this\nproblem in different ways, there are still many important questions that remain\nunanswered. In this paper, we provide a systematic and in-depth analysis of the\ndomain adaptation problem of GANs, focusing on the StyleGAN model. We perform a\ndetailed exploration of the most important parts of StyleGAN that are\nresponsible for adapting the generator to a new domain depending on the\nsimilarity between the source and target domains. As a result of this study, we\npropose new efficient and lightweight parameterizations of StyleGAN for domain\nadaptation. Particularly, we show that there exist directions in StyleSpace\n(StyleDomain directions) that are sufficient for adapting to similar domains.\nFor dissimilar domains, we propose Affine+ and AffineLight+ parameterizations\nthat allows us to outperform existing baselines in few-shot adaptation while\nhaving significantly less training parameters. Finally, we examine StyleDomain\ndirections and discover their many surprising properties that we apply for\ndomain mixing and cross-domain image morphing. Source code can be found at\nhttps://github.com/AIRI-Institute/StyleDomain.\n","authors":["Aibek Alanov","Vadim Titov","Maksim Nakhodnov","Dmitry Vetrov"],"pdf_url":"https://arxiv.org/pdf/2212.10229v4.pdf","comment":"Accepted to ICCV 2023"},{"id":"http://arxiv.org/abs/2308.13991v2","updated":"2023-09-12T09:22:04Z","published":"2023-08-27T02:59:59Z","title":"JL-lemma derived Optimal Projections for Discriminative Dictionary\n Learning","summary":" To overcome difficulties in classifying large dimensionality data with a\nlarge number of classes, we propose a novel approach called JLSPCADL. This\npaper uses the Johnson-Lindenstrauss (JL) Lemma to select the dimensionality of\na transformed space in which a discriminative dictionary can be learned for\nsignal classification. 
Rather than reducing dimensionality via random\nprojections, as is often done with JL, we use a projection transformation\nmatrix derived from Modified Supervised PC Analysis (M-SPCA) with the\nJL-prescribed dimension.\n JLSPCADL provides a heuristic to deduce suitable distortion levels and the\ncorresponding Suitable Description Length (SDL) of dictionary atoms to derive\nan optimal feature space and thus the SDL of dictionary atoms for better\nclassification. Unlike state-of-the-art dimensionality reduction-based\ndictionary learning methods, a projection transformation matrix derived in a\nsingle step from M-SPCA provides maximum feature-label consistency of the\ntransformed space while preserving the cluster structure of the original data.\nDespite confusing pairs, the dictionary for the transformed space generates\ndiscriminative sparse coefficients, with fewer training samples.\nExperimentation demonstrates that JLSPCADL scales well with an increasing\nnumber of classes and dimensionality. Improved label consistency of features\ndue to M-SPCA helps to classify better. Further, the complexity of training a\ndiscriminative dictionary is significantly reduced by using SDL.\nExperimentation on OCR and face recognition datasets shows relatively better\nclassification performance than other supervised dictionary learning\nalgorithms.\n","authors":["G. Madhuri","Atul Negi","Kaluri V. Rangarao"],"pdf_url":"https://arxiv.org/pdf/2308.13991v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2212.08568v2","updated":"2023-09-12T09:16:46Z","published":"2022-12-16T16:44:46Z","title":"Biomedical image analysis competitions: The state of current\n participation practice","summary":" The number of international benchmarking competitions is steadily increasing\nin various fields of machine learning (ML) research and practice. So far,\nhowever, little is known about the common practice as well as bottlenecks faced\nby the community in tackling the research questions posed. To shed light on the\nstatus quo of algorithm development in the specific field of biomedical imaging\nanalysis, we designed an international survey that was issued to all\nparticipants of challenges conducted in conjunction with the IEEE ISBI 2021 and\nMICCAI 2021 conferences (80 competitions in total). The survey covered\nparticipants' expertise and working environments, their chosen strategies, as\nwell as algorithm characteristics. A median of 72% challenge participants took\npart in the survey. According to our results, knowledge exchange was the\nprimary incentive (70%) for participation, while the reception of prize money\nplayed only a minor role (16%). While a median of 80 working hours was spent on\nmethod development, a large portion of participants stated that they did not\nhave enough time for method development (32%). 25% perceived the infrastructure\nto be a bottleneck. Overall, 94% of all solutions were deep learning-based. Of\nthese, 84% were based on standard architectures. 43% of the respondents\nreported that the data samples (e.g., images) were too large to be processed at\nonce. This was most commonly addressed by patch-based training (69%),\ndownsampling (37%), and solving 3D analysis tasks as a series of 2D tasks.\nK-fold cross-validation on the training set was performed by only 37% of the\nparticipants and only 50% of the participants performed ensembling based on\nmultiple identical models (61%) or heterogeneous models (39%). 
48% of the\nrespondents applied postprocessing steps.\n","authors":["Matthias Eisenmann","Annika Reinke","Vivienn Weru","Minu Dietlinde Tizabi","Fabian Isensee","Tim J. Adler","Patrick Godau","Veronika Cheplygina","Michal Kozubek","Sharib Ali","Anubha Gupta","Jan Kybic","Alison Noble","Carlos Ortiz de Solórzano","Samiksha Pachade","Caroline Petitjean","Daniel Sage","Donglai Wei","Elizabeth Wilden","Deepak Alapatt","Vincent Andrearczyk","Ujjwal Baid","Spyridon Bakas","Niranjan Balu","Sophia Bano","Vivek Singh Bawa","Jorge Bernal","Sebastian Bodenstedt","Alessandro Casella","Jinwook Choi","Olivier Commowick","Marie Daum","Adrien Depeursinge","Reuben Dorent","Jan Egger","Hannah Eichhorn","Sandy Engelhardt","Melanie Ganz","Gabriel Girard","Lasse Hansen","Mattias Heinrich","Nicholas Heller","Alessa Hering","Arnaud Huaulmé","Hyunjeong Kim","Bennett Landman","Hongwei Bran Li","Jianning Li","Jun Ma","Anne Martel","Carlos Martín-Isla","Bjoern Menze","Chinedu Innocent Nwoye","Valentin Oreiller","Nicolas Padoy","Sarthak Pati","Kelly Payette","Carole Sudre","Kimberlin van Wijnen","Armine Vardazaryan","Tom Vercauteren","Martin Wagner","Chuanbo Wang","Moi Hoon Yap","Zeyun Yu","Chun Yuan","Maximilian Zenk","Aneeq Zia","David Zimmerer","Rina Bao","Chanyeol Choi","Andrew Cohen","Oleh Dzyubachyk","Adrian Galdran","Tianyuan Gan","Tianqi Guo","Pradyumna Gupta","Mahmood Haithami","Edward Ho","Ikbeom Jang","Zhili Li","Zhengbo Luo","Filip Lux","Sokratis Makrogiannis","Dominik Müller","Young-tack Oh","Subeen Pang","Constantin Pape","Gorkem Polat","Charlotte Rosalie Reed","Kanghyun Ryu","Tim Scherr","Vajira Thambawita","Haoyu Wang","Xinliang Wang","Kele Xu","Hung Yeh","Doyeob Yeo","Yixuan Yuan","Yan Zeng","Xin Zhao","Julian Abbing","Jannes Adam","Nagesh Adluru","Niklas Agethen","Salman Ahmed","Yasmina Al Khalil","Mireia Alenyà","Esa Alhoniemi","Chengyang An","Talha Anwar","Tewodros Weldebirhan Arega","Netanell Avisdris","Dogu Baran Aydogan","Yingbin Bai","Maria Baldeon Calisto","Berke Doga Basaran","Marcel Beetz","Cheng Bian","Hao Bian","Kevin Blansit","Louise Bloch","Robert Bohnsack","Sara Bosticardo","Jack Breen","Mikael Brudfors","Raphael Brüngel","Mariano Cabezas","Alberto Cacciola","Zhiwei Chen","Yucong Chen","Daniel Tianming Chen","Minjeong Cho","Min-Kook Choi","Chuantao Xie Chuantao Xie","Dana Cobzas","Julien Cohen-Adad","Jorge Corral Acero","Sujit Kumar Das","Marcela de Oliveira","Hanqiu Deng","Guiming Dong","Lars Doorenbos","Cory Efird","Sergio Escalera","Di Fan","Mehdi Fatan Serj","Alexandre Fenneteau","Lucas Fidon","Patryk Filipiak","René Finzel","Nuno R. Freitas","Christoph M. Friedrich","Mitchell Fulton","Finn Gaida","Francesco Galati","Christoforos Galazis","Chang Hee Gan","Zheyao Gao","Shengbo Gao","Matej Gazda","Beerend Gerats","Neil Getty","Adam Gibicar","Ryan Gifford","Sajan Gohil","Maria Grammatikopoulou","Daniel Grzech","Orhun Güley","Timo Günnemann","Chunxu Guo","Sylvain Guy","Heonjin Ha","Luyi Han","Il Song Han","Ali Hatamizadeh","Tian He","Jimin Heo","Sebastian Hitziger","SeulGi Hong","SeungBum Hong","Rian Huang","Ziyan Huang","Markus Huellebrand","Stephan Huschauer","Mustaffa Hussain","Tomoo Inubushi","Ece Isik Polat","Mojtaba Jafaritadi","SeongHun Jeong","Bailiang Jian","Yuanhong Jiang","Zhifan Jiang","Yueming Jin","Smriti Joshi","Abdolrahim Kadkhodamohammadi","Reda Abdellah Kamraoui","Inha Kang","Junghwa Kang","Davood Karimi","April Khademi","Muhammad Irfan Khan","Suleiman A. 
Khan","Rishab Khantwal","Kwang-Ju Kim","Timothy Kline","Satoshi Kondo","Elina Kontio","Adrian Krenzer","Artem Kroviakov","Hugo Kuijf","Satyadwyoom Kumar","Francesco La Rosa","Abhi Lad","Doohee Lee","Minho Lee","Chiara Lena","Hao Li","Ling Li","Xingyu Li","Fuyuan Liao","KuanLun Liao","Arlindo Limede Oliveira","Chaonan Lin","Shan Lin","Akis Linardos","Marius George Linguraru","Han Liu","Tao Liu","Di Liu","Yanling Liu","João Lourenço-Silva","Jingpei Lu","Jiangshan Lu","Imanol Luengo","Christina B. Lund","Huan Minh Luu","Yi Lv","Yi Lv","Uzay Macar","Leon Maechler","Sina Mansour L.","Kenji Marshall","Moona Mazher","Richard McKinley","Alfonso Medela","Felix Meissen","Mingyuan Meng","Dylan Miller","Seyed Hossein Mirjahanmardi","Arnab Mishra","Samir Mitha","Hassan Mohy-ud-Din","Tony Chi Wing Mok","Gowtham Krishnan Murugesan","Enamundram Naga Karthik","Sahil Nalawade","Jakub Nalepa","Mohamed Naser","Ramin Nateghi","Hammad Naveed","Quang-Minh Nguyen","Cuong Nguyen Quoc","Brennan Nichyporuk","Bruno Oliveira","David Owen","Jimut Bahan Pal","Junwen Pan","Wentao Pan","Winnie Pang","Bogyu Park","Vivek Pawar","Kamlesh Pawar","Michael Peven","Lena Philipp","Tomasz Pieciak","Szymon Plotka","Marcel Plutat","Fattaneh Pourakpour","Domen Preložnik","Kumaradevan Punithakumar","Abdul Qayyum","Sandro Queirós","Arman Rahmim","Salar Razavi","Jintao Ren","Mina Rezaei","Jonathan Adam Rico","ZunHyan Rieu","Markus Rink","Johannes Roth","Yusely Ruiz-Gonzalez","Numan Saeed","Anindo Saha","Mostafa Salem","Ricardo Sanchez-Matilla","Kurt Schilling","Wei Shao","Zhiqiang Shen","Ruize Shi","Pengcheng Shi","Daniel Sobotka","Théodore Soulier","Bella Specktor Fadida","Danail Stoyanov","Timothy Sum Hon Mun","Xiaowu Sun","Rong Tao","Franz Thaler","Antoine Théberge","Felix Thielke","Helena Torres","Kareem A. Wahid","Jiacheng Wang","YiFei Wang","Wei Wang","Xiong Wang","Jianhui Wen","Ning Wen","Marek Wodzinski","Ye Wu","Fangfang Xia","Tianqi Xiang","Chen Xiaofei","Lizhan Xu","Tingting Xue","Yuxuan Yang","Lin Yang","Kai Yao","Huifeng Yao","Amirsaeed Yazdani","Michael Yip","Hwanseung Yoo","Fereshteh Yousefirizi","Shunkai Yu","Lei Yu","Jonathan Zamora","Ramy Ashraf Zeineldin","Dewen Zeng","Jianpeng Zhang","Bokai Zhang","Jiapeng Zhang","Fan Zhang","Huahong Zhang","Zhongchen Zhao","Zixuan Zhao","Jiachen Zhao","Can Zhao","Qingshuo Zheng","Yuheng Zhi","Ziqi Zhou","Baosheng Zou","Klaus Maier-Hein","Paul F. Jäger","Annette Kopp-Schneider","Lena Maier-Hein"],"pdf_url":"https://arxiv.org/pdf/2212.08568v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.06075v1","updated":"2023-09-12T09:12:37Z","published":"2023-09-12T09:12:37Z","title":"A2V: A Semi-Supervised Domain Adaptation Framework for Brain Vessel\n Segmentation via Two-Phase Training Angiography-to-Venography Translation","summary":" We present a semi-supervised domain adaptation framework for brain vessel\nsegmentation from different image modalities. Existing state-of-the-art methods\nfocus on a single modality, despite the wide range of available cerebrovascular\nimaging techniques. This can lead to significant distribution shifts that\nnegatively impact the generalization across modalities. By relying on annotated\nangiographies and a limited number of annotated venographies, our framework\naccomplishes image-to-image translation and semantic segmentation, leveraging a\ndisentangled and semantically rich latent space to represent heterogeneous data\nand perform image-level adaptation from source to target domains. 
Moreover, we\nreduce the typical complexity of cycle-based architectures and minimize the use\nof adversarial training, which allows us to build an efficient and intuitive\nmodel with stable training. We evaluate our method on magnetic resonance\nangiographies and venographies. While achieving state-of-the-art performance in\nthe source domain, our method attains a Dice score coefficient in the target\ndomain that is only 8.9% lower, highlighting its promising potential for robust\ncerebrovascular image segmentation across different modalities.\n","authors":["Francesco Galati","Daniele Falcetta","Rosa Cortese","Barbara Casolla","Ferran Prados","Ninon Burgos","Maria A. Zuluaga"],"pdf_url":"https://arxiv.org/pdf/2309.06075v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.06067v1","updated":"2023-09-12T09:07:03Z","published":"2023-09-12T09:07:03Z","title":"Batch Implicit Neural Representation for MRI Parallel Reconstruction","summary":" Magnetic resonance imaging (MRI) always suffered from the problem of long\nacquisition time. MRI reconstruction is one solution to reduce scan time by\nskipping certain phase-encoding lines and then restoring high-quality images\nfrom undersampled measurements. Recently, implicit neural representation (INR)\nhas emerged as a new deep learning method that represents an object as a\ncontinuous function of spatial coordinates, and this function is normally\nparameterized by a multilayer perceptron (MLP). In this paper, we propose a\nnovel MRI reconstruction method based on INR, which represents the\nfully-sampled images as the function of pixel coordinates and prior feature\nvectors of undersampled images for overcoming the generalization problem of\nINR. Specifically, we introduce a scale-embedded encoder to produce\nscale-independent pixel-specific features from MR images with different\nundersampled scales and then concatenate with coordinates vectors to recover\nfully-sampled MR images via an MLP, thus achieving arbitrary scale\nreconstruction. The performance of the proposed method was assessed by\nexperimenting on publicly available MRI datasets and compared with other\nreconstruction methods. Our quantitative evaluation demonstrates the\nsuperiority of the proposed method over alternative reconstruction methods.\n","authors":["Hao Li","Yusheng Zhou","Jianan Liu","Xiling Liu","Tao Huang","Zhihan Lv"],"pdf_url":"https://arxiv.org/pdf/2309.06067v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2305.03713v2","updated":"2023-09-12T09:01:31Z","published":"2023-05-05T17:54:34Z","title":"Avatar Fingerprinting for Authorized Use of Synthetic Talking-Head\n Videos","summary":" Modern generators render talking-head videos with impressive photorealism,\nushering in new user experiences such as videoconferencing under constrained\nbandwidth budgets. Their safe adoption, however, requires a mechanism to verify\nif the rendered video is trustworthy. For instance, for videoconferencing we\nmust identify cases in which a synthetic video portrait uses the appearance of\nan individual without their consent. We term this task avatar fingerprinting.\nSpecifically, we learn an embedding in which the motion signatures of one\nidentity are grouped together, and pushed away from those of the other\nidentities. This allows us to link the synthetic video to the identity driving\nthe expressions in the video, regardless of the facial appearance shown. 
Avatar\nfingerprinting algorithms will be critical as talking head generators become\nmore ubiquitous, and yet no large scale datasets exist for this new task.\nTherefore, we contribute a large dataset of people delivering scripted and\nimprovised short monologues, accompanied by synthetic videos in which we render\nvideos of one person using the facial appearance of another. Project page:\nhttps://research.nvidia.com/labs/nxp/avatar-fingerprinting/.\n","authors":["Ekta Prashnani","Koki Nagano","Shalini De Mello","David Luebke","Orazio Gallo"],"pdf_url":"https://arxiv.org/pdf/2305.03713v2.pdf","comment":"13 pages, 6 figures"},{"id":"http://arxiv.org/abs/2309.06062v1","updated":"2023-09-12T09:00:17Z","published":"2023-09-12T09:00:17Z","title":"Selection of contributing factors for predicting landslide\n susceptibility using machine learning and deep learning models","summary":" Landslides are a common natural disaster that can cause casualties, property\nsafety threats and economic losses. Therefore, it is important to understand or\npredict the probability of landslide occurrence at potentially risky sites. A\ncommonly used means is to carry out a landslide susceptibility assessment based\non a landslide inventory and a set of landslide contributing factors. This can\nbe readily achieved using machine learning (ML) models such as logistic\nregression (LR), support vector machine (SVM), random forest (RF), extreme\ngradient boosting (Xgboost), or deep learning (DL) models such as convolutional\nneural network (CNN) and long short time memory (LSTM). As the input data for\nthese models, landslide contributing factors have varying influences on\nlandslide occurrence. Therefore, it is logically feasible to select more\nimportant contributing factors and eliminate less relevant ones, with the aim\nof increasing the prediction accuracy of these models. However, selecting more\nimportant factors is still a challenging task and there is no generally\naccepted method. Furthermore, the effects of factor selection using various\nmethods on the prediction accuracy of ML and DL models are unclear. In this\nstudy, the impact of the selection of contributing factors on the accuracy of\nlandslide susceptibility predictions using ML and DL models was investigated.\nFour methods for selecting contributing factors were considered for all the\naforementioned ML and DL models, which included Information Gain Ratio (IGR),\nRecursive Feature Elimination (RFE), Particle Swarm Optimization (PSO), Least\nAbsolute Shrinkage and Selection Operators (LASSO) and Harris Hawk Optimization\n(HHO). In addition, autoencoder-based factor selection methods for DL models\nwere also investigated. To assess their performances, an exhaustive approach\nwas adopted,...\n","authors":["Cheng Chen","Lei Fan"],"pdf_url":"https://arxiv.org/pdf/2309.06062v1.pdf","comment":"Stochastic Environmental Research and Risk Assessment"},{"id":"http://arxiv.org/abs/2309.06054v1","updated":"2023-09-12T08:45:25Z","published":"2023-09-12T08:45:25Z","title":"How does representation impact in-context learning: A exploration on a\n synthetic task","summary":" In-context learning, i.e., learning from in-context samples, is an impressive\nability of Transformer. However, the mechanism driving the in-context learning\nis not yet fully understood. In this study, we aim to investigate from an\nunderexplored perspective of representation learning. 
The representation is\nmore complex for the in-context learning scenario, where the representation can be\nimpacted by both model weights and in-context samples. We refer to the above two\nconceptual aspects of representation as the in-weights component and the in-context\ncomponent, respectively. To study how the two components affect in-context\nlearning capabilities, we construct a novel synthetic task, making it possible\nto devise two probes, an in-weights probe and an in-context probe, to evaluate the\ntwo components, respectively. We demonstrate that the goodness of the in-context\ncomponent is highly related to the in-context learning performance, which\nindicates the entanglement between in-context learning and representation\nlearning. Furthermore, we find that a good in-weights component can actually\nbenefit the learning of the in-context component, indicating that in-weights\nlearning should be the foundation of in-context learning. To further understand\nthe in-context learning mechanism and the importance of the in-weights\ncomponent, we prove by construction that a simple Transformer, which uses\npattern matching and a copy-paste mechanism to perform in-context learning, can\nmatch the in-context learning performance of a more complex, best-tuned\nTransformer under the perfect in-weights component assumption. In short, these\ndiscoveries from the representation learning perspective shed light on new\napproaches to improve the in-context capacity.\n","authors":["Jingwen Fu","Tao Yang","Yuwang Wang","Yan Lu","Nanning Zheng"],"pdf_url":"https://arxiv.org/pdf/2309.06054v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.11965v2","updated":"2023-09-12T08:39:24Z","published":"2023-07-22T02:50:07Z","title":"An Intelligent Remote Sensing Image Quality Inspection System","summary":" Due to the inevitable presence of quality problems, remote sensing image\nquality inspection is indeed an indispensable step between the acquisition and\nthe application of remote sensing images. However, traditional manual\ninspection suffers from low efficiency. Hence, we propose a novel deep\nlearning-based two-step intelligent system consisting of multiple advanced\ncomputer vision models, which first performs image classification and then\naccordingly adopts the most appropriate method, such as semantic segmentation,\nto localize the quality problems. Results demonstrate that the proposed method\nexhibits excellent performance and efficiency, surpassing traditional methods.\nFurthermore, we conduct an initial exploration of applying multimodal models to\nremote sensing image quality inspection.\n","authors":["Yijiong Yu","Tao Wang","Kang Ran","Chang Li","Hao Wu"],"pdf_url":"https://arxiv.org/pdf/2307.11965v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.06047v1","updated":"2023-09-12T08:30:48Z","published":"2023-09-12T08:30:48Z","title":"Real-Time Semantic Segmentation: A Brief Survey & Comparative Study in\n Remote Sensing","summary":" Real-time semantic segmentation of remote sensing imagery is a challenging\ntask that requires a trade-off between effectiveness and efficiency. It has\nmany applications including tracking forest fires, detecting changes in land\nuse and land cover, crop health monitoring, and so on. With the success of\nefficient deep learning methods (i.e., efficient deep neural networks) for\nreal-time semantic segmentation in computer vision, researchers have adopted\nthese efficient deep neural networks in remote sensing image analysis. 
This\npaper begins with a summary of the fundamental compression methods for\ndesigning efficient deep neural networks and provides a brief but comprehensive\nsurvey, outlining the recent developments in real-time semantic segmentation of\nremote sensing imagery. We examine several seminal efficient deep learning\nmethods, placing them in a taxonomy based on the network architecture design\napproach. Furthermore, we evaluate the quality and efficiency of some existing\nefficient deep neural networks on a publicly available remote sensing semantic\nsegmentation benchmark dataset, the OpenEarthMap. The experimental results of\nan extensive comparative study demonstrate that most of the existing efficient\ndeep neural networks have good segmentation quality, but they suffer low\ninference speed (i.e., high latency rate), which may limit their capability of\ndeployment in real-time applications of remote sensing image segmentation. We\nprovide some insights into the current trend and future research directions for\nreal-time semantic segmentation of remote sensing imagery.\n","authors":["Clifford Broni-Bediako","Junshi Xia","Naoto Yokoya"],"pdf_url":"https://arxiv.org/pdf/2309.06047v1.pdf","comment":"Submitted to IEEE GRSM"},{"id":"http://arxiv.org/abs/2309.06046v1","updated":"2023-09-12T08:30:35Z","published":"2023-09-12T08:30:35Z","title":"BatMan-CLR: Making Few-shots Meta-Learners Resilient Against Label Noise","summary":" The negative impact of label noise is well studied in classical supervised\nlearning yet remains an open research question in meta-learning. Meta-learners\naim to adapt to unseen learning tasks by learning a good initial model in\nmeta-training and consecutively fine-tuning it according to new tasks during\nmeta-testing. In this paper, we present the first extensive analysis of the\nimpact of varying levels of label noise on the performance of state-of-the-art\nmeta-learners, specifically gradient-based $N$-way $K$-shot learners. We show\nthat the accuracy of Reptile, iMAML, and foMAML drops by up to 42% on the\nOmniglot and CifarFS datasets when meta-training is affected by label noise. To\nstrengthen the resilience against label noise, we propose two sampling\ntechniques, namely manifold (Man) and batch manifold (BatMan), which transform\nthe noisy supervised learners into semi-supervised ones to increase the utility\nof noisy labels. We first construct manifold samples of $N$-way\n$2$-contrastive-shot tasks through augmentation, learning the embedding via a\ncontrastive loss in meta-training, and then perform classification through\nzeroing on the embedding in meta-testing. We show that our approach can\neffectively mitigate the impact of meta-training label noise. Even with 60%\nwrong labels \\batman and \\man can limit the meta-testing accuracy drop to\n${2.5}$, ${9.4}$, ${1.1}$ percent points, respectively, with existing\nmeta-learners across the Omniglot, CifarFS, and MiniImagenet datasets.\n","authors":["Jeroen M. Galjaard","Robert Birke","Juan Perez","Lydia Y. Chen"],"pdf_url":"https://arxiv.org/pdf/2309.06046v1.pdf","comment":"10 pages,3 figures"},{"id":"http://arxiv.org/abs/2309.06030v1","updated":"2023-09-12T08:04:56Z","published":"2023-09-12T08:04:56Z","title":"Federated Learning for Large-Scale Scene Modeling with Neural Radiance\n Fields","summary":" We envision a system to continuously build and maintain a map based on\nearth-scale neural radiance fields (NeRF) using data collected from vehicles\nand drones in a lifelong learning manner. 
However, existing large-scale\nmodeling by NeRF has problems in terms of scalability and maintainability when\nmodeling earth-scale environments. Therefore, to address these problems, we\npropose a federated learning pipeline for large-scale modeling with NeRF. We\ntailor the model aggregation pipeline in federated learning for NeRF, thereby\nallowing local updates of NeRF. In the aggregation step, the accuracy of the\nclients' global pose is critical. Thus, we also propose global pose alignment\nto align the noisy global pose of clients before the aggregation step. In\nexperiments, we show the effectiveness of the proposed pose alignment and the\nfederated learning pipeline on the large-scale scene dataset, Mill19.\n","authors":["Teppei Suzuki"],"pdf_url":"https://arxiv.org/pdf/2309.06030v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.06027v1","updated":"2023-09-12T07:56:55Z","published":"2023-09-12T07:56:55Z","title":"A new meteor detection application robust to camera movements","summary":" This article presents a new tool for the automatic detection of meteors. Fast\nMeteor Detection Toolbox (FMDT) is able to detect meteor sightings by analyzing\nvideos acquired by cameras onboard weather balloons or in airplanes with\nstabilization. The challenge consists in designing a processing chain composed\nof simple algorithms that are robust to the high fluctuation of the videos and\nthat satisfy the constraints on power consumption (10 W) and real-time\nprocessing (25 frames per second).\n","authors":["Clara Ciocan","Mathuran Kandeepan","Adrien Cassagne","Jeremie Vaubaillon","Fabian Zander","Lionel Lacassagne"],"pdf_url":"https://arxiv.org/pdf/2309.06027v1.pdf","comment":"in French language, Groupe de Recherche et d'{\\'E}tudes de Traitement\n du Signal et des Images (GRETSI), Aug 2023, Grenoble, France"},{"id":"http://arxiv.org/abs/2309.06023v1","updated":"2023-09-12T07:50:54Z","published":"2023-09-12T07:50:54Z","title":"Learning from History: Task-agnostic Model Contrastive Learning for\n Image Restoration","summary":" Contrastive learning has emerged as a prevailing paradigm for high-level\nvision tasks, which, by introducing proper negative samples, has also been\nexploited for low-level vision tasks to achieve a compact optimization space to\naccount for their ill-posed nature. However, existing methods rely on manually\npredefined, task-oriented negatives, which often exhibit pronounced\ntask-specific biases. In this paper, we propose an innovative approach for the\nadaptive generation of negative samples directly from the target model itself,\ncalled ``learning from history``. We introduce the Self-Prior guided Negative\nloss for image restoration (SPNIR) to enable this approach. Our approach is\ntask-agnostic and generic, making it compatible with any existing image\nrestoration method or task. We demonstrate the effectiveness of our approach by\nretraining existing models with SPNIR. The results show significant\nimprovements in image restoration across various tasks and architectures. For\nexample, models retrained with SPNIR outperform the original FFANet and\nDehazeFormer by 3.41 dB and 0.57 dB on the RESIDE indoor dataset for image\ndehazing. Similarly, they achieve notable improvements of 0.47 dB on SPA-Data\nover IDT for image deraining and 0.12 dB on Manga109 for a 4x scale\nsuper-resolution over lightweight SwinIR, respectively.
Code and retrained\nmodels are available at\nhttps://github.com/Aitical/Task-agnostic_Model_Contrastive_Learning_Image_Restoration.\n","authors":["Gang Wu","Junjun Jiang","Kui Jiang","Xianming Liu"],"pdf_url":"https://arxiv.org/pdf/2309.06023v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.06017v1","updated":"2023-09-12T07:31:51Z","published":"2023-09-12T07:31:51Z","title":"Feature Aggregation Network for Building Extraction from High-resolution\n Remote Sensing Images","summary":" The rapid advancement in high-resolution satellite remote sensing data\nacquisition, particularly those achieving submeter precision, has uncovered the\npotential for detailed extraction of surface architectural features. However,\nthe diversity and complexity of surface distributions frequently lead to\ncurrent methods focusing exclusively on localized information of surface\nfeatures. This often results in significant intraclass variability in boundary\nrecognition and between buildings. Therefore, the task of fine-grained\nextraction of surface features from high-resolution satellite imagery has\nemerged as a critical challenge in remote sensing image processing. In this\nwork, we propose the Feature Aggregation Network (FANet), concentrating on\nextracting both global and local features, thereby enabling the refined\nextraction of landmark buildings from high-resolution satellite remote sensing\nimagery. The Pyramid Vision Transformer captures these global features, which\nare subsequently refined by the Feature Aggregation Module and merged into a\ncohesive representation by the Difference Elimination Module. In addition, to\nensure a comprehensive feature map, we have incorporated the Receptive Field\nBlock and Dual Attention Module, expanding the receptive field and intensifying\nattention across spatial and channel dimensions. Extensive experiments on\nmultiple datasets have validated the outstanding capability of FANet in\nextracting features from high-resolution satellite images. This signifies a\nmajor breakthrough in the field of remote sensing image processing. We will\nrelease our code soon.\n","authors":["Xuan Zhou","Xuefeng Wei"],"pdf_url":"https://arxiv.org/pdf/2309.06017v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.09544v2","updated":"2023-09-12T07:25:49Z","published":"2023-08-18T13:22:59Z","title":"Adapt Your Teacher: Improving Knowledge Distillation for Exemplar-free\n Continual Learning","summary":" In this work, we investigate exemplar-free class incremental learning (CIL)\nwith knowledge distillation (KD) as a regularization strategy, aiming to\nprevent forgetting. KD-based methods are successfully used in CIL, but they\noften struggle to regularize the model without access to exemplars of the\ntraining data from previous tasks. Our analysis reveals that this issue\noriginates from substantial representation shifts in the teacher network when\ndealing with out-of-distribution data. This causes large errors in the KD loss\ncomponent, leading to performance degradation in CIL models. Inspired by recent\ntest-time adaptation methods, we introduce Teacher Adaptation (TA), a method\nthat concurrently updates the teacher and the main models during incremental\ntraining. 
Our method seamlessly integrates with KD-based CIL approaches and\nallows for consistent enhancement of their performance across multiple\nexemplar-free CIL benchmarks.\n","authors":["Filip Szatkowski","Mateusz Pyla","Marcin Przewięźlikowski","Sebastian Cygert","Bartłomiej Twardowski","Tomasz Trzciński"],"pdf_url":"https://arxiv.org/pdf/2308.09544v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.10705v4","updated":"2023-09-12T07:21:05Z","published":"2023-07-20T08:53:47Z","title":"TwinLiteNet: An Efficient and Lightweight Model for Driveable Area and\n Lane Segmentation in Self-Driving Cars","summary":" Semantic segmentation is a common task in autonomous driving to understand\nthe surrounding environment. Driveable Area Segmentation and Lane Detection are\nparticularly important for safe and efficient navigation on the road. However,\noriginal semantic segmentation models are computationally expensive and require\nhigh-end hardware, which is not feasible for embedded systems in autonomous\nvehicles. This paper proposes a lightweight model for the driveable area and\nlane line segmentation. TwinLiteNet is designed cheaply but achieves accurate\nand efficient segmentation results. We evaluate TwinLiteNet on the BDD100K\ndataset and compare it with modern models. Experimental results show that our\nTwinLiteNet performs similarly to existing approaches, requiring significantly\nfewer computational resources. Specifically, TwinLiteNet achieves a mIoU score\nof 91.3% for the Drivable Area task and 31.08% IoU for the Lane Detection task\nwith only 0.4 million parameters and achieves 415 FPS on GPU RTX A5000.\nFurthermore, TwinLiteNet can run in real-time on embedded devices with limited\ncomputing power, especially since it achieves 60FPS on Jetson Xavier NX, making\nit an ideal solution for self-driving vehicles. Code is available:\nurl{https://github.com/chequanghuy/TwinLiteNet}.\n","authors":["Quang Huy Che","Dinh Phuc Nguyen","Minh Quan Pham","Duc Khai Lam"],"pdf_url":"https://arxiv.org/pdf/2307.10705v4.pdf","comment":"Accepted by MAPR 2023"},{"id":"http://arxiv.org/abs/2309.00434v2","updated":"2023-09-12T07:18:26Z","published":"2023-09-01T13:02:19Z","title":"Improving the matching of deformable objects by learning to detect\n keypoints","summary":" We propose a novel learned keypoint detection method to increase the number\nof correct matches for the task of non-rigid image correspondence. By\nleveraging true correspondences acquired by matching annotated image pairs with\na specified descriptor extractor, we train an end-to-end convolutional neural\nnetwork (CNN) to find keypoint locations that are more appropriate to the\nconsidered descriptor. For that, we apply geometric and photometric warpings to\nimages to generate a supervisory signal, allowing the optimization of the\ndetector. Experiments demonstrate that our method enhances the Mean Matching\nAccuracy of numerous descriptors when used in conjunction with our detection\nmethod, while outperforming the state-of-the-art keypoint detectors on real\nimages of non-rigid objects by 20 p.p. We also apply our method on the complex\nreal-world task of object retrieval where our detector performs on par with the\nfinest keypoint detectors currently available for this task. The source code\nand trained models are publicly available at\nhttps://github.com/verlab/LearningToDetect_PRL_2023\n","authors":["Felipe Cadar","Welerson Melo","Vaishnavi Kanagasabapathi","Guilherme Potje","Renato Martins","Erickson R. 
Nascimento"],"pdf_url":"https://arxiv.org/pdf/2309.00434v2.pdf","comment":"This is the accepted version of the paper to appear at Pattern\n Recognition Letters (PRL). The final journal version will be available at\n https://doi.org/10.1016/j.patrec.2023.08.012"},{"id":"http://arxiv.org/abs/2309.06006v1","updated":"2023-09-12T07:03:30Z","published":"2023-09-12T07:03:30Z","title":"SoccerNet 2023 Challenges Results","summary":" The SoccerNet 2023 challenges were the third annual video understanding\nchallenges organized by the SoccerNet team. For this third edition, the\nchallenges were composed of seven vision-based tasks split into three main\nthemes. The first theme, broadcast video understanding, is composed of three\nhigh-level tasks related to describing events occurring in the video\nbroadcasts: (1) action spotting, focusing on retrieving all timestamps related\nto global actions in soccer, (2) ball action spotting, focusing on retrieving\nall timestamps related to the soccer ball change of state, and (3) dense video\ncaptioning, focusing on describing the broadcast with natural language and\nanchored timestamps. The second theme, field understanding, relates to the\nsingle task of (4) camera calibration, focusing on retrieving the intrinsic and\nextrinsic camera parameters from images. The third and last theme, player\nunderstanding, is composed of three low-level tasks related to extracting\ninformation about the players: (5) re-identification, focusing on retrieving\nthe same players across multiple views, (6) multiple object tracking, focusing\non tracking players and the ball through unedited video streams, and (7) jersey\nnumber recognition, focusing on recognizing the jersey number of players from\ntracklets. Compared to the previous editions of the SoccerNet challenges, tasks\n(2-3-7) are novel, including new annotations and data, task (4) was enhanced\nwith more data and annotations, and task (6) now focuses on end-to-end\napproaches. More information on the tasks, challenges, and leaderboards are\navailable on https://www.soccer-net.org. Baselines and development kits can be\nfound on https://github.com/SoccerNet.\n","authors":["Anthony Cioppa","Silvio Giancola","Vladimir Somers","Floriane Magera","Xin Zhou","Hassan Mkhallati","Adrien Deliège","Jan Held","Carlos Hinojosa","Amir M. Mansourian","Pierre Miralles","Olivier Barnich","Christophe De Vleeschouwer","Alexandre Alahi","Bernard Ghanem","Marc Van Droogenbroeck","Abdullah Kamal","Adrien Maglo","Albert Clapés","Amr Abdelaziz","Artur Xarles","Astrid Orcesi","Atom Scott","Bin Liu","Byoungkwon Lim","Chen Chen","Fabian Deuser","Feng Yan","Fufu Yu","Gal Shitrit","Guanshuo Wang","Gyusik Choi","Hankyul Kim","Hao Guo","Hasby Fahrudin","Hidenari Koguchi","Håkan Ardö","Ibrahim Salah","Ido Yerushalmy","Iftikar Muhammad","Ikuma Uchida","Ishay Be'ery","Jaonary Rabarisoa","Jeongae Lee","Jiajun Fu","Jianqin Yin","Jinghang Xu","Jongho Nang","Julien Denize","Junjie Li","Junpei Zhang","Juntae Kim","Kamil Synowiec","Kenji Kobayashi","Kexin Zhang","Konrad Habel","Kota Nakajima","Licheng Jiao","Lin Ma","Lizhi Wang","Luping Wang","Menglong Li","Mengying Zhou","Mohamed Nasr","Mohamed Abdelwahed","Mykola Liashuha","Nikolay Falaleev","Norbert Oswald","Qiong Jia","Quoc-Cuong Pham","Ran Song","Romain Hérault","Rui Peng","Ruilong Chen","Ruixuan Liu","Ruslan Baikulov","Ryuto Fukushima","Sergio Escalera","Seungcheon Lee","Shimin Chen","Shouhong Ding","Taiga Someya","Thomas B. 
Moeslund","Tianjiao Li","Wei Shen","Wei Zhang","Wei Li","Wei Dai","Weixin Luo","Wending Zhao","Wenjie Zhang","Xinquan Yang","Yanbiao Ma","Yeeun Joo","Yingsen Zeng","Yiyang Gan","Yongqiang Zhu","Yujie Zhong","Zheng Ruan","Zhiheng Li","Zhijian Huang","Ziyu Meng"],"pdf_url":"https://arxiv.org/pdf/2309.06006v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.06004v1","updated":"2023-09-12T07:02:13Z","published":"2023-09-12T07:02:13Z","title":"TSSAT: Two-Stage Statistics-Aware Transformation for Artistic Style\n Transfer","summary":" Artistic style transfer aims to create new artistic images by rendering a\ngiven photograph with the target artistic style. Existing methods learn styles\nsimply based on global statistics or local patches, lacking careful\nconsideration of the drawing process in practice. Consequently, the stylization\nresults either fail to capture abundant and diversified local style patterns,\nor contain undesired semantic information of the style image and deviate from\nthe global style distribution. To address this issue, we imitate the drawing\nprocess of humans and propose a Two-Stage Statistics-Aware Transformation\n(TSSAT) module, which first builds the global style foundation by aligning the\nglobal statistics of content and style features and then further enriches local\nstyle details by swapping the local statistics (instead of local features) in a\npatch-wise manner, significantly improving the stylization effects. Moreover,\nto further enhance both content and style representations, we introduce two\nnovel losses: an attention-based content loss and a patch-based style loss,\nwhere the former enables better content preservation by enforcing the semantic\nrelation in the content image to be retained during stylization, and the latter\nfocuses on increasing the local style similarity between the style and stylized\nimages. Extensive qualitative and quantitative experiments verify the\neffectiveness of our method.\n","authors":["Haibo Chen","Lei Zhao","Jun Li","Jian Yang"],"pdf_url":"https://arxiv.org/pdf/2309.06004v1.pdf","comment":"Accepted by ACM MM 2023"},{"id":"http://arxiv.org/abs/2309.05994v1","updated":"2023-09-12T06:49:56Z","published":"2023-09-12T06:49:56Z","title":"ATTA: Anomaly-aware Test-Time Adaptation for Out-of-Distribution\n Detection in Segmentation","summary":" Recent advancements in dense out-of-distribution (OOD) detection have\nprimarily focused on scenarios where the training and testing datasets share a\nsimilar domain, with the assumption that no domain shift exists between them.\nHowever, in real-world situations, domain shift often exits and significantly\naffects the accuracy of existing out-of-distribution (OOD) detection models. In\nthis work, we propose a dual-level OOD detection framework to handle domain\nshift and semantic shift jointly. The first level distinguishes whether domain\nshift exists in the image by leveraging global low-level features, while the\nsecond level identifies pixels with semantic shift by utilizing dense\nhigh-level feature maps. 
In this way, we can selectively adapt the model to\nunseen domains as well as enhance model's capacity in detecting novel classes.\nWe validate the efficacy of our proposed method on several OOD segmentation\nbenchmarks, including those with significant domain shifts and those without,\nobserving consistent performance improvements across various baseline models.\n","authors":["Zhitong Gao","Shipeng Yan","Xuming He"],"pdf_url":"https://arxiv.org/pdf/2309.05994v1.pdf","comment":"In submission"},{"id":"http://arxiv.org/abs/2309.05987v1","updated":"2023-09-12T06:32:42Z","published":"2023-09-12T06:32:42Z","title":"FLDNet: A Foreground-Aware Network for Polyp Segmentation Leveraging\n Long-Distance Dependencies","summary":" Given the close association between colorectal cancer and polyps, the\ndiagnosis and identification of colorectal polyps play a critical role in the\ndetection and surgical intervention of colorectal cancer. In this context, the\nautomatic detection and segmentation of polyps from various colonoscopy images\nhas emerged as a significant problem that has attracted broad attention.\nCurrent polyp segmentation techniques face several challenges: firstly, polyps\nvary in size, texture, color, and pattern; secondly, the boundaries between\npolyps and mucosa are usually blurred, existing studies have focused on\nlearning the local features of polyps while ignoring the long-range\ndependencies of the features, and also ignoring the local context and global\ncontextual information of the combined features. To address these challenges,\nwe propose FLDNet (Foreground-Long-Distance Network), a Transformer-based\nneural network that captures long-distance dependencies for accurate polyp\nsegmentation. Specifically, the proposed model consists of three main modules:\na pyramid-based Transformer encoder, a local context module, and a\nforeground-Aware module. Multilevel features with long-distance dependency\ninformation are first captured by the pyramid-based transformer encoder. On the\nhigh-level features, the local context module obtains the local characteristics\nrelated to the polyps by constructing different local context information. The\ncoarse map obtained by decoding the reconstructed highest-level features guides\nthe feature fusion process in the foreground-Aware module of the high-level\nfeatures to achieve foreground enhancement of the polyps. Our proposed method,\nFLDNet, was evaluated using seven metrics on common datasets and demonstrated\nsuperiority over state-of-the-art methods on widely-used evaluation measures.\n","authors":["Xuefeng Wei","Xuan Zhou"],"pdf_url":"https://arxiv.org/pdf/2309.05987v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.05972v1","updated":"2023-09-12T05:43:13Z","published":"2023-09-12T05:43:13Z","title":"Self-supervised Extraction of Human Motion Structures via Frame-wise\n Discrete Features","summary":" The present paper proposes an encoder-decoder model for extracting the\nstructures of human motions represented by frame-wise discrete features in a\nself-supervised manner. In the proposed method, features are extracted as codes\nin a motion codebook without the use of human knowledge, and the relationship\nbetween these codes can be visualized on a graph. Since the codes are expected\nto be temporally sparse compared to the captured frame rate and can be shared\nby multiple sequences, the proposed network model also addresses the need for\ntraining constraints. 
Specifically, the model consists of self-attention layers\nand a vector clustering block. The attention layers contribute to finding\nsparse keyframes and discrete features as motion codes, which are then\nextracted by vector clustering. The constraints are realized as training losses\nso that the same motion codes can be as contiguous as possible and can be\nshared by multiple sequences. In addition, we propose the use of causal\nself-attention as a method by which to calculate attention for long sequences\nconsisting of numerous frames. In our experiments, the sparse structures of\nmotion codes were used to compile a graph that facilitates visualization of the\nrelationship between the codes and the differences between sequences. We then\nevaluated the effectiveness of the extracted motion codes by applying them to\nmultiple recognition tasks and found that performance levels comparable to\ntask-optimized methods could be achieved by linear probing.\n","authors":["Tetsuya Abe","Ryusuke Sagawa","Ko Ayusawa","Wataru Takano"],"pdf_url":"https://arxiv.org/pdf/2309.05972v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.04570v2","updated":"2023-09-12T05:11:40Z","published":"2023-07-10T14:02:31Z","title":"A Call to Reflect on Evaluation Practices for Age Estimation:\n Comparative Analysis of the State-of-the-Art and a Unified Benchmark","summary":" Comparing different age estimation methods poses a challenge due to the\nunreliability of published results stemming from inconsistencies in the\nbenchmarking process. Previous studies have reported continuous performance\nimprovements over the past decade using specialized methods; however, our\nfindings challenge these claims. This paper identifies two trivial, yet\npersistent issues with the currently used evaluation protocol and describes how\nto resolve them. We describe our evaluation protocol in detail and provide\nspecific examples of how the protocol should be used. We utilize the protocol\nto offer an extensive comparative analysis for state-of-the-art facial age\nestimation methods. Surprisingly, we find that the performance differences\nbetween the methods are negligible compared to the effect of other factors,\nsuch as facial alignment, facial coverage, image resolution, model\narchitecture, or the amount of data used for pretraining. We use the gained\ninsights to propose using FaRL as the backbone model and demonstrate its\nefficiency. The results emphasize the importance of consistent data\npreprocessing practices for reliable and meaningful comparisons. We make our\nsource code public at\nhttps://github.com/paplhjak/Facial-Age-Estimation-Benchmark.\n","authors":["Jakub Paplham","Vojtech Franc"],"pdf_url":"https://arxiv.org/pdf/2307.04570v2.pdf","comment":"Revised version"},{"id":"http://arxiv.org/abs/2309.04914v2","updated":"2023-09-12T05:08:47Z","published":"2023-09-10T02:02:29Z","title":"MFPNet: Multi-scale Feature Propagation Network For Lightweight Semantic\n Segmentation","summary":" In contrast to the abundant research focusing on large-scale models, the\nprogress in lightweight semantic segmentation appears to be advancing at a\ncomparatively slower pace. However, existing compact methods often suffer from\nlimited feature representation capability due to the shallowness of their\nnetworks. In this paper, we propose a novel lightweight segmentation\narchitecture, called Multi-scale Feature Propagation Network (MFPNet), to\naddress the dilemma. 
Specifically, we design a robust Encoder-Decoder structure\nfeaturing symmetrical residual blocks that consist of flexible bottleneck\nresidual modules (BRMs) to explore deep and rich multi-scale semantic context.\nFurthermore, benefiting from their capacity to model latent long-range\ncontextual relationships, we leverage Graph Convolutional Networks (GCNs) to\nfacilitate multi-scale feature propagation between the BRM blocks. When\nevaluated on benchmark datasets, our proposed approach shows superior\nsegmentation results.\n","authors":["Guoan Xu","Wenjing Jia","Tao Wu","Ligeng Chen"],"pdf_url":"https://arxiv.org/pdf/2309.04914v2.pdf","comment":"5 pages, 3 figures, 5 tables, conference"},{"id":"http://arxiv.org/abs/2309.01409v3","updated":"2023-09-12T05:05:26Z","published":"2023-09-04T07:40:30Z","title":"Implicit Neural Image Stitching With Enhanced and Blended Feature\n Reconstruction","summary":" Existing frameworks for image stitching often provide visually reasonable\nstitching results. However, they suffer from blurry artifacts and disparities in\nillumination, depth level, etc. Although recent learning-based stitching methods\nalleviate such disparities, they sacrifice image quality and fail to capture\nhigh-frequency details in stitched images. To\naddress the problem, we propose a novel approach, implicit Neural Image\nStitching (NIS), that extends arbitrary-scale super-resolution. Our method\nestimates Fourier coefficients of images for quality-enhancing warps. Then, the\nsuggested model blends color mismatches and misalignment in the latent space\nand decodes the features into RGB values of stitched images. Our experiments\nshow that our approach achieves improvement in resolving the low-definition\nimaging of the previous deep image stitching with favorable accelerated\nimage-enhancing methods. Our source code is available at\nhttps://github.com/minshu-kim/NIS.\n","authors":["Minsu Kim","Jaewon Lee","Byeonghun Lee","Sunghoon Im","Kyong Hwan Jin"],"pdf_url":"https://arxiv.org/pdf/2309.01409v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.05956v1","updated":"2023-09-12T04:41:45Z","published":"2023-09-12T04:41:45Z","title":"Beyond Generation: Harnessing Text to Image Models for Object Detection\n and Segmentation","summary":" We propose a new paradigm to automatically generate training data with\naccurate labels at scale using the text-to-image synthesis frameworks (e.g.,\nDALL-E, Stable Diffusion, etc.). The proposed approach decouples training data\ngeneration into foreground object generation and contextually coherent\nbackground generation. To generate foreground objects, we employ a\nstraightforward textual template, incorporating the object class name as input\nprompts. This is fed into a text-to-image synthesis framework, producing\nvarious foreground images set against isolated backgrounds. A\nforeground-background segmentation algorithm is then used to generate\nforeground object masks. To generate context images, we begin by creating\nlanguage descriptions of the context. This is achieved by applying an image\ncaptioning method to a small set of images representing the desired context.\nThese textual descriptions are then transformed into a diverse array of context\nimages via a text-to-image synthesis framework. Subsequently, we composite\nthese with the foreground object masks produced in the initial step, utilizing\na cut-and-paste method, to formulate the training data.
We demonstrate the\nadvantages of our approach on five object detection and segmentation datasets,\nincluding Pascal VOC and COCO. We found that detectors trained solely on\nsynthetic data produced by our method achieve performance comparable to those\ntrained on real data (Fig. 1). Moreover, a combination of real and synthetic\ndata yields even better results. Further analysis indicates that the\nsynthetic data distribution complements the real data distribution effectively.\nAdditionally, we emphasize the compositional nature of our data generation\napproach in out-of-distribution and zero-shot data generation scenarios. We\nopen-source our code at https://github.com/gyhandy/Text2Image-for-Detection\n","authors":["Yunhao Ge","Jiashu Xu","Brian Nlong Zhao","Neel Joshi","Laurent Itti","Vibhav Vineet"],"pdf_url":"https://arxiv.org/pdf/2309.05956v1.pdf","comment":"Code in https://github.com/gyhandy/Text2Image-for-Detection"},{"id":"http://arxiv.org/abs/2211.14154v6","updated":"2023-09-12T04:10:26Z","published":"2022-11-25T15:00:51Z","title":"Interaction Visual Transformer for Egocentric Action Anticipation","summary":" Human-object interaction is one of the most important visual cues and we\npropose a novel way to represent human-object interactions for egocentric\naction anticipation. We propose a novel transformer variant to model\ninteractions by computing the change in the appearance of objects and human\nhands due to the execution of the actions and use those changes to refine the\nvideo representation. Specifically, we model interactions between hands and\nobjects using Spatial Cross-Attention (SCA) and further infuse contextual\ninformation using Trajectory Cross-Attention to obtain environment-refined\ninteraction tokens. Using these tokens, we construct an interaction-centric\nvideo representation for action anticipation. We term our model InAViT, which\nachieves state-of-the-art action anticipation performance on the large-scale\negocentric datasets EPIC-KITCHENS-100 (EK100) and EGTEA Gaze+. InAViT outperforms\nother visual transformer-based methods including object-centric video\nrepresentation. On the EK100 evaluation server, InAViT is the top-performing\nmethod on the public leaderboard (at the time of submission) where it\noutperforms the second-best model by 3.3% on mean-top5 recall.\n","authors":["Debaditya Roy","Ramanathan Rajendiran","Basura Fernando"],"pdf_url":"https://arxiv.org/pdf/2211.14154v6.pdf","comment":"Top of the public leaderboard on EK100 Action Anticipation\n https://codalab.lisn.upsaclay.fr/competitions/702#results"},{"id":"http://arxiv.org/abs/2309.05950v1","updated":"2023-09-12T04:03:41Z","published":"2023-09-12T04:03:41Z","title":"Language Models as Black-Box Optimizers for Vision-Language Models","summary":" Vision-language models (VLMs) pre-trained on web-scale datasets have\ndemonstrated remarkable capabilities across a variety of vision and multimodal\ntasks. Currently, fine-tuning methods for VLMs mainly operate in a white-box\nsetting, requiring access to model parameters for backpropagation. However,\nmany VLMs rely on proprietary data and are not open-source, which restricts the\nuse of white-box approaches for fine-tuning. Given that popular private large\nlanguage models (LLMs) like ChatGPT still offer a language-based user\ninterface, we aim to develop a novel fine-tuning approach for VLMs through\nnatural language prompts, thereby avoiding the need to access model parameters,\nfeature embeddings, or output logits.
In this setup, we propose employing\nchat-based LLMs as black-box optimizers to search for the best text prompt on\nthe illustrative task of few-shot image classification using CLIP.\nSpecifically, we adopt an automatic \"hill-climbing\" procedure that converges on\nan effective prompt by evaluating the accuracy of current prompts and asking\nLLMs to refine them based on textual feedback, all within a conversational\nprocess without human-in-the-loop. In a challenging 1-shot learning setup, our\nsimple approach surpasses the white-box continuous prompting method CoOp by an\naverage of 1.5% across 11 datasets including ImageNet. Our approach also\noutperforms OpenAI's manually crafted prompts and is more efficient than other\nblack-box methods like iterative APE. Additionally, we highlight the advantage\nof conversational feedback incorporating both positive and negative prompts,\nsuggesting that LLMs can utilize the implicit \"gradient\" direction in textual\nfeedback for a more efficient search. Lastly, we find that the text prompts\ngenerated through our strategy are not only more interpretable but also\ntransfer well across different CLIP architectures in a black-box manner.\n","authors":["Samuel Yu","Shihong Liu","Zhiqiu Lin","Deepak Pathak","Deva Ramanan"],"pdf_url":"https://arxiv.org/pdf/2309.05950v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.05943v1","updated":"2023-09-12T03:48:29Z","published":"2023-09-12T03:48:29Z","title":"Knowledge-Guided Short-Context Action Anticipation in Human-Centric\n Videos","summary":" This work focuses on anticipating long-term human actions, particularly using\nshort video segments, which can speed up editing workflows through improved\nsuggestions while fostering creativity by suggesting narratives. To this end,\nwe imbue a transformer network with a symbolic knowledge graph for action\nanticipation in video segments by boosting certain aspects of the transformer's\nattention mechanism at run-time. Demonstrated on two benchmark datasets,\nBreakfast and 50Salads, our approach outperforms current state-of-the-art\nmethods for long-term action anticipation using short video context by up to\n9%.\n","authors":["Sarthak Bhagat","Simon Stepputtis","Joseph Campbell","Katia Sycara"],"pdf_url":"https://arxiv.org/pdf/2309.05943v1.pdf","comment":"ICCV 2023 Workshop on AI for Creative Video Editing and Understanding"},{"id":"http://arxiv.org/abs/2309.05930v1","updated":"2023-09-12T03:05:06Z","published":"2023-09-12T03:05:06Z","title":"Combining deep learning and street view imagery to map smallholder crop\n types","summary":" Accurate crop type maps are an essential source of information for monitoring\nyield progress at scale, projecting global crop production, and planning\neffective policies. To date, however, crop type maps remain challenging to\ncreate in low and middle-income countries due to a lack of ground truth labels\nfor training machine learning models. Field surveys are the gold standard in\nterms of accuracy but require an often-prohibitively large amount of time,\nmoney, and statistical capacity. In recent years, street-level imagery, such as\nGoogle Street View, KartaView, and Mapillary, has become available around the\nworld. Such imagery contains rich information about crop types grown at\nparticular locations and times. In this work, we develop an automated system to\ngenerate crop type ground references using deep learning and Google Street View\nimagery. 
The method efficiently curates a set of street view images containing\ncrop fields, trains a model to predict crop type by utilizing weakly-labelled\nimages from disparate out-of-domain sources, and combines predicted labels with\nremote sensing time series to create a wall-to-wall crop type map. We show\nthat, in Thailand, the resulting country-wide map of rice, cassava, maize, and\nsugarcane achieves an accuracy of 93%. As the availability of roadside imagery\nexpands, our pipeline provides a way to map crop types at scale around the\nglobe, especially in underserved smallholder regions.\n","authors":["Jordi Laguarta","Thomas Friedel","Sherrie Wang"],"pdf_url":"https://arxiv.org/pdf/2309.05930v1.pdf","comment":"Submitted to AAAI-24: Special Track on AI for Social Impact"},{"id":"http://arxiv.org/abs/2309.05929v1","updated":"2023-09-12T03:05:00Z","published":"2023-09-12T03:05:00Z","title":"Introducing Shape Prior Module in Diffusion Model for Medical Image\n Segmentation","summary":" Medical image segmentation is critical for diagnosing and treating spinal\ndisorders. However, the presence of high noise, ambiguity, and uncertainty\nmakes this task highly challenging. Factors such as unclear anatomical\nboundaries, inter-class similarities, and irrational annotations contribute to\nthis challenge. Achieving both accurate and diverse segmentation templates is\nessential to support radiologists in clinical practice. In recent years,\ndenoising diffusion probabilistic modeling (DDPM) has emerged as a prominent\nresearch topic in computer vision. It has demonstrated effectiveness in various\nvision tasks, including image deblurring, super-resolution, anomaly detection,\nand even semantic representation generation at the pixel level. Despite the\nrobustness of existing diffusion models in visual generation tasks, they still\nstruggle with discrete masks and their various effects. To address the need for\naccurate and diverse spine medical image segmentation templates, we propose an\nend-to-end framework called VerseDiff-UNet, which leverages the denoising\ndiffusion probabilistic model (DDPM). Our approach integrates the diffusion\nmodel into a standard U-shaped architecture. At each step, we combine the\nnoise-added image with the labeled mask to guide the diffusion direction\naccurately towards the target region. Furthermore, to capture specific\nanatomical a priori information in medical images, we incorporate a shape a\npriori module. This module efficiently extracts structural semantic information\nfrom the input spine images. We evaluate our method on a single dataset of\nspine images acquired through X-ray imaging. Our results demonstrate that\nVerseDiff-UNet significantly outperforms other state-of-the-art methods in\nterms of accuracy while preserving the natural features and variations of\nanatomy.\n","authors":["Zhiqing Zhang","Guojia Fan","Tianyong Liu","Nan Li","Yuyang Liu","Ziyu Liu","Canwei Dong","Shoujun Zhou"],"pdf_url":"https://arxiv.org/pdf/2309.05929v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.05665v2","updated":"2023-09-12T03:01:55Z","published":"2023-09-11T17:59:17Z","title":"Robot Parkour Learning","summary":" Parkour is a grand challenge for legged locomotion that requires robots to\novercome various obstacles rapidly in complex environments. Existing methods\ncan generate either diverse but blind locomotion skills or vision-based but\nspecialized skills by using reference animal data or complex rewards. 
However,\nautonomous parkour requires robots to learn generalizable skills that are both\nvision-based and diverse to perceive and react to various scenarios. In this\nwork, we propose a system for learning a single end-to-end vision-based parkour\npolicy of diverse parkour skills using a simple reward without any reference\nmotion data. We develop a reinforcement learning method inspired by direct\ncollocation to generate parkour skills, including climbing over high obstacles,\nleaping over large gaps, crawling beneath low barriers, squeezing through thin\nslits, and running. We distill these skills into a single vision-based parkour\npolicy and transfer it to a quadrupedal robot using its egocentric depth\ncamera. We demonstrate that our system can empower two different low-cost\nrobots to autonomously select and execute appropriate parkour skills to\ntraverse challenging real-world environments.\n","authors":["Ziwen Zhuang","Zipeng Fu","Jianren Wang","Christopher Atkeson","Soeren Schwertfeger","Chelsea Finn","Hang Zhao"],"pdf_url":"https://arxiv.org/pdf/2309.05665v2.pdf","comment":"CoRL 2023 (Oral). Project website at https://robot-parkour.github.io"},{"id":"http://arxiv.org/abs/2309.05919v1","updated":"2023-09-12T02:23:30Z","published":"2023-09-12T02:23:30Z","title":"Deep evidential fusion with uncertainty quantification and contextual\n discounting for multimodal medical image segmentation","summary":" Single-modality medical images generally do not contain enough information to\nreach an accurate and reliable diagnosis. For this reason, physicians generally\ndiagnose diseases based on multimodal medical images such as, e.g., PET/CT. The\neffective fusion of multimodal information is essential to reach a reliable\ndecision and explain how the decision is made as well. In this paper, we\npropose a fusion framework for multimodal medical image segmentation based on\ndeep learning and the Dempster-Shafer theory of evidence. In this framework,\nthe reliability of each single modality image when segmenting different objects\nis taken into account by a contextual discounting operation. The discounted\npieces of evidence from each modality are then combined by Dempster's rule to\nreach a final decision. Experimental results with a PET-CT dataset with\nlymphomas and a multi-MRI dataset with brain tumors show that our method\noutperforms the state-of-the-art methods in accuracy and reliability.\n","authors":["Ling Huang","Su Ruan","Pierre Decazes","Thierry Denoeux"],"pdf_url":"https://arxiv.org/pdf/2309.05919v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.05914v1","updated":"2023-09-12T02:04:36Z","published":"2023-09-12T02:04:36Z","title":"Medical Image Segmentation with Belief Function Theory and Deep Learning","summary":" Deep learning has shown promising contributions in medical image segmentation\nwith powerful learning and feature representation abilities. However, it has\nlimitations for reasoning with and combining imperfect (imprecise, uncertain,\nand partial) information. In this thesis, we study medical image segmentation\napproaches with belief function theory and deep learning, specifically focusing\non information modeling and fusion based on uncertain evidence.\n First, we review existing belief function theory-based medical image\nsegmentation methods and discuss their advantages and challenges. 
Second, we\npresent a semi-supervised medical image segmentation framework to decrease the\nuncertainty caused by the lack of annotations with evidential segmentation and\nevidence fusion. Third, we compare two evidential classifiers, the evidential\nneural network and the radial basis function network, and show the effectiveness of\nbelief function theory in uncertainty quantification; we use the two evidential\nclassifiers with deep neural networks to construct deep evidential models for\nlymphoma segmentation. Fourth, we present a multimodal medical image fusion\nframework taking into account the reliability of each MR image source when\nperforming different segmentation tasks using mass functions and contextual\ndiscounting.\n","authors":["Ling Huang"],"pdf_url":"https://arxiv.org/pdf/2309.05914v1.pdf","comment":"Ph.D. Thesis"},{"id":"http://arxiv.org/abs/2309.05911v1","updated":"2023-09-12T02:01:31Z","published":"2023-09-12T02:01:31Z","title":"Quality-Agnostic Deepfake Detection with Intra-model Collaborative\n Learning","summary":" Deepfakes have recently raised a plethora of societal concerns over their\npossible security threats and dissemination of fake information. Much research\non deepfake detection has been undertaken. However, detecting low-quality deepfakes as\nwell as simultaneously detecting deepfakes of different qualities still remains\na grave challenge. Most SOTA approaches are limited by using a single specific\nmodel for detecting a certain deepfake video quality type. When constructing\nmultiple models with prior information about video quality, this kind of\nstrategy incurs significant computational cost, as well as model and training\ndata overhead. Further, such a strategy is neither scalable nor practical to deploy in\nreal-world settings. In this work, we propose a universal intra-model\ncollaborative learning framework to enable the effective and simultaneous\ndetection of deepfakes of different qualities. That is, our approach is a\nquality-agnostic deepfake detection method, dubbed QAD. In particular, by\nobserving the upper bound of general error expectation, we maximize the\ndependency between intermediate representations of images from different\nquality levels via the Hilbert-Schmidt Independence Criterion. In addition, an\nAdversarial Weight Perturbation module is carefully devised to enable the model\nto be more robust against image corruption while boosting the overall model's\nperformance. Extensive experiments over seven popular deepfake datasets\ndemonstrate the superiority of our QAD model over prior SOTA benchmarks.\n","authors":["Binh M. Le","Simon S. Woo"],"pdf_url":"https://arxiv.org/pdf/2309.05911v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2212.00330v4","updated":"2023-09-12T01:56:17Z","published":"2022-12-01T07:32:56Z","title":"Reliable Joint Segmentation of Retinal Edema Lesions in OCT Images","summary":" We focus on the complicated pathological features, such as blurred\nboundaries, severe scale differences between symptoms, and background noise\ninterference, in the task of joint segmentation of retinal edema lesions\nfrom OCT images, and aim to make the segmentation results more reliable. In this\npaper, we propose a novel reliable multi-scale wavelet-enhanced transformer\nnetwork, which can provide accurate segmentation results with reliability\nassessment.
Specifically, aiming at improving the model's ability to learn the\ncomplex pathological features of retinal edema lesions in OCT images, we\ndevelop a novel segmentation backbone that integrates a wavelet-enhanced\nfeature extractor network and our newly designed multi-scale transformer\nmodule. Meanwhile, to make the segmentation results more reliable, a novel\nuncertainty segmentation head based on the subjective logical evidential theory\nis introduced to generate the final segmentation results with a corresponding\noverall uncertainty evaluation score map. We conduct comprehensive experiments\non the public database of AI-Challenge 2018 for retinal edema lesions\nsegmentation, and the results show that our proposed method achieves better\nsegmentation accuracy with a high degree of reliability as compared to other\nstate-of-the-art segmentation approaches. The code will be released at:\nhttps://github.com/LooKing9218/ReliableRESeg.\n","authors":["Meng Wang","Kai Yu","Chun-Mei Feng","Ke Zou","Yanyu Xu","Qingquan Meng","Rick Siow Mong Goh","Yong Liu","Huazhu Fu"],"pdf_url":"https://arxiv.org/pdf/2212.00330v4.pdf","comment":null},{"id":"http://arxiv.org/abs/2210.08202v2","updated":"2023-09-12T01:36:47Z","published":"2022-10-15T05:38:55Z","title":"IBL-NeRF: Image-Based Lighting Formulation of Neural Radiance Fields","summary":" We propose IBL-NeRF, which decomposes the neural radiance fields (NeRF) of\nlarge-scale indoor scenes into intrinsic components. Recent approaches further\ndecompose the baked radiance of the implicit volume into intrinsic components\nsuch that one can partially approximate the rendering equation. However, they\nare limited to representing isolated objects with shared environment\nlighting, and suffer from computational burden to aggregate rays with Monte\nCarlo integration. In contrast, our prefiltered radiance field extends the\noriginal NeRF formulation to capture the spatial variation of lighting within\nthe scene volume, in addition to surface properties. Specifically, the scenes\nof diverse materials are decomposed into intrinsic components for rendering,\nnamely, albedo, roughness, surface normal, irradiance, and prefiltered\nradiance. All of the components are inferred as neural images from MLP, which\ncan model large-scale general scenes. In particular, the prefiltered radiance\neffectively models the volumetric light field, and captures spatial variation\nbeyond a single environment light. The prefiltering aggregates rays in a set of\npredefined neighborhood sizes such that we can replace the costly Monte Carlo\nintegration of global illumination with a simple query from a neural image. By\nadopting NeRF, our approach inherits superior visual quality and multi-view\nconsistency for synthesized images as well as the intrinsic components. We\ndemonstrate the performance on scenes with complex object layouts and light\nconfigurations, which could not be processed in any of the previous works.\n","authors":["Changwoon Choi","Juhyeon Kim","Young Min Kim"],"pdf_url":"https://arxiv.org/pdf/2210.08202v2.pdf","comment":"Computer Graphics Forum (Pacific Graphics 2023)"},{"id":"http://arxiv.org/abs/2309.05904v1","updated":"2023-09-12T01:29:37Z","published":"2023-09-12T01:29:37Z","title":"Enhancing Representation in Radiography-Reports Foundation Model: A\n Granular Alignment Algorithm Using Masked Contrastive Learning","summary":" Recently, multi-modal vision-language foundation models have gained\nsignificant attention in the medical field.
While these models offer great\nopportunities, they still face a number of challenges, such as the requirement\nfor fine-grained knowledge understanding in computer-aided diagnosis and\ncapability of utilizing very limited or no task-specific labeled data in\nreal-world clinical applications. In this study, we present MaCo, a novel\nmulti-modal medical foundation model that explores masked contrastive learning\nto achieve granular alignment and zero-shot learning for a variety of medical\nimaging tasks. MaCo incorporates a correlation weighting mechanism to adjust\nthe correlation between masked image patches and their corresponding reports,\nthereby enhancing the representation learning capabilities. We evaluate MaCo on\nsix well-known open-source X-ray datasets, and the experimental results show it\noutperforms seven state-of-the-art approaches for classification, segmentation,\nand zero-shot phase grounding, demonstrating its great potential to promote a\nwide range of medical image analysis tasks.\n","authors":["Weijian Huang","Hongyu Zhou","Cheng Li","Hao Yang","Jiarun Liu","Shanshan Wang"],"pdf_url":"https://arxiv.org/pdf/2309.05904v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.05900v1","updated":"2023-09-12T01:03:43Z","published":"2023-09-12T01:03:43Z","title":"Adversarial Attacks Assessment of Salient Object Detection via Symbolic\n Learning","summary":" Machine learning is at the center of mainstream technology and outperforms\nclassical approaches to handcrafted feature design. Aside from its learning\nprocess for artificial feature extraction, it has an end-to-end paradigm from\ninput to output, reaching outstandingly accurate results. However, security\nconcerns about its robustness to malicious and imperceptible perturbations have\ndrawn attention since its prediction can be changed entirely. Salient object\ndetection is a research area where deep convolutional neural networks have\nproven effective but whose trustworthiness represents a significant issue\nrequiring analysis and solutions to hackers' attacks. Brain programming is a\nkind of symbolic learning in the vein of good old-fashioned artificial\nintelligence. This work provides evidence that symbolic learning robustness is\ncrucial in designing reliable visual attention systems since it can withstand\neven the most intense perturbations. We test this evolutionary computation\nmethodology against several adversarial attacks and noise perturbations using\nstandard databases and a real-world problem of a shorebird called the Snowy\nPlover portraying a visual attention task. We compare our methodology with five\ndifferent deep learning approaches, proving that they do not match the symbolic\nparadigm regarding robustness. All neural networks suffer significant\nperformance losses, while brain programming stands its ground and remains\nunaffected. 
Also, by studying the Snowy Plover, we remark on the importance of\nsecurity in surveillance activities regarding wildlife protection and\nconservation.\n","authors":["Gustavo Olague","Roberto Pineda","Gerardo Ibarra-Vazquez","Matthieu Olague","Axel Martinez","Sambit Bakshi","Jonathan Vargas","Isnardo Reducindo"],"pdf_url":"https://arxiv.org/pdf/2309.05900v1.pdf","comment":"14 pages, 8 figures, 6 tables, IEEE Transactions on Emerging Topics\n in Computing, Accepted for publication"},{"id":"http://arxiv.org/abs/2211.07108v3","updated":"2023-09-12T00:55:32Z","published":"2022-11-14T04:51:05Z","title":"Recursive Cross-View: Use Only 2D Detectors to Achieve 3D Object\n Detection without 3D Annotations","summary":" Heavily relying on 3D annotations limits the real-world application of 3D\nobject detection. In this paper, we propose a method that does not demand any\n3D annotation, while being able to predict fully oriented 3D bounding boxes.\nOur method, called Recursive Cross-View (RCV), utilizes the three-view\nprinciple to convert 3D detection into multiple 2D detection tasks, requiring\nonly a subset of 2D labels. We propose a recursive paradigm, in which instance\nsegmentation and 3D bounding box generation by Cross-View are implemented\nrecursively until convergence. Specifically, our proposed method involves the\nuse of a frustum for each 2D bounding box, which is then followed by the\nrecursive paradigm that ultimately generates a fully oriented 3D box, along\nwith its corresponding class and score. Note that, class and score are given by\nthe 2D detector. Estimated on the SUN RGB-D and KITTI datasets, our method\noutperforms existing image-based approaches. To justify that our method can be\nquickly used to new tasks, we implement it on two real-world scenarios, namely\n3D human detection and 3D hand detection. As a result, two new 3D annotated\ndatasets are obtained, which means that RCV can be viewed as a (semi-)\nautomatic 3D annotator. Furthermore, we deploy RCV on a depth sensor, which\nachieves detection at 7 fps on a live RGB-D stream. RCV is the first 3D\ndetection method that yields fully oriented 3D boxes without consuming 3D\nlabels.\n","authors":["Shun Gui","Yan Luximon"],"pdf_url":"https://arxiv.org/pdf/2211.07108v3.pdf","comment":"Accepted by R-AL"},{"id":"http://arxiv.org/abs/2309.05883v1","updated":"2023-09-12T00:07:08Z","published":"2023-09-12T00:07:08Z","title":"Hierarchical Conditional Semi-Paired Image-to-Image Translation For\n Multi-Task Image Defect Correction On Shopping Websites","summary":" On shopping websites, product images of low quality negatively affect\ncustomer experience. Although there are plenty of work in detecting images with\ndifferent defects, few efforts have been dedicated to correct those defects at\nscale. A major challenge is that there are thousands of product types and each\nhas specific defects, therefore building defect specific models is unscalable.\nIn this paper, we propose a unified Image-to-Image (I2I) translation model to\ncorrect multiple defects across different product types. Our model leverages an\nattention mechanism to hierarchically incorporate high-level defect groups and\nspecific defect types to guide the network to focus on defect-related image\nregions. Evaluated on eight public datasets, our model reduces the Frechet\nInception Distance (FID) by 24.6% in average compared with MoNCE, the\nstate-of-the-art I2I method. 
Unlike public data, another practical challenge on\nshopping websites is that some paired images are of low quality. Therefore, we\ndesign our model to be semi-paired by combining the L1 loss of paired data with\nthe cycle loss of unpaired data. Tested on a shopping website dataset to\ncorrect three image defects, our model reduces FID by 63.2% on average\ncompared with WS-I2I, the state-of-the-art semi-paired I2I method.\n","authors":["Moyan Li","Jinmiao Fu","Shaoyuan Xu","Huidong Liu","Jia Liu","Bryan Wang"],"pdf_url":"https://arxiv.org/pdf/2309.05883v1.pdf","comment":"6 pages, 6 figures, 3 tables. To be published in ICIP 2023"},{"id":"http://arxiv.org/abs/2309.05879v1","updated":"2023-09-12T00:00:24Z","published":"2023-09-12T00:00:24Z","title":"Generalized Attacks on Face Verification Systems","summary":" Face verification (FV) using deep neural network models has made tremendous\nprogress in recent years, surpassing human accuracy and seeing deployment in\nvarious applications such as border control and smartphone unlocking. However,\nFV systems are vulnerable to Adversarial Attacks, which manipulate input images\nto deceive these systems in ways usually unnoticeable to humans. This paper\nprovides an in-depth study of attacks on FV systems. We introduce the\nDodgePersonation Attack that formulates the creation of face images that\nimpersonate a set of given identities while avoiding being identified as any of\nthe identities in a separate, disjoint set. A taxonomy is proposed to provide a\nunified view of different types of Adversarial Attacks against FV systems,\nincluding Dodging Attacks, Impersonation Attacks, and Master Face Attacks.\nFinally, we propose the ''One Face to Rule Them All'' Attack, which implements\nthe DodgePersonation Attack with state-of-the-art performance on a well-known\nscenario (Master Face Attack) and which can also be used for the new scenarios\nintroduced in this paper. While the state-of-the-art Master Face Attack can\nproduce a set of 9 images to cover 43.82% of the identities in their test\ndatabase, with 9 images our attack can cover 57.27% to 58.5% of these\nidentities while giving the attacker the choice of the identity to use to\ncreate the impersonation. Moreover, the 9 generated attack images appear\nidentical to a casual observer.\n","authors":["Ehsan Nazari","Paula Branco","Guy-Vincent Jourdan"],"pdf_url":"https://arxiv.org/pdf/2309.05879v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2303.15651v2","updated":"2023-09-12T22:33:53Z","published":"2023-03-28T00:20:37Z","title":"4D Panoptic Segmentation as Invariant and Equivariant Field Prediction","summary":" In this paper, we develop rotation-equivariant neural networks for 4D\npanoptic segmentation. 4D panoptic segmentation is a benchmark task for\nautonomous driving that requires recognizing semantic classes and object\ninstances on the road based on LiDAR scans, as well as assigning temporally\nconsistent IDs to instances across time. We observe that the driving scenario\nis symmetric to rotations on the ground plane. Therefore, rotation-equivariance\ncould provide better generalization and more robust feature learning.\nSpecifically, we review the object instance clustering strategies and restate\nthe centerness-based approach and the offset-based approach as the prediction\nof invariant scalar fields and equivariant vector fields. Other sub-tasks are\nalso unified from this perspective, and different invariant and equivariant\nlayers are designed to facilitate their predictions.
Through evaluation on the\nstandard 4D panoptic segmentation benchmark of SemanticKITTI, we show that our\nequivariant models achieve higher accuracy with lower computational costs\ncompared to their non-equivariant counterparts. Moreover, our method sets the\nnew state-of-the-art performance and achieves 1st place on the SemanticKITTI 4D\nPanoptic Segmentation leaderboard.\n","authors":["Minghan Zhu","Shizhong Han","Hong Cai","Shubhankar Borse","Maani Ghaffari","Fatih Porikli"],"pdf_url":"https://arxiv.org/pdf/2303.15651v2.pdf","comment":"13 pages. Accepted at ICCV 2023"},{"id":"http://arxiv.org/abs/2309.06626v1","updated":"2023-09-12T22:28:53Z","published":"2023-09-12T22:28:53Z","title":"Accelerating Deep Neural Networks via Semi-Structured Activation\n Sparsity","summary":" The demand for efficient processing of deep neural networks (DNNs) on\nembedded devices is a significant challenge limiting their deployment.\nExploiting sparsity in the network's feature maps is one of the ways to reduce\nits inference latency. It is known that unstructured sparsity results in lower\naccuracy degradation with respect to structured sparsity but the former needs\nextensive inference engine changes to get latency benefits. To tackle this\nchallenge, we propose a solution to induce semi-structured activation sparsity\nexploitable through minor runtime modifications. To attain high speedup levels\nat inference time, we design a sparse training procedure with awareness of the\nfinal position of the activations while computing the General Matrix\nMultiplication (GEMM). We extensively evaluate the proposed solution across\nvarious models for image classification and object detection tasks. Remarkably,\nour approach yields a speed improvement of $1.25 \\times$ with a minimal\naccuracy drop of $1.1\\%$ for the ResNet18 model on the ImageNet dataset.\nFurthermore, when combined with a state-of-the-art structured pruning method,\nthe resulting models provide a good latency-accuracy trade-off, outperforming\nmodels that solely employ structured pruning techniques.\n","authors":["Matteo Grimaldi","Darshan C. Ganji","Ivan Lazarevich","Sudhakar Sah"],"pdf_url":"https://arxiv.org/pdf/2309.06626v1.pdf","comment":"Code is available at http://github.com/Deeplite/activ-sparse"},{"id":"http://arxiv.org/abs/2309.06618v1","updated":"2023-09-12T22:21:14Z","published":"2023-09-12T22:21:14Z","title":"Multi-dimensional Fusion and Consistency for Semi-supervised Medical\n Image Segmentation","summary":" In this paper, we introduce a novel semi-supervised learning framework\ntailored for medical image segmentation. Central to our approach is the\ninnovative Multi-scale Text-aware ViT-CNN Fusion scheme. This scheme adeptly\ncombines the strengths of both ViTs and CNNs, capitalizing on the unique\nadvantages of both architectures as well as the complementary information in\nvision-language modalities. Further enriching our framework, we propose the\nMulti-Axis Consistency framework for generating robust pseudo labels, thereby\nenhancing the semi-supervised learning process. 
Our extensive experiments on\nseveral widely-used datasets unequivocally demonstrate the efficacy of our\napproach.\n","authors":["Yixing Lu","Zhaoxin Fan","Min Xu"],"pdf_url":"https://arxiv.org/pdf/2309.06618v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.06612v1","updated":"2023-09-12T21:37:26Z","published":"2023-09-12T21:37:26Z","title":"Harmonic-NAS: Hardware-Aware Multimodal Neural Architecture Search on\n Resource-constrained Devices","summary":" The recent surge of interest surrounding Multimodal Neural Networks (MM-NN)\nis attributed to their ability to effectively process and integrate information\nfrom diverse data sources. In MM-NN, features are extracted and fused from\nmultiple modalities using adequate unimodal backbones and specific fusion\nnetworks. Although this helps strengthen the multimodal information\nrepresentation, designing such networks is labor-intensive. It requires tuning\nthe architectural parameters of the unimodal backbones, choosing the fusing\npoint, and selecting the operations for fusion. Furthermore, multimodality AI\nis emerging as a cutting-edge option in Internet of Things (IoT) systems where\ninference latency and energy consumption are critical metrics in addition to\naccuracy. In this paper, we propose Harmonic-NAS, a framework for the joint\noptimization of unimodal backbones and multimodal fusion networks with hardware\nawareness on resource-constrained devices. Harmonic-NAS involves a two-tier\noptimization approach for the unimodal backbone architectures and fusion\nstrategy and operators. By incorporating the hardware dimension into the\noptimization, evaluation results on various devices and multimodal datasets\nhave demonstrated the superiority of Harmonic-NAS over state-of-the-art\napproaches achieving up to 10.9% accuracy improvement, 1.91x latency reduction,\nand 2.14x energy efficiency gain.\n","authors":["Mohamed Imed Eddine Ghebriout","Halima Bouzidi","Smail Niar","Hamza Ouarnoughi"],"pdf_url":"https://arxiv.org/pdf/2309.06612v1.pdf","comment":"Accepted to the 15th Asian Conference on Machine Learning (ACML 2023)"},{"id":"http://arxiv.org/abs/2307.14336v2","updated":"2023-09-12T21:28:51Z","published":"2023-07-26T17:55:32Z","title":"MAMo: Leveraging Memory and Attention for Monocular Video Depth\n Estimation","summary":" We propose MAMo, a novel memory and attention framework for monocular video\ndepth estimation. MAMo can augment and improve any single-image depth\nestimation networks into video depth estimation models, enabling them to take\nadvantage of the temporal information to predict more accurate depth. In MAMo,\nwe augment the model with memory which aids the depth prediction as the model\nstreams through the video. Specifically, the memory stores learned visual and\ndisplacement tokens of the previous time instances. This allows the depth\nnetwork to cross-reference relevant features from the past when predicting\ndepth on the current frame. We introduce a novel scheme to continuously update\nthe memory, optimizing it to keep tokens that correspond with both the past and\nthe present visual information. We adopt an attention-based approach to process\nmemory features where we first learn the spatio-temporal relation among the\nresultant visual and displacement memory tokens using a self-attention module.\nFurther, the output features of self-attention are aggregated with the current\nvisual features through cross-attention. The cross-attended features are\nfinally given to a decoder to predict depth on the current frame. 
Through\nextensive experiments on several benchmarks, including KITTI, NYU-Depth V2, and\nDDAD, we show that MAMo consistently improves monocular depth estimation\nnetworks and sets new state-of-the-art (SOTA) accuracy. Notably, our MAMo video\ndepth estimation provides higher accuracy with lower latency, when compared to\nSOTA cost-volume-based video depth models.\n","authors":["Rajeev Yasarla","Hong Cai","Jisoo Jeong","Yunxiao Shi","Risheek Garrepalli","Fatih Porikli"],"pdf_url":"https://arxiv.org/pdf/2307.14336v2.pdf","comment":"Accepted at ICCV 2023"},{"id":"http://arxiv.org/abs/2207.04320v3","updated":"2023-09-12T21:21:35Z","published":"2022-07-09T18:42:14Z","title":"Snipper: A Spatiotemporal Transformer for Simultaneous Multi-Person 3D\n Pose Estimation Tracking and Forecasting on a Video Snippet","summary":" Multi-person pose understanding from RGB videos involves three complex tasks:\npose estimation, tracking and motion forecasting. Intuitively, accurate\nmulti-person pose estimation facilitates robust tracking, and robust tracking\nbuilds crucial history for correct motion forecasting. Most existing works\neither focus on a single task or employ multi-stage approaches to solving\nmultiple tasks separately, which tends to make sub-optimal decisions at each\nstage and also fails to exploit correlations among the three tasks. In this\npaper, we propose Snipper, a unified framework to perform multi-person 3D pose\nestimation, tracking, and motion forecasting simultaneously in a single stage.\nWe propose an efficient yet powerful deformable attention mechanism to\naggregate spatiotemporal information from the video snippet. Building upon this\ndeformable attention, a video transformer is learned to encode the\nspatiotemporal features from the multi-frame snippet and to decode informative\npose features for multi-person pose queries. Finally, these pose queries are\nregressed to predict multi-person pose trajectories and future motions in a\nsingle shot. In the experiments, we show the effectiveness of Snipper on three\nchallenging public datasets where our generic model rivals specialized\nstate-of-the-art baselines for pose estimation, tracking, and forecasting.\n","authors":["Shihao Zou","Yuanlu Xu","Chao Li","Lingni Ma","Li Cheng","Minh Vo"],"pdf_url":"https://arxiv.org/pdf/2207.04320v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2306.01188v5","updated":"2023-09-12T21:14:05Z","published":"2023-06-01T22:57:32Z","title":"Event-based Stereo Visual Odometry with Native Temporal Resolution via\n Continuous-time Gaussian Process Regression","summary":" Event-based cameras asynchronously capture individual visual changes in a\nscene. This makes them more robust than traditional frame-based cameras to\nhighly dynamic motions and poor illumination. It also means that every\nmeasurement in a scene can occur at a unique time.\n Handling these different measurement times is a major challenge of using\nevent-based cameras. It is often addressed in visual odometry (VO) pipelines by\napproximating temporally close measurements as occurring at one common time.\nThis grouping simplifies the estimation problem but, absent additional sensors,\nsacrifices the inherent temporal resolution of event-based cameras.\n This paper instead presents a complete stereo VO pipeline that estimates\ndirectly with individual event-measurement times without requiring any grouping\nor approximation in the estimation state. 
It uses continuous-time trajectory\nestimation to maintain the temporal fidelity and asynchronous nature of\nevent-based cameras through Gaussian process regression with a physically\nmotivated prior. Its performance is evaluated on the MVSEC dataset, where it\nachieves 7.9e-3 and 5.9e-3 RMS relative error on two independent sequences,\noutperforming the existing publicly available event-based stereo VO pipeline by\ntwo and four times, respectively.\n","authors":["Jianeng Wang","Jonathan D. Gammell"],"pdf_url":"https://arxiv.org/pdf/2306.01188v5.pdf","comment":"8 pages, 4 figures. DOI: 10.1109/LRA.2023.3311374"},{"id":"http://arxiv.org/abs/2306.01198v2","updated":"2023-09-12T21:11:28Z","published":"2023-06-01T23:23:37Z","title":"Confidence Intervals for Error Rates in 1:1 Matching Tasks: Critical\n Statistical Analysis and Recommendations","summary":" Matching algorithms are commonly used to predict matches between items in a\ncollection. For example, in 1:1 face verification, a matching algorithm\npredicts whether two face images depict the same person. Accurately assessing\nthe uncertainty of the error rates of such algorithms can be challenging when\ndata are dependent and error rates are low, two aspects that have been often\noverlooked in the literature. In this work, we review methods for constructing\nconfidence intervals for error rates in 1:1 matching tasks. We derive and\nexamine the statistical properties of these methods, demonstrating how coverage\nand interval width vary with sample size, error rates, and degree of data\ndependence on both analysis and experiments with synthetic and real-world\ndatasets. Based on our findings, we provide recommendations for best practices\nfor constructing confidence intervals for error rates in 1:1 matching tasks.\n","authors":["Riccardo Fogliato","Pratik Patil","Pietro Perona"],"pdf_url":"https://arxiv.org/pdf/2306.01198v2.pdf","comment":"32 pages, 8 figures"},{"id":"http://arxiv.org/abs/2303.18013v2","updated":"2023-09-12T20:59:10Z","published":"2023-03-31T12:38:08Z","title":"LaCViT: A Label-aware Contrastive Training Framework for Vision\n Transformers","summary":" Vision Transformers have been incredibly effective when tackling computer\nvision tasks due to their ability to model long feature dependencies. By using\nlarge-scale training data and various self-supervised signals (e.g., masked\nrandom patches), vision transformers provide state-of-the-art performance on\nseveral benchmarking datasets, such as ImageNet-1k and CIFAR-10. However, these\nvision transformers pretrained over general large-scale image corpora could\nonly produce an anisotropic representation space, limiting their\ngeneralizability and transferability to the target downstream tasks. In this\npaper, we propose a simple and effective Label-aware Contrastive Training\nframework LaCViT, which improves the isotropy of the pretrained representation\nspace for vision transformers, thereby enabling more effective transfer\nlearning amongst a wide range of image classification tasks. 
Through\nexperimentation over five standard image classification datasets, we\ndemonstrate that LaCViT-trained models outperform the original pretrained\nbaselines by around 9% absolute Accuracy@1, and consistent improvements can be\nobserved when applying LaCViT to our three evaluated vision transformers.\n","authors":["Zijun Long","Zaiqiao Meng","Gerardo Aragon Camarasa","Richard McCreadie"],"pdf_url":"https://arxiv.org/pdf/2303.18013v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2306.01111v2","updated":"2023-09-12T20:58:35Z","published":"2023-06-01T19:52:33Z","title":"Exploring the Versatility of Zero-Shot CLIP for Interstitial Lung\n Disease Classification","summary":" Interstitial lung diseases (ILD) present diagnostic challenges due to their\nvaried manifestations and overlapping imaging features. To address this, we\npropose a machine learning approach that utilizes CLIP, a multimodal (image and\ntext) self-supervised model, for ILD classification. We extensively integrate\nzero-shot CLIP throughout our workflow, starting from the initial extraction of\nimage patches from volumetric CT scans and proceeding to ILD classification\nusing \"patch montages\". Furthermore, we investigate how domain adaptive\npretraining (DAPT) CLIP with task-specific images (CT \"patch montages\"\nextracted with ILD-specific prompts for CLIP) and/or text (lung-specific\nsections of radiology reports) affects downstream ILD classification\nperformance. By leveraging CLIP-extracted \"patch montages\" and DAPT, we achieve\nstrong zero-shot ILD classification results, including an AUROC of 0.893,\nwithout the need for any labeled training data. This work highlights the\nversatility and potential of multimodal models like CLIP for medical image\nclassification tasks where labeled data is scarce.\n","authors":["Cara Van Uden","Christian Bluethgen","Maayane Attias","Malgorzata Polacin","Haiwei Henry Guo","Neha Simha","Rishi Raj","Curtis Langlotz"],"pdf_url":"https://arxiv.org/pdf/2306.01111v2.pdf","comment":"11 pages, 11 figures"},{"id":"http://arxiv.org/abs/2308.13561v2","updated":"2023-09-12T20:56:01Z","published":"2023-08-24T20:42:21Z","title":"Project Aria: A New Tool for Egocentric Multi-Modal AI Research","summary":" Egocentric, multi-modal data as available on future augmented reality (AR)\ndevices provides unique challenges and opportunities for machine perception.\nThese future devices will need to be all-day wearable in a socially acceptable\nform-factor to support always available, context-aware and personalized AI\napplications. Our team at Meta Reality Labs Research built the Aria device, an\negocentric, multi-modal data recording and streaming device with the goal to\nfoster and accelerate research in this area. In this paper, we describe the\nAria device hardware including its sensor configuration and the corresponding\nsoftware tools that enable recording and processing of such data.\n","authors":["Kiran Somasundaram","Jing Dong","Huixuan Tang","Julian Straub","Mingfei Yan","Michael Goesele","Jakob Julian Engel","Renzo De Nardi","Richard Newcombe"],"pdf_url":"https://arxiv.org/pdf/2308.13561v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.15321v3","updated":"2023-09-12T20:51:09Z","published":"2023-08-29T14:16:09Z","title":"Elucidating the Exposure Bias in Diffusion Models","summary":" Diffusion models have demonstrated impressive generative capabilities, but\ntheir 'exposure bias' problem, described as the input mismatch between training\nand sampling, lacks in-depth exploration. 
In this paper, we systematically\ninvestigate the exposure bias problem in diffusion models by first analytically\nmodelling the sampling distribution, based on which we then attribute the\nprediction error at each sampling step as the root cause of the exposure bias\nissue. Furthermore, we discuss potential solutions to this issue and propose an\nintuitive metric for it. Along with the elucidation of exposure bias, we\npropose a simple, yet effective, training-free method called Epsilon Scaling to\nalleviate the exposure bias. We show that Epsilon Scaling explicitly moves the\nsampling trajectory closer to the vector field learned in the training phase by\nscaling down the network output (Epsilon), mitigating the input mismatch\nbetween training and sampling. Experiments on various diffusion frameworks\n(ADM, DDPM/DDIM, EDM, LDM), unconditional and conditional settings, and\ndeterministic vs. stochastic sampling verify the effectiveness of our method.\nThe code is available at https://github.com/forever208/ADM-ES;\nhttps://github.com/forever208/EDM-ES\n","authors":["Mang Ning","Mingxiao Li","Jianlin Su","Albert Ali Salah","Itir Onal Ertugrul"],"pdf_url":"https://arxiv.org/pdf/2308.15321v3.pdf","comment":"8 pages"},{"id":"http://arxiv.org/abs/2309.06597v1","updated":"2023-09-12T20:51:07Z","published":"2023-09-12T20:51:07Z","title":"Rank2Tell: A Multimodal Driving Dataset for Joint Importance Ranking and\n Reasoning","summary":" The widespread adoption of commercial autonomous vehicles (AVs) and advanced\ndriver assistance systems (ADAS) may largely depend on their acceptance by\nsociety, for which their perceived trustworthiness and interpretability to\nriders are crucial. In general, this task is challenging because modern\nautonomous systems software relies heavily on black-box artificial intelligence\nmodels. Towards this goal, this paper introduces a novel dataset, Rank2Tell, a\nmulti-modal ego-centric dataset for Ranking the importance level and Telling\nthe reason for the importance. Using various close and open-ended visual\nquestion answering, the dataset provides dense annotations of various semantic,\nspatial, temporal, and relational attributes of various important objects in\ncomplex traffic scenarios. The dense annotations and unique attributes of the\ndataset make it a valuable resource for researchers working on visual scene\nunderstanding and related fields. Further, we introduce a joint model for joint\nimportance level ranking and natural language captions generation to benchmark\nour dataset and demonstrate performance with quantitative evaluations.\n","authors":["Enna Sachdeva","Nakul Agarwal","Suhas Chundi","Sean Roelofs","Jiachen Li","Behzad Dariush","Chiho Choi","Mykel Kochenderfer"],"pdf_url":"https://arxiv.org/pdf/2309.06597v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.01516v2","updated":"2023-09-12T20:16:04Z","published":"2023-09-04T10:48:29Z","title":"MultiWay-Adapater: Adapting large-scale multi-modal models for scalable\n image-text retrieval","summary":" As the size of Large Multi-Modal Models (LMMs) increases consistently, the\nadaptation of these pre-trained models to specialized tasks has become a\ncomputationally and memory-intensive challenge. Traditional fine-tuning methods\nrequire isolated, exhaustive retuning for each new task, limiting the models'\nversatility. Moreover, current efficient adaptation techniques often overlook\nmodality alignment, focusing only on the knowledge extraction of new tasks. 
To\ntackle these issues, we introduce Multiway-Adapter, an innovative framework\nincorporating an 'Alignment Enhancer' to deepen modality alignment, enabling\nhigh transferability without tuning pre-trained parameters. Our method adds\nfewer than 1.25\\% of additional parameters to LMMs, exemplified by the BEiT-3\nmodel in our study. This leads to superior zero-shot image-text retrieval\nperformance compared to fully fine-tuned models, while achieving up to a 57\\%\nreduction in fine-tuning time. Our approach offers a resource-efficient and\neffective adaptation pathway for LMMs, broadening their applicability. The\nsource code is publicly available at:\n\\url{https://github.com/longkukuhi/MultiWay-Adapter}.\n","authors":["Zijun Long","George Killick","Richard McCreadie","Gerardo Aragon Camarasa"],"pdf_url":"https://arxiv.org/pdf/2309.01516v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.06581v1","updated":"2023-09-12T20:09:12Z","published":"2023-09-12T20:09:12Z","title":"Zero-Shot Visual Classification with Guided Cropping","summary":" Pretrained vision-language models, such as CLIP, show promising zero-shot\nperformance across a wide variety of datasets. For closed-set classification\ntasks, however, there is an inherent limitation: CLIP image encoders are\ntypically designed to extract generic image-level features that summarize\nsuperfluous or confounding information for the target tasks. This results in\ndegradation of classification performance, especially when objects of interest\ncover small areas of input images. In this work, we propose CLIP with Guided\nCropping (GC-CLIP), where we use an off-the-shelf zero-shot object detection\nmodel in a preprocessing step to increase focus of zero-shot classifier to the\nobject of interest and minimize influence of extraneous image regions. We\nempirically show that our approach improves zero-shot classification results\nacross architectures and datasets, favorably for small objects.\n","authors":["Piyapat Saranrittichai","Mauricio Munoz","Volker Fischer","Chaithanya Kumar Mummadi"],"pdf_url":"https://arxiv.org/pdf/2309.06581v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2109.08123v3","updated":"2023-09-12T20:07:20Z","published":"2021-09-16T17:21:52Z","title":"Neural Étendue Expander for Ultra-Wide-Angle High-Fidelity\n Holographic Display","summary":" Holographic displays can generate light fields by dynamically modulating the\nwavefront of a coherent beam of light using a spatial light modulator,\npromising rich virtual and augmented reality applications. However, the limited\nspatial resolution of existing dynamic spatial light modulators imposes a tight\nbound on the diffraction angle. As a result, modern holographic displays\npossess low \\'{e}tendue, which is the product of the display area and the\nmaximum solid angle of diffracted light. The low \\'{e}tendue forces a sacrifice\nof either the field-of-view (FOV) or the display size. In this work, we lift\nthis limitation by presenting neural \\'{e}tendue expanders. This new breed of\noptical elements, which is learned from a natural image dataset, enables higher\ndiffraction angles for ultra-wide FOV while maintaining both a compact form\nfactor and the fidelity of displayed contents to human viewers. 
With neural\n\\'{e}tendue expanders, we experimentally achieve 64$\\times$ \\'{e}tendue\nexpansion of natural images in full color, expanding the FOV by an order of\nmagnitude horizontally and vertically, with high-fidelity reconstruction\nquality (measured in PSNR) over 29 dB on retinal-resolution images.\n","authors":["Ethan Tseng","Seung-Hwan Baek","Grace Kuo","Nathan Matsuda","Andrew Maimone","Florian Schiffers","Praneeth Chakravarthula","Qiang Fu","Wolfgang Heidrich","Douglas Lanman","Felix Heide"],"pdf_url":"https://arxiv.org/pdf/2109.08123v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2208.13902v2","updated":"2023-09-12T19:49:09Z","published":"2022-08-29T21:50:43Z","title":"Radial Prediction Domain Adaption Classifier for the MIDOG 2022\n Challenge","summary":" This paper describes our contribution to the MIDOG 2022 challenge for\ndetecting mitotic cells. One of the major problems to be addressed in the MIDOG\n2022 challenge is the robustness under the natural variance that appears for\nreal-life data in the histopathology field. To address the problem, we use an\nadapted YOLOv5s model for object detection in conjunction with a new Domain\nAdaption Classifier (DAC) variant, the Radial-Prediction-DAC, to achieve\nrobustness under domain shifts. In addition, we increase the variability of the\navailable training data using stain augmentation in HED color space. Using the\nsuggested method, we obtain a test set F1-score of 0.6658.\n","authors":["Jonas Annuscheit","Christian Krumnow"],"pdf_url":"https://arxiv.org/pdf/2208.13902v2.pdf","comment":"Contribution to the MIDOG-2022-Challenge"},{"id":"http://arxiv.org/abs/2309.06547v1","updated":"2023-09-12T19:46:15Z","published":"2023-09-12T19:46:15Z","title":"AmodalSynthDrive: A Synthetic Amodal Perception Dataset for Autonomous\n Driving","summary":" Unlike humans, who can effortlessly estimate the entirety of objects even\nwhen partially occluded, modern computer vision algorithms still find this\naspect extremely challenging. Leveraging this amodal perception for autonomous\ndriving remains largely untapped due to the lack of suitable datasets. The\ncuration of these datasets is primarily hindered by significant annotation\ncosts and mitigating annotator subjectivity in accurately labeling occluded\nregions. To address these limitations, we introduce AmodalSynthDrive, a\nsynthetic multi-task multi-modal amodal perception dataset. The dataset\nprovides multi-view camera images, 3D bounding boxes, LiDAR data, and odometry\nfor 150 driving sequences with over 1M object annotations in diverse traffic,\nweather, and lighting conditions. AmodalSynthDrive supports multiple amodal\nscene understanding tasks including the introduced amodal depth estimation for\nenhanced spatial understanding. We evaluate several baselines for each of these\ntasks to illustrate the challenges and set up public benchmarking servers. 
The\ndataset is available at http://amodalsynthdrive.cs.uni-freiburg.de.\n","authors":["Ahmed Rida Sekkat","Rohit Mohan","Oliver Sawade","Elmar Matthes","Abhinav Valada"],"pdf_url":"https://arxiv.org/pdf/2309.06547v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2212.04492v2","updated":"2023-09-12T19:31:07Z","published":"2022-12-08T18:59:02Z","title":"Few-View Object Reconstruction with Unknown Categories and Camera Poses","summary":" While object reconstruction has made great strides in recent years, current\nmethods typically require densely captured images and/or known camera poses,\nand generalize poorly to novel object categories. To step toward object\nreconstruction in the wild, this work explores reconstructing general\nreal-world objects from a few images without known camera poses or object\ncategories. The crux of our work is solving two fundamental 3D vision problems\n-- shape reconstruction and pose estimation -- in a unified approach. Our\napproach captures the synergies of these two problems: reliable camera pose\nestimation gives rise to accurate shape reconstruction, and the accurate\nreconstruction, in turn, induces robust correspondence between different views\nand facilitates pose estimation. Our method FORGE predicts 3D features from\neach view and leverages them in conjunction with the input images to establish\ncross-view correspondence for estimating relative camera poses. The 3D features\nare then transformed by the estimated poses into a shared space and are fused\ninto a neural radiance field. The reconstruction results are rendered by volume\nrendering techniques, enabling us to train the model without 3D shape\nground-truth. Our experiments show that FORGE reliably reconstructs objects\nfrom five views. Our pose estimation method outperforms existing ones by a\nlarge margin. The reconstruction results under predicted poses are comparable\nto the ones using ground-truth poses. The performance on novel testing\ncategories matches the results on categories seen during training. Project\npage: https://ut-austin-rpl.github.io/FORGE/\n","authors":["Hanwen Jiang","Zhenyu Jiang","Kristen Grauman","Yuke Zhu"],"pdf_url":"https://arxiv.org/pdf/2212.04492v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.06528v1","updated":"2023-09-12T19:08:54Z","published":"2023-09-12T19:08:54Z","title":"Strong-Weak Integrated Semi-supervision for Unsupervised Single and\n Multi Target Domain Adaptation","summary":" Unsupervised domain adaptation (UDA) focuses on transferring knowledge\nlearned in the labeled source domain to the unlabeled target domain. Despite\nsignificant progress that has been achieved in single-target domain adaptation\nfor image classification in recent years, the extension from single-target to\nmulti-target domain adaptation is still a largely unexplored problem area. In\ngeneral, unsupervised domain adaptation faces a major challenge when attempting\nto learn reliable information from a single unlabeled target domain. Increasing\nthe number of unlabeled target domains further exacerbate the problem rather\nsignificantly. In this paper, we propose a novel strong-weak integrated\nsemi-supervision (SWISS) learning strategy for image classification using\nunsupervised domain adaptation that works well for both single-target and\nmulti-target scenarios. 
Under the proposed SWISS-UDA framework, a strong\nrepresentative set with high confidence but low diversity target domain samples\nand a weak representative set with low confidence but high diversity target\ndomain samples are updated constantly during the training process. Both sets\nare fused to generate an augmented strong-weak training batch with\npseudo-labels to train the network during every iteration. The extension from\nsingle-target to multi-target domain adaptation is accomplished by exploring\nthe class-wise distance relationship between domains and replacing the strong\nrepresentative set with much stronger samples from peer domains via peer\nscaffolding. Moreover, a novel adversarial logit loss is proposed to reduce the\nintra-class divergence between source and target domains, which is\nback-propagated adversarially with a gradient reverse layer between the\nclassifier and the rest of the network. Experimental results based on three\nbenchmarks, Office-31, Office-Home, and DomainNet, show the effectiveness of\nthe proposed SWISS framework.\n","authors":["Xiaohu Lu","Hayder Radha"],"pdf_url":"https://arxiv.org/pdf/2309.06528v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2305.06121v2","updated":"2023-09-12T19:07:51Z","published":"2023-05-10T13:11:23Z","title":"Transformer-based model for monocular visual odometry: a video\n understanding approach","summary":" Estimating the camera's pose given images of a single camera is a traditional\ntask in mobile robots and autonomous vehicles. This problem is called monocular\nvisual odometry and it often relies on geometric approaches that require\nconsiderable engineering effort for a specific scenario. Deep learning methods\nhave shown to be generalizable after proper training and a large amount of\navailable data. Transformer-based architectures have dominated the\nstate-of-the-art in natural language processing and computer vision tasks, such\nas image and video understanding. In this work, we deal with the monocular\nvisual odometry as a video understanding task to estimate the 6-DoF camera's\npose. We contribute by presenting the TSformer-VO model based on\nspatio-temporal self-attention mechanisms to extract features from clips and\nestimate the motions in an end-to-end manner. Our approach achieved competitive\nstate-of-the-art performance compared with geometry-based and deep\nlearning-based methods on the KITTI visual odometry dataset, outperforming the\nDeepVO implementation highly accepted in the visual odometry community.\n","authors":["André O. Françani","Marcos R. O. A. Maximo"],"pdf_url":"https://arxiv.org/pdf/2305.06121v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.06521v1","updated":"2023-09-12T18:51:28Z","published":"2023-09-12T18:51:28Z","title":"Ethnicity and Biometric Uniqueness: Iris Pattern Individuality in a West\n African Database","summary":" We conducted more than 1.3 million comparisons of iris patterns encoded from\nimages collected at two Nigerian universities, which constitute the newly\navailable African Human Iris (AFHIRIS) database. The purpose was to discover\nwhether ethnic differences in iris structure and appearance such as the\ntextural feature size, as contrasted with an all-Chinese image database or an\nAmerican database in which only 1.53% were of African-American heritage, made a\nmaterial difference for iris discrimination. 
We measured a reduction in entropy\nfor the AFHIRIS database due to the coarser iris features created by the thick\nanterior layer of melanocytes, and we found stochastic parameters that\naccurately model the relevant empirical distributions. Quantile-Quantile\nanalysis revealed that a very small change in operational decision thresholds\nfor the African database would compensate for the reduced entropy and generate\nthe same performance in terms of resistance to False Matches. We conclude that\ndespite demographic difference, individuality can be robustly discerned by\ncomparison of iris patterns in this West African population.\n","authors":["John Daugman","Cathryn Downing","Oluwatobi Noah Akande","Oluwakemi Christiana Abikoye"],"pdf_url":"https://arxiv.org/pdf/2309.06521v1.pdf","comment":"8 pages, 8 Figures"},{"id":"http://arxiv.org/abs/2307.10475v2","updated":"2023-09-12T18:51:05Z","published":"2023-07-19T22:14:49Z","title":"Findings of Factify 2: Multimodal Fake News Detection","summary":" With social media usage growing exponentially in the past few years, fake\nnews has also become extremely prevalent. The detrimental impact of fake news\nemphasizes the need for research focused on automating the detection of false\ninformation and verifying its accuracy. In this work, we present the outcome of\nthe Factify 2 shared task, which provides a multi-modal fact verification and\nsatire news dataset, as part of the DeFactify 2 workshop at AAAI'23. The data\ncalls for a comparison based approach to the task by pairing social media\nclaims with supporting documents, with both text and image, divided into 5\nclasses based on multi-modal relations. In the second iteration of this task we\nhad over 60 participants and 9 final test-set submissions. The best\nperformances came from the use of DeBERTa for text and Swinv2 and CLIP for\nimage. The highest F1 score averaged for all five classes was 81.82%.\n","authors":["S Suryavardan","Shreyash Mishra","Megha Chakraborty","Parth Patwa","Anku Rani","Aman Chadha","Aishwarya Reganti","Amitava Das","Amit Sheth","Manoj Chinnakotla","Asif Ekbal","Srijan Kumar"],"pdf_url":"https://arxiv.org/pdf/2307.10475v2.pdf","comment":"Defactify2 @AAAI 2023"},{"id":"http://arxiv.org/abs/2308.13442v2","updated":"2023-09-12T18:41:16Z","published":"2023-08-25T15:42:19Z","title":"Unlocking Fine-Grained Details with Wavelet-based High-Frequency\n Enhancement in Transformers","summary":" Medical image segmentation is a critical task that plays a vital role in\ndiagnosis, treatment planning, and disease monitoring. Accurate segmentation of\nanatomical structures and abnormalities from medical images can aid in the\nearly detection and treatment of various diseases. In this paper, we address\nthe local feature deficiency of the Transformer model by carefully re-designing\nthe self-attention map to produce accurate dense prediction in medical images.\nTo this end, we first apply the wavelet transformation to decompose the input\nfeature map into low-frequency (LF) and high-frequency (HF) subbands. The LF\nsegment is associated with coarse-grained features while the HF components\npreserve fine-grained features such as texture and edge information. Next, we\nreformulate the self-attention operation using the efficient Transformer to\nperform both spatial and context attention on top of the frequency\nrepresentation. 
Furthermore, to intensify the importance of the boundary\ninformation, we impose an additional attention map by creating a Gaussian\npyramid on top of the HF components. Moreover, we propose a multi-scale context\nenhancement block within skip connections to adaptively model inter-scale\ndependencies to overcome the semantic gap among stages of the encoder and\ndecoder modules. Throughout comprehensive experiments, we demonstrate the\neffectiveness of our strategy on multi-organ and skin lesion segmentation\nbenchmarks. The implementation code will be available upon acceptance.\n\\href{https://github.com/mindflow-institue/WaveFormer}{GitHub}.\n","authors":["Reza Azad","Amirhossein Kazerouni","Alaa Sulaiman","Afshin Bozorgpour","Ehsan Khodapanah Aghdam","Abin Jose","Dorit Merhof"],"pdf_url":"https://arxiv.org/pdf/2308.13442v2.pdf","comment":"Accepted in MICCAI 2023 workshop MLMI"},{"id":"http://arxiv.org/abs/2309.06511v1","updated":"2023-09-12T18:37:05Z","published":"2023-09-12T18:37:05Z","title":"DF-TransFusion: Multimodal Deepfake Detection via Lip-Audio\n Cross-Attention and Facial Self-Attention","summary":" With the rise in manipulated media, deepfake detection has become an\nimperative task for preserving the authenticity of digital content. In this\npaper, we present a novel multi-modal audio-video framework designed to\nconcurrently process audio and video inputs for deepfake detection tasks. Our\nmodel capitalizes on lip synchronization with input audio through a\ncross-attention mechanism while extracting visual cues via a fine-tuned VGG-16\nnetwork. Subsequently, a transformer encoder network is employed to perform\nfacial self-attention. We conduct multiple ablation studies highlighting\ndifferent strengths of our approach. Our multi-modal methodology outperforms\nstate-of-the-art multi-modal deepfake detection techniques in terms of F-1 and\nper-video AUC scores.\n","authors":["Aaditya Kharel","Manas Paranjape","Aniket Bera"],"pdf_url":"https://arxiv.org/pdf/2309.06511v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.05071v2","updated":"2023-09-12T18:28:40Z","published":"2023-09-10T16:32:02Z","title":"Super-Resolution Surface Reconstruction from Few Low-Resolution Slices","summary":" In many imaging applications where segmented features (e.g. blood vessels)\nare further used for other numerical simulations (e.g. finite element\nanalysis), the obtained surfaces do not have fine resolutions suitable for the\ntask. Increasing the resolution of such surfaces becomes crucial. This paper\nproposes a new variational model for solving this problem, based on an\nEuler-Elastica-based regulariser. Further, we propose and implement two\nnumerical algorithms for solving the model, a projected gradient descent method\nand the alternating direction method of multipliers. Numerical experiments\nusing real-life examples (including two from outputs of another variational\nmodel) have been illustrated for effectiveness. 
The advantages of the new model\nare shown through quantitative comparisons by the standard deviation of\nGaussian curvatures and mean curvatures from the viewpoint of discrete\ngeometry.\n","authors":["Yiyao Zhang","Ke Chen","Shang-Hua Yang"],"pdf_url":"https://arxiv.org/pdf/2309.05071v2.pdf","comment":"33 pages, 25 figures"},{"id":"http://arxiv.org/abs/2309.06462v1","updated":"2023-09-12T17:56:06Z","published":"2023-09-12T17:56:06Z","title":"Action Segmentation Using 2D Skeleton Heatmaps","summary":" This paper presents a 2D skeleton-based action segmentation method with\napplications in fine-grained human activity recognition. In contrast with\nstate-of-the-art methods which directly take sequences of 3D skeleton\ncoordinates as inputs and apply Graph Convolutional Networks (GCNs) for\nspatiotemporal feature learning, our main idea is to use sequences of 2D\nskeleton heatmaps as inputs and employ Temporal Convolutional Networks (TCNs)\nto extract spatiotemporal features. Despite lacking 3D information, our\napproach yields comparable/superior performances and better robustness against\nmissing keypoints than previous methods on action segmentation datasets.\nMoreover, we improve the performances further by using both 2D skeleton\nheatmaps and RGB videos as inputs. To the best of our knowledge, this is the first\nwork to utilize 2D skeleton heatmap inputs and the first work to explore 2D\nskeleton+RGB fusion for action segmentation.\n","authors":["Syed Waleed Hyder","Muhammad Usama","Anas Zafar","Muhammad Naufil","Andrey Konin","M. Zeeshan Zia","Quoc-Huy Tran"],"pdf_url":"https://arxiv.org/pdf/2309.06462v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.04573v2","updated":"2023-09-12T14:36:02Z","published":"2023-09-08T20:07:18Z","title":"Mask2Anomaly: Mask Transformer for Universal Open-set Segmentation","summary":" Segmenting unknown or anomalous object instances is a critical task in\nautonomous driving applications, and it is approached traditionally as a\nper-pixel classification problem. However, reasoning individually about each\npixel without considering their contextual semantics results in high\nuncertainty around the objects' boundaries and numerous false positives. We\npropose a paradigm change by shifting from a per-pixel classification to a mask\nclassification. Our mask-based method, Mask2Anomaly, demonstrates the\nfeasibility of integrating a mask-classification architecture to jointly\naddress anomaly segmentation, open-set semantic segmentation, and open-set\npanoptic segmentation. Mask2Anomaly includes several technical novelties that\nare designed to improve the detection of anomalies/unknown objects: i) a global\nmasked attention module to focus individually on the foreground and background\nregions; ii) a mask contrastive learning that maximizes the margin between an\nanomaly and known classes; iii) a mask refinement solution to reduce false\npositives; and iv) a novel approach to mine unknown instances based on the\nmask-architecture properties. By comprehensive qualitative and quantitative\nevaluation, we show Mask2Anomaly achieves new state-of-the-art results across\nthe benchmarks of anomaly segmentation, open-set semantic segmentation, and\nopen-set panoptic segmentation.\n","authors":["Shyam Nandan Rai","Fabio Cermelli","Barbara Caputo","Carlo Masone"],"pdf_url":"https://arxiv.org/pdf/2309.04573v2.pdf","comment":"16 pages. 
arXiv admin note: substantial text overlap with\n arXiv:2307.13316"}],"Information Retrieval":[{"id":"http://arxiv.org/abs/2202.02757v3","updated":"2023-09-12T13:59:20Z","published":"2022-02-06T11:05:47Z","title":"A Review of Modern Fashion Recommender Systems","summary":" The textile and apparel industries have grown tremendously over the last few\nyears. Customers no longer have to visit many stores, stand in long queues, or\ntry on garments in dressing rooms as millions of products are now available in\nonline catalogs. However, given the plethora of options available, an effective\nrecommendation system is necessary to properly sort, order, and communicate\nrelevant product material or information to users. Effective fashion RS can\nhave a noticeable impact on billions of customers' shopping experiences and\nincrease sales and revenues on the provider side. The goal of this survey is to\nprovide a review of recommender systems that operate in the specific vertical\ndomain of garment and fashion products. We have identified the most pressing\nchallenges in fashion RS research and created a taxonomy that categorizes the\nliterature according to the objective they are trying to accomplish (e.g., item\nor outfit recommendation, size recommendation, explainability, among others)\nand type of side-information (users, items, context). We have also identified\nthe most important evaluation goals and perspectives (outfit generation, outfit\nrecommendation, pairing recommendation, and fill-in-the-blank outfit\ncompatibility prediction) and the most commonly used datasets and evaluation\nmetrics.\n","authors":["Yashar Deldjoo","Fatemeh Nazary","Arnau Ramisa","Julian Mcauley","Giovanni Pellegrini","Alejandro Bellogin","Tommaso Di Noia"],"pdf_url":"https://arxiv.org/pdf/2202.02757v3.pdf","comment":"38 pages, 2 figures"},{"id":"http://arxiv.org/abs/2309.06219v1","updated":"2023-09-12T13:38:44Z","published":"2023-09-12T13:38:44Z","title":"Human Action Co-occurrence in Lifestyle Vlogs using Graph Link\n Prediction","summary":" We introduce the task of automatic human action co-occurrence identification,\ni.e., determine whether two human actions can co-occur in the same interval of\ntime. We create and make publicly available the ACE (Action Co-occurrencE)\ndataset, consisting of a large graph of ~12k co-occurring pairs of visual\nactions and their corresponding video clips. We describe graph link prediction\nmodels that leverage visual and textual information to automatically infer if\ntwo actions are co-occurring. We show that graphs are particularly well suited\nto capture relations between human actions, and the learned graph\nrepresentations are effective for our task and capture novel and relevant\ninformation across different data domains. 
The ACE dataset and the code\nintroduced in this paper are publicly available at\nhttps://github.com/MichiganNLP/vlog_action_co-occurrence.\n","authors":["Oana Ignat","Santiago Castro","Weiji Li","Rada Mihalcea"],"pdf_url":"https://arxiv.org/pdf/2309.06219v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.06217v1","updated":"2023-09-12T13:34:33Z","published":"2023-09-12T13:34:33Z","title":"HAMUR: Hyper Adapter for Multi-Domain Recommendation","summary":" Multi-Domain Recommendation (MDR) has gained significant attention in recent\nyears; it leverages data from multiple domains to enhance their performance\nconcurrently. However, current MDR models are confronted with two limitations.\nFirstly, the majority of these models adopt an approach that explicitly shares\nparameters between domains, leading to mutual interference among them.\nSecondly, due to the distribution differences among domains, the utilization of\nstatic parameters in existing methods limits their flexibility to adapt to\ndiverse domains. To address these challenges, we propose a novel model, Hyper\nAdapter for Multi-Domain Recommendation (HAMUR). Specifically, HAMUR consists\nof two components: (1) a domain-specific adapter, designed as a pluggable module\nthat can be seamlessly integrated into various existing multi-domain backbone\nmodels, and (2) a domain-shared hyper-network, which implicitly captures shared\ninformation among domains and dynamically generates the parameters for the\nadapter. We conduct extensive experiments on two public datasets using various\nbackbone networks. The experimental results validate the effectiveness and\nscalability of the proposed model.\n","authors":["Xiaopeng Li","Fan Yan","Xiangyu Zhao","Yichao Wang","Bo Chen","Huifeng Guo","Ruiming Tang"],"pdf_url":"https://arxiv.org/pdf/2309.06217v1.pdf","comment":"Accepted by CIKM'2023"},{"id":"http://arxiv.org/abs/2309.06192v1","updated":"2023-09-12T13:01:20Z","published":"2023-09-12T13:01:20Z","title":"Improving and Evaluating the Detection of Fragmentation in News\n Recommendations with the Clustering of News Story Chains","summary":" News recommender systems play an increasingly influential role in shaping\ninformation access within democratic societies. However, tailoring\nrecommendations to users' specific interests can result in the divergence of\ninformation streams. Fragmented access to information poses challenges to the\nintegrity of the public sphere, thereby influencing democracy and public\ndiscourse. The Fragmentation metric quantifies the degree of fragmentation of\ninformation streams in news recommendations. Accurate measurement of this\nmetric requires the application of Natural Language Processing (NLP) to\nidentify distinct news events, stories, or timelines. This paper presents an\nextensive investigation of various approaches for quantifying Fragmentation in\nnews recommendations. These approaches are evaluated both intrinsically, by\nmeasuring performance on news story clustering, and extrinsically, by assessing\nthe Fragmentation scores of different simulated news recommender scenarios. Our\nfindings demonstrate that agglomerative hierarchical clustering coupled with\nSentenceBERT text representation is substantially better at detecting\nFragmentation than earlier implementations. 
Additionally, the analysis of\nsimulated scenarios yields valuable insights and recommendations for\nstakeholders concerning the measurement and interpretation of Fragmentation.\n","authors":["Alessandra Polimeno","Myrthe Reuver","Sanne Vrijenhoek","Antske Fokkens"],"pdf_url":"https://arxiv.org/pdf/2309.06192v1.pdf","comment":"Cite published version: Polimeno et. al., Improving and Evaluating\n the Detection of Fragmentation in News Recommendations with the Clustering of\n News Story Chains, NORMalize 2023: The First Workshop on the Normative Design\n and Evaluation of Recommender Systems, September 19, 2023, co-located with\n the ACM Conference on Recommender Systems 2023 (RecSys 2023), Singapore"},{"id":"http://arxiv.org/abs/2309.06175v1","updated":"2023-09-12T12:37:37Z","published":"2023-09-12T12:37:37Z","title":"AKEM: Aligning Knowledge Base to Queries with Ensemble Model for Entity\n Recognition and Linking","summary":" This paper presents a novel approach to address the Entity Recognition and\nLinking Challenge at NLPCC 2015. The task involves extracting named entity\nmentions from short search queries and linking them to entities within a\nreference Chinese knowledge base. To tackle this problem, we first expand the\nexisting knowledge base and utilize external knowledge to identify candidate\nentities, thereby improving the recall rate. Next, we extract features from the\ncandidate entities and utilize Support Vector Regression and Multiple Additive\nRegression Tree as scoring functions to filter the results. Additionally, we\napply rules to further refine the results and enhance precision. Our method is\ncomputationally efficient and achieves an F1 score of 0.535.\n","authors":["Di Lu","Zhongping Liang","Caixia Yuan","Xiaojie Wang"],"pdf_url":"https://arxiv.org/pdf/2309.06175v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.06131v1","updated":"2023-09-12T11:17:42Z","published":"2023-09-12T11:17:42Z","title":"Annotating Data for Fine-Tuning a Neural Ranker? Current Active Learning\n Strategies are not Better than Random Selection","summary":" Search methods based on Pretrained Language Models (PLM) have demonstrated\ngreat effectiveness gains compared to statistical and early neural ranking\nmodels. However, fine-tuning PLM-based rankers requires a great amount of\nannotated training data. Annotating data involves a large manual effort and\nthus is expensive, especially in domain specific tasks. In this paper we\ninvestigate fine-tuning PLM-based rankers under limited training data and\nbudget. We investigate two scenarios: fine-tuning a ranker from scratch, and\ndomain adaptation starting with a ranker already fine-tuned on general data,\nand continuing fine-tuning on a target dataset. We observe a great variability\nin effectiveness when fine-tuning on different randomly selected subsets of\ntraining data. This suggests that it is possible to achieve effectiveness gains\nby actively selecting a subset of the training data that has the most positive\neffect on the rankers. This way, it would be possible to fine-tune effective\nPLM rankers at a reduced annotation budget. To investigate this, we adapt\nexisting Active Learning (AL) strategies to the task of fine-tuning PLM rankers\nand investigate their effectiveness, also considering annotation and\ncomputational costs. Our extensive analysis shows that AL strategies do not\nsignificantly outperform random selection of training subsets in terms of\neffectiveness. 
We further find that gains provided by AL strategies come at the\nexpense of more assessments (thus higher annotation costs) and AL strategies\nunderperform random selection when comparing effectiveness given a fixed\nannotation cost. Our results highlight that ``optimal'' subsets of training\ndata that provide high effectiveness at low annotation cost do exist, but\ncurrent mainstream AL strategies applied to PLM rankers are not capable of\nidentifying them.\n","authors":["Sophia Althammer","Guido Zuccon","Sebastian Hofstätter","Suzan Verberne","Allan Hanbury"],"pdf_url":"https://arxiv.org/pdf/2309.06131v1.pdf","comment":"Accepted at SIGIR-AP 2023"},{"id":"http://arxiv.org/abs/2309.06112v1","updated":"2023-09-12T10:27:39Z","published":"2023-09-12T10:27:39Z","title":"Characterizing Latent Perspectives of Media Houses Towards Public\n Figures","summary":" Media houses reporting on public figures, often come with their own biases\nstemming from their respective worldviews. A characterization of these\nunderlying patterns helps us in better understanding and interpreting news\nstories. For this, we need diverse or subjective summarizations, which may not\nbe amenable for classifying into predefined class labels. This work proposes a\nzero-shot approach for non-extractive or generative characterizations of person\nentities from a corpus using GPT-2. We use well-articulated articles from\nseveral well-known news media houses as a corpus to build a sound argument for\nthis approach. First, we fine-tune a GPT-2 pre-trained language model with a\ncorpus where specific person entities are characterized. Second, we further\nfine-tune this with demonstrations of person entity characterizations, created\nfrom a corpus of programmatically constructed characterizations. This twice\nfine-tuned model is primed with manual prompts consisting of entity names that\nwere not previously encountered in the second fine-tuning, to generate a simple\nsentence about the entity. The results were encouraging, when compared against\nactual characterizations from the corpus.\n","authors":["Sharath Srivatsa","Srinath Srinivasa"],"pdf_url":"https://arxiv.org/pdf/2309.06112v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2304.10711v3","updated":"2023-09-12T09:33:03Z","published":"2023-04-21T02:48:29Z","title":"EulerNet: Adaptive Feature Interaction Learning via Euler's Formula for\n CTR Prediction","summary":" Learning effective high-order feature interactions is very crucial in the CTR\nprediction task. However, it is very time-consuming to calculate high-order\nfeature interactions with massive features in online e-commerce platforms. Most\nexisting methods manually design a maximal order and further filter out the\nuseless interactions from them. Although they reduce the high computational\ncosts caused by the exponential growth of high-order feature combinations, they\nstill suffer from the degradation of model capability due to the suboptimal\nlearning of the restricted feature orders. The solution to maintain the model\ncapability and meanwhile keep it efficient is a technical challenge, which has\nnot been adequately addressed. To address this issue, we propose an adaptive\nfeature interaction learning model, named as EulerNet, in which the feature\ninteractions are learned in a complex vector space by conducting space mapping\naccording to Euler's formula. 
EulerNet converts the exponential powers of\nfeature interactions into simple linear combinations of the modulus and phase\nof the complex features, making it possible to adaptively learn the high-order\nfeature interactions in an efficient way. Furthermore, EulerNet incorporates\nthe implicit and explicit feature interactions into a unified architecture,\nwhich achieves the mutual enhancement and largely boosts the model\ncapabilities. Such a network can be fully learned from data, with no need of\npre-designed form or order for feature interactions. Extensive experiments\nconducted on three public datasets have demonstrated the effectiveness and\nefficiency of our approach. Our code is available at:\nhttps://github.com/RUCAIBox/EulerNet.\n","authors":["Zhen Tian","Ting Bai","Wayne Xin Zhao","Ji-Rong Wen","Zhao Cao"],"pdf_url":"https://arxiv.org/pdf/2304.10711v3.pdf","comment":"10 pages, 7 figures, accepted for publication in SIGIR'23"},{"id":"http://arxiv.org/abs/2309.05961v1","updated":"2023-09-12T05:03:28Z","published":"2023-09-12T05:03:28Z","title":"Evaluating the Ebb and Flow: An In-depth Analysis of Question-Answering\n Trends across Diverse Platforms","summary":" Community Question Answering (CQA) platforms steadily gain popularity as they\nprovide users with fast responses to their queries. The swiftness of these\nresponses is contingent on a mixture of query-specific and user-related\nelements. This paper scrutinizes these contributing factors within the context\nof six highly popular CQA platforms, identified through their standout\nanswering speed. Our investigation reveals a correlation between the time taken\nto yield the first response to a question and several variables: the metadata,\nthe formulation of the questions, and the level of interaction among users.\nAdditionally, by employing conventional machine learning models to analyze\nthese metadata and patterns of user interaction, we endeavor to predict which\nqueries will receive their initial responses promptly.\n","authors":["Rima Hazra","Agnik Saha","Somnath Banerjee","Animesh Mukherjee"],"pdf_url":"https://arxiv.org/pdf/2309.05961v1.pdf","comment":"Under review"},{"id":"http://arxiv.org/abs/2304.07041v3","updated":"2023-09-12T04:51:25Z","published":"2023-04-14T10:29:18Z","title":"A Diffusion model for POI recommendation","summary":" Next Point-of-Interest (POI) recommendation is a critical task in\nlocation-based services that aim to provide personalized suggestions for the\nuser's next destination. Previous works on POI recommendation have laid focused\non modeling the user's spatial preference. However, existing works that\nleverage spatial information are only based on the aggregation of users'\nprevious visited positions, which discourages the model from recommending POIs\nin novel areas. This trait of position-based methods will harm the model's\nperformance in many situations. Additionally, incorporating sequential\ninformation into the user's spatial preference remains a challenge. In this\npaper, we propose Diff-POI: a Diffusion-based model that samples the user's\nspatial preference for the next POI recommendation. Inspired by the wide\napplication of diffusion algorithm in sampling from distributions, Diff-POI\nencodes the user's visiting sequence and spatial character with two\ntailor-designed graph encoding modules, followed by a diffusion-based sampling\nstrategy to explore the user's spatial visiting trends. 
We leverage the\ndiffusion process and its reversed form to sample from the posterior\ndistribution and optimized the corresponding score function. We design a joint\ntraining and inference framework to optimize and evaluate the proposed\nDiff-POI. Extensive experiments on four real-world POI recommendation datasets\ndemonstrate the superiority of our Diff-POI over state-of-the-art baseline\nmethods. Further ablation and parameter studies on Diff-POI reveal the\nfunctionality and effectiveness of the proposed diffusion-based sampling\nstrategy for addressing the limitations of existing methods.\n","authors":["Yifang Qin","Hongjun Wu","Wei Ju","Xiao Luo","Ming Zhang"],"pdf_url":"https://arxiv.org/pdf/2304.07041v3.pdf","comment":"Accepted by ACM Transactions on Information Systems (TOIS 2023)"},{"id":"http://arxiv.org/abs/2309.05953v1","updated":"2023-09-12T04:21:30Z","published":"2023-09-12T04:21:30Z","title":"GLAD: Content-aware Dynamic Graphs For Log Anomaly Detection","summary":" Logs play a crucial role in system monitoring and debugging by recording\nvaluable system information, including events and states. Although various\nmethods have been proposed to detect anomalies in log sequences, they often\noverlook the significance of considering relations among system components,\nsuch as services and users, which can be identified from log contents.\nUnderstanding these relations is vital for detecting anomalies and their\nunderlying causes. To address this issue, we introduce GLAD, a Graph-based Log\nAnomaly Detection framework designed to detect relational anomalies in system\nlogs. GLAD incorporates log semantics, relational patterns, and sequential\npatterns into a unified framework for anomaly detection. Specifically, GLAD\nfirst introduces a field extraction module that utilizes prompt-based few-shot\nlearning to identify essential fields from log contents. Then GLAD constructs\ndynamic log graphs for sliding windows by interconnecting extracted fields and\nlog events parsed from the log parser. These graphs represent events and fields\nas nodes and their relations as edges. Subsequently, GLAD utilizes a\ntemporal-attentive graph edge anomaly detection model for identifying anomalous\nrelations in these dynamic log graphs. This model employs a Graph Neural\nNetwork (GNN)-based encoder enhanced with transformers to capture content,\nstructural and temporal features. We evaluate our proposed method on three\ndatasets, and the results demonstrate the effectiveness of GLAD in detecting\nanomalies indicated by varying relational patterns.\n","authors":["Yufei Li","Yanchi Liu","Haoyu Wang","Zhengzhang Chen","Wei Cheng","Yuncong Chen","Wenchao Yu","Haifeng Chen","Cong Liu"],"pdf_url":"https://arxiv.org/pdf/2309.05953v1.pdf","comment":"Accepted by ICKG 2023"},{"id":"http://arxiv.org/abs/2309.05922v1","updated":"2023-09-12T02:34:06Z","published":"2023-09-12T02:34:06Z","title":"A Survey of Hallucination in Large Foundation Models","summary":" Hallucination in a foundation model (FM) refers to the generation of content\nthat strays from factual reality or includes fabricated information. This\nsurvey paper provides an extensive overview of recent efforts that aim to\nidentify, elucidate, and tackle the problem of hallucination, with a particular\nfocus on ``Large'' Foundation Models (LFMs). The paper classifies various types\nof hallucination phenomena that are specific to LFMs and establishes evaluation\ncriteria for assessing the extent of hallucination. 
It also examines existing\nstrategies for mitigating hallucination in LFMs and discusses potential\ndirections for future research in this area. Essentially, the paper offers a\ncomprehensive examination of the challenges and solutions related to\nhallucination in LFMs.\n","authors":["Vipula Rawte","Amit Sheth","Amitava Das"],"pdf_url":"https://arxiv.org/pdf/2309.05922v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.05920v1","updated":"2023-09-12T02:24:16Z","published":"2023-09-12T02:24:16Z","title":"SAGE: Structured Attribute Value Generation for Billion-Scale Product\n Catalogs","summary":" We introduce SAGE; a Generative LLM for inferring attribute values for\nproducts across world-wide e-Commerce catalogs. We introduce a novel\nformulation of the attribute-value prediction problem as a Seq2Seq\nsummarization task, across languages, product types and target attributes. Our\nnovel modeling approach lifts the restriction of predicting attribute values\nwithin a pre-specified set of choices, as well as, the requirement that the\nsought attribute values need to be explicitly mentioned in the text. SAGE can\ninfer attribute values even when such values are mentioned implicitly using\nperiphrastic language, or not-at-all-as is the case for common-sense defaults.\nAdditionally, SAGE is capable of predicting whether an attribute is\ninapplicable for the product at hand, or non-obtainable from the available\ninformation. SAGE is the first method able to tackle all aspects of the\nattribute-value-prediction task as they arise in practical settings in\ne-Commerce catalogs. A comprehensive set of experiments demonstrates the\neffectiveness of the proposed approach, as well as, its superiority against\nstate-of-the-art competing alternatives. Moreover, our experiments highlight\nSAGE's ability to tackle the task of predicting attribute values in zero-shot\nsetting; thereby, opening up opportunities for significantly reducing the\noverall number of labeled examples required for training.\n","authors":["Athanasios N. Nikolakopoulos","Swati Kaul","Siva Karthik Gade","Bella Dubrov","Umit Batur","Suleiman Ali Khan"],"pdf_url":"https://arxiv.org/pdf/2309.05920v1.pdf","comment":"(17 pages)"},{"id":"http://arxiv.org/abs/2309.05892v1","updated":"2023-09-12T00:34:02Z","published":"2023-09-12T00:34:02Z","title":"Distributionally-Informed Recommender System Evaluation","summary":" Current practice for evaluating recommender systems typically focuses on\npoint estimates of user-oriented effectiveness metrics or business metrics,\nsometimes combined with additional metrics for considerations such as diversity\nand novelty. In this paper, we argue for the need for researchers and\npractitioners to attend more closely to various distributions that arise from a\nrecommender system (or other information access system) and the sources of\nuncertainty that lead to these distributions. One immediate implication of our\nargument is that both researchers and practitioners must report and examine\nmore thoroughly the distribution of utility between and within different\nstakeholder groups. However, distributions of various forms arise in many more\naspects of the recommender systems experimental process, and distributional\nthinking has substantial ramifications for how we design, evaluate, and present\nrecommender systems evaluation and research results. 
Leveraging and emphasizing\ndistributions in the evaluation of recommender systems is a necessary step to\nensure that the systems provide appropriate and equitably-distributed benefit\nto the people they affect.\n","authors":["Michael D. Ekstrand","Ben Carterette","Fernando Diaz"],"pdf_url":"https://arxiv.org/pdf/2309.05892v1.pdf","comment":"Accepted to ACM Transactions on Recommender Systems"},{"id":"http://arxiv.org/abs/2302.03735v3","updated":"2023-09-12T23:01:07Z","published":"2023-02-07T20:12:59Z","title":"Pre-train, Prompt and Recommendation: A Comprehensive Survey of Language\n Modelling Paradigm Adaptations in Recommender Systems","summary":" The emergence of Pre-trained Language Models (PLMs) has achieved tremendous\nsuccess in the field of Natural Language Processing (NLP) by learning universal\nrepresentations on large corpora in a self-supervised manner. The pre-trained\nmodels and the learned representations can be beneficial to a series of\ndownstream NLP tasks. This training paradigm has recently been adapted to the\nrecommendation domain and is considered a promising approach by both academia\nand industry. In this paper, we systematically investigate how to extract and\ntransfer knowledge from pre-trained models learned by different PLM-related\ntraining paradigms to improve recommendation performance from various\nperspectives, such as generality, sparsity, efficiency and effectiveness.\nSpecifically, we propose a comprehensive taxonomy to divide existing PLM-based\nrecommender systems w.r.t. their training strategies and objectives. Then, we\nanalyze and summarize the connection between PLM-based training paradigms and\ndifferent input data types for recommender systems. Finally, we elaborate on\nopen issues and future research directions in this vibrant field.\n","authors":["Peng Liu","Lemei Zhang","Jon Atle Gulla"],"pdf_url":"https://arxiv.org/pdf/2302.03735v3.pdf","comment":"Accepted for publication at Transactions of the Association for\n Computational Linguistics (TACL) in September 2023"},{"id":"http://arxiv.org/abs/2309.06533v1","updated":"2023-09-12T19:11:34Z","published":"2023-09-12T19:11:34Z","title":"Hierarchical Multi-Task Learning Framework for Session-based\n Recommendations","summary":" While session-based recommender systems (SBRSs) have shown superior\nrecommendation performance, multi-task learning (MTL) has been adopted by SBRSs\nto enhance their prediction accuracy and generalizability further. Hierarchical\nMTL (H-MTL) sets a hierarchical structure between prediction tasks and feeds\noutputs from auxiliary tasks to main tasks. This hierarchy leads to richer\ninput features for main tasks and higher interpretability of predictions,\ncompared to existing MTL frameworks. However, the H-MTL framework has not been\ninvestigated in SBRSs yet. In this paper, we propose HierSRec which\nincorporates the H-MTL architecture into SBRSs. HierSRec encodes a given\nsession with a metadata-aware Transformer and performs next-category prediction\n(i.e., auxiliary task) with the session encoding. Next, HierSRec conducts\nnext-item prediction (i.e., main task) with the category prediction result and\nsession encoding. For scalable inference, HierSRec creates a compact set of\ncandidate items (e.g., 4% of total items) per test example using the category\nprediction. Experiments show that HierSRec outperforms existing SBRSs as per\nnext-item prediction accuracy on two session-based recommendation datasets. 
The\naccuracy of HierSRec measured with the carefully-curated candidate items aligns\nwith the accuracy of HierSRec calculated with all items, which validates the\nusefulness of our candidate generation scheme via H-MTL.\n","authors":["Sejoon Oh","Walid Shalaby","Amir Afsharinejad","Xiquan Cui"],"pdf_url":"https://arxiv.org/pdf/2309.06533v1.pdf","comment":"Accepted at the 6th Workshop on Online Recommender Systems and User\n Modeling @ ACM RecSys 2023"}],"Machine Learning":[{"id":"http://arxiv.org/abs/2309.06440v1","updated":"2023-09-12T17:59:20Z","published":"2023-09-12T17:59:20Z","title":"LEAP Hand: Low-Cost, Efficient, and Anthropomorphic Hand for Robot\n Learning","summary":" Dexterous manipulation has been a long-standing challenge in robotics. While\nmachine learning techniques have shown some promise, results have largely been\ncurrently limited to simulation. This can be mostly attributed to the lack of\nsuitable hardware. In this paper, we present LEAP Hand, a low-cost dexterous\nand anthropomorphic hand for machine learning research. In contrast to previous\nhands, LEAP Hand has a novel kinematic structure that allows maximal dexterity\nregardless of finger pose. LEAP Hand is low-cost and can be assembled in 4\nhours at a cost of 2000 USD from readily available parts. It is capable of\nconsistently exerting large torques over long durations of time. We show that\nLEAP Hand can be used to perform several manipulation tasks in the real world\n-- from visual teleoperation to learning from passive video data and sim2real.\nLEAP Hand significantly outperforms its closest competitor Allegro Hand in all\nour experiments while being 1/8th of the cost. We release detailed assembly\ninstructions, the Sim2Real pipeline and a development platform with useful APIs\non our website at https://leap-hand.github.io/\n","authors":["Kenneth Shaw","Ananye Agarwal","Deepak Pathak"],"pdf_url":"https://arxiv.org/pdf/2309.06440v1.pdf","comment":"Website at https://leap-hand.github.io/"},{"id":"http://arxiv.org/abs/2309.06424v1","updated":"2023-09-12T17:40:49Z","published":"2023-09-12T17:40:49Z","title":"Unveiling the potential of large language models in generating semantic\n and cross-language clones","summary":" Semantic and Cross-language code clone generation may be useful for code\nreuse, code comprehension, refactoring and benchmarking. OpenAI's GPT model has\npotential in such clone generation as GPT is used for text generation. When\ndevelopers copy/paste codes from Stack Overflow (SO) or within a system, there\nmight be inconsistent changes leading to unexpected behaviours. Similarly, if\nsomeone possesses a code snippet in a particular programming language but seeks\nequivalent functionality in a different language, a semantic cross-language\ncode clone generation approach could provide valuable assistance.In this study,\nusing SemanticCloneBench as a vehicle, we evaluated how well the GPT-3 model\ncould help generate semantic and cross-language clone variants for a given\nfragment.We have comprised a diverse set of code fragments and assessed GPT-3s\nperformance in generating code variants.Through extensive experimentation and\nanalysis, where 9 judges spent 158 hours to validate, we investigate the\nmodel's ability to produce accurate and semantically correct variants. Our\nfindings shed light on GPT-3's strengths in code generation, offering insights\ninto the potential applications and challenges of using advanced language\nmodels in software development. 
Our quantitative analysis yields compelling\nresults. In the realm of semantic clones, GPT-3 attains an impressive accuracy\nof 62.14% and 0.55 BLEU score, achieved through few-shot prompt engineering.\nFurthermore, the model shines in transcending linguistic confines, boasting an\nexceptional 91.25% accuracy in generating cross-language clones\n","authors":["Palash R. Roy","Ajmain I. Alam","Farouq Al-omari","Banani Roy","Chanchal K. Roy","Kevin A. Schneider"],"pdf_url":"https://arxiv.org/pdf/2309.06424v1.pdf","comment":"Accepted in IWSC"},{"id":"http://arxiv.org/abs/2309.06413v1","updated":"2023-09-12T17:25:32Z","published":"2023-09-12T17:25:32Z","title":"On Computationally Efficient Learning of Exponential Family\n Distributions","summary":" We consider the classical problem of learning, with arbitrary accuracy, the\nnatural parameters of a $k$-parameter truncated \\textit{minimal} exponential\nfamily from i.i.d. samples in a computationally and statistically efficient\nmanner. We focus on the setting where the support as well as the natural\nparameters are appropriately bounded. While the traditional maximum likelihood\nestimator for this class of exponential family is consistent, asymptotically\nnormal, and asymptotically efficient, evaluating it is computationally hard. In\nthis work, we propose a novel loss function and a computationally efficient\nestimator that is consistent as well as asymptotically normal under mild\nconditions. We show that, at the population level, our method can be viewed as\nthe maximum likelihood estimation of a re-parameterized distribution belonging\nto the same class of exponential family. Further, we show that our estimator\ncan be interpreted as a solution to minimizing a particular Bregman score as\nwell as an instance of minimizing the \\textit{surrogate} likelihood. We also\nprovide finite sample guarantees to achieve an error (in $\\ell_2$-norm) of\n$\\alpha$ in the parameter estimation with sample complexity $O({\\sf\npoly}(k)/\\alpha^2)$. Our method achives the order-optimal sample complexity of\n$O({\\sf log}(k)/\\alpha^2)$ when tailored for node-wise-sparse Markov random\nfields. Finally, we demonstrate the performance of our estimator via numerical\nexperiments.\n","authors":["Abhin Shah","Devavrat Shah","Gregory W. Wornell"],"pdf_url":"https://arxiv.org/pdf/2309.06413v1.pdf","comment":"An earlier version of this work arXiv:2110.15397 was presented at the\n Neural Information Processing Systems Conference in December 2021 titled \"A\n Computationally Efficient Method for Learning Exponential Family\n Distributions\""},{"id":"http://arxiv.org/abs/2203.01881v5","updated":"2023-09-12T16:57:06Z","published":"2022-03-03T17:48:23Z","title":"Measuring Self-Supervised Representation Quality for Downstream\n Classification using Discriminative Features","summary":" Self-supervised learning (SSL) has shown impressive results in downstream\nclassification tasks. However, there is limited work in understanding their\nfailure modes and interpreting their learned representations. In this paper, we\nstudy the representation space of state-of-the-art self-supervised models\nincluding SimCLR, SwaV, MoCo, BYOL, DINO, SimSiam, VICReg and Barlow Twins.\nWithout the use of class label information, we discover discriminative features\nthat correspond to unique physical attributes in images, present mostly in\ncorrectly-classified representations. 
Using these features, we can compress the\nrepresentation space by up to 40% without significantly affecting linear\nclassification performance. We then propose Self-Supervised Representation\nQuality Score (or Q-Score), an unsupervised score that can reliably predict if\na given sample is likely to be mis-classified during linear evaluation,\nachieving AUPRC of 91.45 on ImageNet-100 and 78.78 on ImageNet-1K. Q-Score can\nalso be used as a regularization term on pre-trained encoders to remedy\nlow-quality representations. Fine-tuning with Q-Score regularization can boost\nthe linear probing accuracy of SSL models by up to 5.8% on ImageNet-100 and\n3.7% on ImageNet-1K compared to their baselines. Finally, using gradient\nheatmaps and Salient ImageNet masks, we define a metric to quantify the\ninterpretability of each representation. We show that discriminative features\nare strongly correlated to core attributes and, enhancing these features\nthrough Q-score regularization makes SSL representations more interpretable.\n","authors":["Neha Kalibhat","Kanika Narang","Hamed Firooz","Maziar Sanjabi","Soheil Feizi"],"pdf_url":"https://arxiv.org/pdf/2203.01881v5.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.06382v1","updated":"2023-09-12T16:48:00Z","published":"2023-09-12T16:48:00Z","title":"Ensemble Mask Networks","summary":" Can an $\\mathbb{R}^n\\rightarrow \\mathbb{R}^n$ feedforward network learn\nmatrix-vector multiplication? This study introduces two mechanisms - flexible\nmasking to take matrix inputs, and a unique network pruning to respect the\nmask's dependency structure. Networks can approximate fixed operations such as\nmatrix-vector multiplication $\\phi(A,x) \\rightarrow Ax$, motivating the\nmechanisms introduced with applications towards litmus-testing dependencies or\ninteraction order in graph-based models.\n","authors":["Jonny Luntzel"],"pdf_url":"https://arxiv.org/pdf/2309.06382v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.06380v1","updated":"2023-09-12T16:42:09Z","published":"2023-09-12T16:42:09Z","title":"InstaFlow: One Step is Enough for High-Quality Diffusion-Based\n Text-to-Image Generation","summary":" Diffusion models have revolutionized text-to-image generation with its\nexceptional quality and creativity. However, its multi-step sampling process is\nknown to be slow, often requiring tens of inference steps to obtain\nsatisfactory results. Previous attempts to improve its sampling speed and\nreduce computational costs through distillation have been unsuccessful in\nachieving a functional one-step model. In this paper, we explore a recent\nmethod called Rectified Flow, which, thus far, has only been applied to small\ndatasets. The core of Rectified Flow lies in its \\emph{reflow} procedure, which\nstraightens the trajectories of probability flows, refines the coupling between\nnoises and images, and facilitates the distillation process with student\nmodels. We propose a novel text-conditioned pipeline to turn Stable Diffusion\n(SD) into an ultra-fast one-step model, in which we find reflow plays a\ncritical role in improving the assignment between noise and images. Leveraging\nour new pipeline, we create, to the best of our knowledge, the first one-step\ndiffusion-based text-to-image generator with SD-level image quality, achieving\nan FID (Frechet Inception Distance) of $23.3$ on MS COCO 2017-5k, surpassing\nthe previous state-of-the-art technique, progressive distillation, by a\nsignificant margin ($37.2$ $\\rightarrow$ $23.3$ in FID). 
By utilizing an\nexpanded network with 1.7B parameters, we further improve the FID to $22.4$. We\ncall our one-step models \\emph{InstaFlow}. On MS COCO 2014-30k, InstaFlow\nyields an FID of $13.1$ in just $0.09$ second, the best in $\\leq 0.1$ second\nregime, outperforming the recent StyleGAN-T ($13.9$ in $0.1$ second). Notably,\nthe training of InstaFlow only costs 199 A100 GPU days. Project\npage:~\\url{https://github.com/gnobitab/InstaFlow}.\n","authors":["Xingchao Liu","Xiwen Zhang","Jianzhu Ma","Jian Peng","Qiang Liu"],"pdf_url":"https://arxiv.org/pdf/2309.06380v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2210.08494v2","updated":"2023-09-12T16:41:48Z","published":"2022-10-16T09:41:23Z","title":"Brand New K-FACs: Speeding up K-FAC with Online Decomposition Updates","summary":" K-FAC (arXiv:1503.05671, arXiv:1602.01407) is a tractable implementation of\nNatural Gradient (NG) for Deep Learning (DL), whose bottleneck is computing the\ninverses of the so-called ``Kronecker-Factors'' (K-factors). RS-KFAC\n(arXiv:2206.15397) is a K-FAC improvement which provides a cheap way of\nestimating the K-factors inverses.\n In this paper, we exploit the exponential-average construction paradigm of\nthe K-factors, and use online numerical linear algebra techniques to propose an\neven cheaper (but less accurate) way of estimating the K-factors inverses. In\nparticular, we propose a K-factor inverse update which scales linearly in layer\nsize. We also propose an inverse application procedure which scales linearly as\nwell (the one of K-FAC scales cubically and the one of RS-KFAC scales\nquadratically). Overall, our proposed algorithm gives an approximate K-FAC\nimplementation whose preconditioning part scales linearly in layer size\n(compare to cubic for K-FAC and quadratic for RS-KFAC). Importantly however,\nthis update is only applicable in some circumstances (typically for all FC\nlayers), unlike the RS-KFAC approach (arXiv:2206.15397).\n Numerical results show RS-KFAC's inversion error can be reduced with minimal\nCPU overhead by adding our proposed update to it. Based on the proposed\nprocedure, a correction to it, and RS-KFAC, we propose three practical\nalgorithms for optimizing generic Deep Neural Nets. Numerical results show that\ntwo of these outperform RS-KFAC for any target test accuracy on CIFAR10\nclassification with a slightly modified version of VGG16_bn. Our proposed\nalgorithms achieve 91$\\%$ test accuracy faster than SENG (the state of art\nimplementation of empirical NG for DL; arXiv:2006.05924) but underperform it\nfor higher test-accuracy.\n","authors":["Constantin Octavian Puiu"],"pdf_url":"https://arxiv.org/pdf/2210.08494v2.pdf","comment":"Version 2 (new numerical experiments coming soon, in V3)"},{"id":"http://arxiv.org/abs/2304.11277v2","updated":"2023-09-12T16:28:00Z","published":"2023-04-21T23:52:27Z","title":"PyTorch FSDP: Experiences on Scaling Fully Sharded Data Parallel","summary":" It is widely acknowledged that large models have the potential to deliver\nsuperior performance across a broad range of domains. Despite the remarkable\nprogress made in the field of machine learning systems research, which has\nenabled the development and exploration of large models, such abilities remain\nconfined to a small group of advanced users and industry leaders, resulting in\nan implicit technical barrier for the wider community to access and leverage\nthese technologies. 
In this paper, we introduce PyTorch Fully Sharded Data\nParallel (FSDP) as an industry-grade solution for large model training. FSDP\nhas been closely co-designed with several key PyTorch core components including\nTensor implementation, dispatcher system, and CUDA memory caching allocator, to\nprovide non-intrusive user experiences and high training efficiency.\nAdditionally, FSDP natively incorporates a range of techniques and settings to\noptimize resource utilization across a variety of hardware configurations. The\nexperimental results demonstrate that FSDP is capable of achieving comparable\nperformance to Distributed Data Parallel while providing support for\nsignificantly larger models with near-linear scalability in terms of TFLOPS.\n","authors":["Yanli Zhao","Andrew Gu","Rohan Varma","Liang Luo","Chien-Chin Huang","Min Xu","Less Wright","Hamid Shojanazeri","Myle Ott","Sam Shleifer","Alban Desmaison","Can Balioglu","Pritam Damania","Bernard Nguyen","Geeta Chauhan","Yuchen Hao","Ajit Mathews","Shen Li"],"pdf_url":"https://arxiv.org/pdf/2304.11277v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2009.01745v3","updated":"2023-09-12T16:23:00Z","published":"2020-09-03T15:20:21Z","title":"GTAdam: Gradient Tracking with Adaptive Momentum for Distributed Online\n Optimization","summary":" This paper deals with a network of computing agents aiming to solve an online\noptimization problem in a distributed fashion, i.e., by means of local\ncomputation and communication, without any central coordinator. We propose the\ngradient tracking with adaptive momentum estimation (GTAdam) distributed\nalgorithm, which combines a gradient tracking mechanism with first and second\norder momentum estimates of the gradient. The algorithm is analyzed in the\nonline setting for strongly convex cost functions with Lipschitz continuous\ngradients. We provide an upper bound for the dynamic regret given by a term\nrelated to the initial conditions and another term related to the temporal\nvariations of the objective functions. Moreover, a linear convergence rate is\nguaranteed in the static setup. The algorithm is tested on a time-varying\nclassification problem, on a (moving) target localization problem, and in a\nstochastic optimization setup from image classification. In these numerical\nexperiments from multi-agent learning, GTAdam outperforms state-of-the-art\ndistributed optimization methods.\n","authors":["Guido Carnevale","Francesco Farina","Ivano Notarnicola","Giuseppe Notarstefano"],"pdf_url":"https://arxiv.org/pdf/2009.01745v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.06359v1","updated":"2023-09-12T16:20:20Z","published":"2023-09-12T16:20:20Z","title":"Using Reed-Muller Codes for Classification with Rejection and Recovery","summary":" When deploying classifiers in the real world, users expect them to respond to\ninputs appropriately. However, traditional classifiers are not equipped to\nhandle inputs which lie far from the distribution they were trained on.\nMalicious actors can exploit this defect by making adversarial perturbations\ndesigned to cause the classifier to give an incorrect output.\nClassification-with-rejection methods attempt to solve this problem by allowing\nnetworks to refuse to classify an input in which they have low confidence. This\nworks well for strongly adversarial examples, but also leads to the rejection\nof weakly perturbed images, which intuitively could be correctly classified. 
To\naddress these issues, we propose Reed-Muller Aggregation Networks (RMAggNet), a\nclassifier inspired by Reed-Muller error-correction codes which can correct and\nreject inputs. This paper shows that RMAggNet can minimise incorrectness while\nmaintaining good correctness over multiple adversarial attacks at different\nperturbation budgets by leveraging the ability to correct errors in the\nclassification process. This provides an alternative\nclassification-with-rejection method which can reduce the amount of additional\nprocessing in situations where a small number of incorrect classifications are\npermissible.\n","authors":["Daniel Fentham","David Parker","Mark Ryan"],"pdf_url":"https://arxiv.org/pdf/2309.06359v1.pdf","comment":"38 pages, 7 figures"},{"id":"http://arxiv.org/abs/2302.01248v2","updated":"2023-09-12T16:20:15Z","published":"2023-02-02T17:29:10Z","title":"Robust Markov Decision Processes without Model Estimation","summary":" Robust Markov Decision Processes (MDPs) are receiving much attention in\nlearning a robust policy which is less sensitive to environment changes. There\nare an increasing number of works analyzing sample-efficiency of robust MDPs.\nHowever, there are two major barriers to applying robust MDPs in practice.\nFirst, most works study robust MDPs in a model-based regime, where the\ntransition probability needs to be estimated and requires a large amount of\nmemories $\\mathcal{O}(|\\mathcal{S}|^2|\\mathcal{A}|)$. Second, prior work\ntypically assumes a strong oracle to obtain the optimal solution as an\nintermediate step to solve robust MDPs. However, in practice, such an oracle\ndoes not exist usually. To remove the oracle, we transform the original robust\nMDPs into an alternative form, which allows us to use stochastic gradient\nmethods to solve the robust MDPs. Moreover, we prove the alternative form still\nplays a similar role as the original form. With this new formulation, we devise\na sample-efficient algorithm to solve the robust MDPs in a model-free regime,\nwhich does not require an oracle and trades off a lower storage requirement\n$\\mathcal{O}(|\\mathcal{S}||\\mathcal{A}|)$ with being able to generate samples\nfrom a generative model or Markovian chain. Finally, we validate our\ntheoretical findings via numerical experiments, showing the efficiency with the\nalternative form of robust MDPs.\n","authors":["Wenhao Yang","Han Wang","Tadashi Kozuno","Scott M. Jordan","Zhihua Zhang"],"pdf_url":"https://arxiv.org/pdf/2302.01248v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.06349v1","updated":"2023-09-12T16:15:33Z","published":"2023-09-12T16:15:33Z","title":"Generalized Regret Analysis of Thompson Sampling using Fractional\n Posteriors","summary":" Thompson sampling (TS) is one of the most popular and earliest algorithms to\nsolve stochastic multi-armed bandit problems. We consider a variant of TS,\nnamed $\\alpha$-TS, where we use a fractional or $\\alpha$-posterior\n($\\alpha\\in(0,1)$) instead of the standard posterior distribution. To compute\nan $\\alpha$-posterior, the likelihood in the definition of the standard\nposterior is tempered with a factor $\\alpha$. 
For $\\alpha$-TS we obtain both\ninstance-dependent $\\mathcal{O}\\left(\\sum_{k \\neq i^*}\n\\Delta_k\\left(\\frac{\\log(T)}{C(\\alpha)\\Delta_k^2} + \\frac{1}{2} \\right)\\right)$\nand instance-independent $\\mathcal{O}(\\sqrt{KT\\log K})$ frequentist regret\nbounds under very mild conditions on the prior and reward distributions, where\n$\\Delta_k$ is the gap between the true mean rewards of the $k^{th}$ and the\nbest arms, and $C(\\alpha)$ is a known constant. Both the sub-Gaussian and\nexponential family models satisfy our general conditions on the reward\ndistribution. Our conditions on the prior distribution just require its density\nto be positive, continuous, and bounded. We also establish another\ninstance-dependent regret upper bound that matches (up to constants) to that of\nimproved UCB [Auer and Ortner, 2010]. Our regret analysis carefully combines\nrecent theoretical developments in the non-asymptotic concentration analysis\nand Bernstein-von Mises type results for the $\\alpha$-posterior distribution.\nMoreover, our analysis does not require additional structural properties such\nas closed-form posteriors or conjugate priors.\n","authors":["Prateek Jaiswal","Debdeep Pati","Anirban Bhattacharya","Bani K. Mallick"],"pdf_url":"https://arxiv.org/pdf/2309.06349v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2303.09063v2","updated":"2023-09-12T16:14:03Z","published":"2023-03-16T03:43:10Z","title":"Plant Disease Detection using Region-Based Convolutional Neural Network","summary":" Agriculture plays an important role in the food and economy of Bangladesh.\nThe rapid growth of population over the years also has increased the demand for\nfood production. One of the major reasons behind low crop production is\nnumerous bacteria, virus and fungal plant diseases. Early detection of plant\ndiseases and proper usage of pesticides and fertilizers are vital for\npreventing the diseases and boost the yield. Most of the farmers use\ngeneralized pesticides and fertilizers in the entire fields without\nspecifically knowing the condition of the plants. Thus the production cost\noftentimes increases, and, not only that, sometimes this becomes detrimental to\nthe yield. Deep Learning models are found to be very effective to automatically\ndetect plant diseases from images of plants, thereby reducing the need for\nhuman specialists. This paper aims at building a lightweight deep learning\nmodel for predicting leaf disease in tomato plants. By modifying the\nregion-based convolutional neural network, we design an efficient and effective\nmodel that demonstrates satisfactory empirical performance on a benchmark\ndataset. Our proposed model can easily be deployed in a larger system where\ndrones take images of leaves and these images will be fed into our model to\nknow the health condition.\n","authors":["Hasin Rehana","Muhammad Ibrahim","Md. Haider Ali"],"pdf_url":"https://arxiv.org/pdf/2303.09063v2.pdf","comment":"23 pages"},{"id":"http://arxiv.org/abs/2309.06348v1","updated":"2023-09-12T16:13:10Z","published":"2023-09-12T16:13:10Z","title":"Band-gap regression with architecture-optimized message-passing neural\n networks","summary":" Graph-based neural networks and, specifically, message-passing neural\nnetworks (MPNNs) have shown great potential in predicting physical properties\nof solids. In this work, we train an MPNN to first classify materials through\ndensity functional theory data from the AFLOW database as being metallic or\nsemiconducting/insulating. 
We then perform a neural-architecture search to\nexplore the model architecture and hyperparameter space of MPNNs to predict the\nband gaps of the materials identified as non-metals. The parameters in the\nsearch include the number of message-passing steps, latent size, and\nactivation-function, among others. The top-performing models from the search\nare pooled into an ensemble that significantly outperforms existing models from\nthe literature. Uncertainty quantification is evaluated with Monte-Carlo\nDropout and ensembling, with the ensemble method proving superior. The domain\nof applicability of the ensemble model is analyzed with respect to the crystal\nsystems, the inclusion of a Hubbard parameter in the density functional\ncalculations, and the atomic species building up the materials.\n","authors":["Tim Bechtel","Daniel T. Speckhard","Jonathan Godwin","Claudia Draxl"],"pdf_url":"https://arxiv.org/pdf/2309.06348v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2303.05718v2","updated":"2023-09-12T16:12:14Z","published":"2023-03-10T05:50:17Z","title":"Tradeoff of generalization error in unsupervised learning","summary":" Finding the optimal model complexity that minimizes the generalization error\n(GE) is a key issue of machine learning. For the conventional supervised\nlearning, this task typically involves the bias-variance tradeoff: lowering the\nbias by making the model more complex entails an increase in the variance.\nMeanwhile, little has been studied about whether the same tradeoff exists for\nunsupervised learning. In this study, we propose that unsupervised learning\ngenerally exhibits a two-component tradeoff of the GE, namely the model error\nand the data error -- using a more complex model reduces the model error at the\ncost of the data error, with the data error playing a more significant role for\na smaller training dataset. This is corroborated by training the restricted\nBoltzmann machine to generate the configurations of the two-dimensional Ising\nmodel at a given temperature and the totally asymmetric simple exclusion\nprocess with given entry and exit rates. Our results also indicate that the\noptimal model tends to be more complex when the data to be learned are more\ncomplex.\n","authors":["Gilhan Kim","Hojun Lee","Junghyo Jo","Yongjoo Baek"],"pdf_url":"https://arxiv.org/pdf/2303.05718v2.pdf","comment":"15 pages, 7 figures"},{"id":"http://arxiv.org/abs/2307.00398v2","updated":"2023-09-12T15:46:23Z","published":"2023-07-01T18:16:06Z","title":"ProbVLM: Probabilistic Adapter for Frozen Vison-Language Models","summary":" Large-scale vision-language models (VLMs) like CLIP successfully find\ncorrespondences between images and text. Through the standard deterministic\nmapping process, an image or a text sample is mapped to a single vector in the\nembedding space. This is problematic: as multiple samples (images or text) can\nabstract the same concept in the physical world, deterministic embeddings do\nnot reflect the inherent ambiguity in the embedding space. We propose ProbVLM,\na probabilistic adapter that estimates probability distributions for the\nembeddings of pre-trained VLMs via inter/intra-modal alignment in a post-hoc\nmanner without needing large-scale datasets or computing. 
On four challenging\ndatasets, i.e., COCO, Flickr, CUB, and Oxford-flowers, we estimate the\nmulti-modal embedding uncertainties for two VLMs, i.e., CLIP and BLIP, quantify\nthe calibration of embedding uncertainties in retrieval tasks and show that\nProbVLM outperforms other methods. Furthermore, we propose active learning and\nmodel selection as two real-world downstream tasks for VLMs and show that the\nestimated uncertainty aids both tasks. Lastly, we present a novel technique for\nvisualizing the embedding distributions using a large-scale pre-trained latent\ndiffusion model. Code is available at https://github.com/ExplainableML/ProbVLM.\n","authors":["Uddeshya Upadhyay","Shyamgopal Karthik","Massimiliano Mancini","Zeynep Akata"],"pdf_url":"https://arxiv.org/pdf/2307.00398v2.pdf","comment":"ICCV 2023"},{"id":"http://arxiv.org/abs/2106.03725v4","updated":"2023-09-12T15:45:20Z","published":"2021-06-07T15:41:03Z","title":"Stability to Deformations of Manifold Filters and Manifold Neural\n Networks","summary":" The paper defines and studies manifold (M) convolutional filters and neural\nnetworks (NNs). \\emph{Manifold} filters and MNNs are defined in terms of the\nLaplace-Beltrami operator exponential and are such that \\emph{graph} (G)\nfilters and neural networks (NNs) are recovered as discrete approximations when\nthe manifold is sampled. These filters admit a spectral representation which is\na generalization of both the spectral representation of graph filters and the\nfrequency response of standard convolutional filters in continuous time. The\nmain technical contribution of the paper is to analyze the stability of\nmanifold filters and MNNs to smooth deformations of the manifold. This analysis\ngeneralizes known stability properties of graph filters and GNNs and it is also\na generalization of known stability properties of standard convolutional\nfilters and neural networks in continuous time. The most important observation\nthat follows from this analysis is that manifold filters, same as graph filters\nand standard continuous time filters, have difficulty discriminating high\nfrequency components in the presence of deformations. This is a challenge that\ncan be ameliorated with the use of manifold, graph, or continuous time neural\nnetworks. The most important practical consequence of this analysis is to shed\nlight on the behavior of graph filters and GNNs in large scale graphs.\n","authors":["Zhiyang Wang","Luana Ruiz","Alejandro Ribeiro"],"pdf_url":"https://arxiv.org/pdf/2106.03725v4.pdf","comment":"19 pages; 6 figures"},{"id":"http://arxiv.org/abs/2303.05445v2","updated":"2023-09-12T15:30:42Z","published":"2023-03-09T17:44:58Z","title":"Flooding with Absorption: An Efficient Protocol for Heterogeneous\n Bandits over Complex Networks","summary":" Multi-armed bandits are extensively used to model sequential decision-making,\nmaking them ubiquitous in many real-life applications such as online\nrecommender systems and wireless networking. We consider a multi-agent setting\nwhere each agent solves their own bandit instance endowed with a different set\nof arms. Their goal is to minimize their group regret while collaborating via\nsome communication protocol over a given network. Previous literature on this\nproblem only considered arm heterogeneity and networked agents separately. In\nthis work, we introduce a setting that encompasses both features. 
For this\nnovel setting, we first provide a rigorous regret analysis for a standard\nflooding protocol combined with the classic UCB policy. Then, to mitigate the\nissue of high communication costs incurred by flooding in complex networks, we\npropose a new protocol called Flooding with Absorption (FwA). We provide a\ntheoretical analysis of the resulting regret bound and discuss the advantages\nof using FwA over flooding. Lastly, we experimentally verify on various\nscenarios, including dynamic networks, that FwA leads to significantly lower\ncommunication costs despite minimal regret performance loss compared to other\nnetwork protocols.\n","authors":["Junghyun Lee","Laura Schmid","Se-Young Yun"],"pdf_url":"https://arxiv.org/pdf/2303.05445v2.pdf","comment":"26 pages, 7 figures; second revision"},{"id":"http://arxiv.org/abs/2309.06315v1","updated":"2023-09-12T15:27:00Z","published":"2023-09-12T15:27:00Z","title":"Learning Minimalistic Tsetlin Machine Clauses with Markov\n Boundary-Guided Pruning","summary":" A set of variables is the Markov blanket of a random variable if it contains\nall the information needed for predicting the variable. If the blanket cannot\nbe reduced without losing useful information, it is called a Markov boundary.\nIdentifying the Markov boundary of a random variable is advantageous because\nall variables outside the boundary are superfluous. Hence, the Markov boundary\nprovides an optimal feature set. However, learning the Markov boundary from\ndata is challenging for two reasons. If one or more variables are removed from\nthe Markov boundary, variables outside the boundary may start providing\ninformation. Conversely, variables within the boundary may stop providing\ninformation. The true role of each candidate variable is only manifesting when\nthe Markov boundary has been identified. In this paper, we propose a new\nTsetlin Machine (TM) feedback scheme that supplements Type I and Type II\nfeedback. The scheme introduces a novel Finite State Automaton - a\nContext-Specific Independence Automaton. The automaton learns which features\nare outside the Markov boundary of the target, allowing them to be pruned from\nthe TM during learning. We investigate the new scheme empirically, showing how\nit is capable of exploiting context-specific independence to find Markov\nboundaries. Further, we provide a theoretical analysis of convergence. Our\napproach thus connects the field of Bayesian networks (BN) with TMs,\npotentially opening up for synergies when it comes to inference and learning,\nincluding TM-produced Bayesian knowledge bases and TM-based Bayesian inference.\n","authors":["Ole-Christoffer Granmo","Per-Arne Andersen","Lei Jiao","Xuan Zhang","Christian Blakely","Tor Tveit"],"pdf_url":"https://arxiv.org/pdf/2309.06315v1.pdf","comment":"Accepted to ISTM2023, 8 pages, 6 figures"},{"id":"http://arxiv.org/abs/2309.06313v1","updated":"2023-09-12T15:24:26Z","published":"2023-09-12T15:24:26Z","title":"Semantic and Articulated Pedestrian Sensing Onboard a Moving Vehicle","summary":" It is difficult to perform 3D reconstruction from on-vehicle gathered video\ndue to the large forward motion of the vehicle. Even object detection and human\nsensing models perform significantly worse on onboard videos when compared to\nstandard benchmarks because objects often appear far away from the camera\ncompared to the standard object detection benchmarks, image quality is often\ndecreased by motion blur and occlusions occur often. 
This has led to the\npopularisation of traffic data-specific benchmarks. Recently Light Detection\nAnd Ranging (LiDAR) sensors have become popular to directly estimate depths\nwithout the need to perform 3D reconstructions. However, LiDAR-based methods\nstill lack in articulated human detection at a distance when compared to\nimage-based methods. We hypothesize that benchmarks targeted at articulated\nhuman sensing from LiDAR data could bring about increased research in human\nsensing and prediction in traffic and could lead to improved traffic safety for\npedestrians.\n","authors":["Maria Priisalu"],"pdf_url":"https://arxiv.org/pdf/2309.06313v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.04296v2","updated":"2023-09-12T15:18:12Z","published":"2023-09-08T12:36:49Z","title":"Navigating Out-of-Distribution Electricity Load Forecasting during\n COVID-19: A Continual Learning Approach Leveraging Human Mobility","summary":" In traditional deep learning algorithms, one of the key assumptions is that\nthe data distribution remains constant during both training and deployment.\nHowever, this assumption becomes problematic when faced with\nOut-of-Distribution periods, such as the COVID-19 lockdowns, where the data\ndistribution significantly deviates from what the model has seen during\ntraining. This paper employs a two-fold strategy: utilizing continual learning\ntechniques to update models with new data and harnessing human mobility data\ncollected from privacy-preserving pedestrian counters located outside\nbuildings. In contrast to online learning, which suffers from 'catastrophic\nforgetting' as newly acquired knowledge often erases prior information,\ncontinual learning offers a holistic approach by preserving past insights while\nintegrating new data. This research applies FSNet, a powerful continual\nlearning algorithm, to real-world data from 13 building complexes in Melbourne,\nAustralia, a city which had the second longest total lockdown duration globally\nduring the pandemic. Results underscore the crucial role of continual learning\nin accurate energy forecasting, particularly during Out-of-Distribution\nperiods. Secondary data such as mobility and temperature provided ancillary\nsupport to the primary forecasting model. More importantly, while traditional\nmethods struggled to adapt during lockdowns, models featuring at least online\nlearning demonstrated resilience, with lockdown periods posing fewer challenges\nonce armed with adaptive learning techniques. This study contributes valuable\nmethodologies and insights to the ongoing effort to improve energy load\nforecasting during future Out-of-Distribution periods.\n","authors":["Arian Prabowo","Kaixuan Chen","Hao Xue","Subbu Sethuvenkatraman","Flora D. Salim"],"pdf_url":"https://arxiv.org/pdf/2309.04296v2.pdf","comment":"10 pages, 2 figures, 5 tables, BuildSys '23"},{"id":"http://arxiv.org/abs/2306.04054v2","updated":"2023-09-12T15:11:10Z","published":"2023-06-06T23:04:22Z","title":"RescueSpeech: A German Corpus for Speech Recognition in Search and\n Rescue Domain","summary":" Despite the recent advancements in speech recognition, there are still\ndifficulties in accurately transcribing conversational and emotional speech in\nnoisy and reverberant acoustic environments. This poses a particular challenge\nin the search and rescue (SAR) domain, where transcribing conversations among\nrescue team members is crucial to support real-time decision-making. 
The\nscarcity of speech data and associated background noise in SAR scenarios make\nit difficult to deploy robust speech recognition systems. To address this\nissue, we have created and made publicly available a German speech dataset\ncalled RescueSpeech. This dataset includes real speech recordings from\nsimulated rescue exercises. Additionally, we have released competitive training\nrecipes and pre-trained models. Our study highlights that the performance\nattained by state-of-the-art methods in this challenging scenario is still far\nfrom reaching an acceptable level.\n","authors":["Sangeet Sagar","Mirco Ravanelli","Bernd Kiefer","Ivana Kruijff Korbayova","Josef van Genabith"],"pdf_url":"https://arxiv.org/pdf/2306.04054v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2212.07919v2","updated":"2023-09-12T15:08:46Z","published":"2022-12-15T15:52:39Z","title":"ROSCOE: A Suite of Metrics for Scoring Step-by-Step Reasoning","summary":" Large language models show improved downstream task performance when prompted\nto generate step-by-step reasoning to justify their final answers. These\nreasoning steps greatly improve model interpretability and verification, but\nobjectively studying their correctness (independent of the final answer) is\ndifficult without reliable methods for automatic evaluation. We simply do not\nknow how often the stated reasoning steps actually support the final end task\npredictions. In this work, we present ROSCOE, a suite of interpretable,\nunsupervised automatic scores that improve and extend previous text generation\nevaluation metrics. To evaluate ROSCOE against baseline metrics, we design a\ntypology of reasoning errors and collect synthetic and human evaluation scores\non commonly used reasoning datasets. In contrast with existing metrics, ROSCOE\ncan measure semantic consistency, logicality, informativeness, fluency, and\nfactuality - among other traits - by leveraging properties of step-by-step\nrationales. We empirically verify the strength of our metrics on five human\nannotated and six programmatically perturbed diagnostics datasets - covering a\ndiverse set of tasks that require reasoning skills and show that ROSCOE can\nconsistently outperform baseline metrics.\n","authors":["Olga Golovneva","Moya Chen","Spencer Poff","Martin Corredor","Luke Zettlemoyer","Maryam Fazel-Zarandi","Asli Celikyilmaz"],"pdf_url":"https://arxiv.org/pdf/2212.07919v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.06299v1","updated":"2023-09-12T15:05:11Z","published":"2023-09-12T15:05:11Z","title":"Modeling Supply and Demand in Public Transportation Systems","summary":" The Harrisonburg Department of Public Transportation (HDPT) aims to leverage\ntheir data to improve the efficiency and effectiveness of their operations. We\nconstruct two supply and demand models that help the department identify gaps\nin their service. The models take many variables into account, including the\nway that the HDPT reports to the federal government and the areas with the most\nvulnerable populations in Harrisonburg City. 
We employ data analysis and\nmachine learning techniques to make our predictions.\n","authors":["Miranda Bihler","Hala Nelson","Erin Okey","Noe Reyes Rivas","John Webb","Anna White"],"pdf_url":"https://arxiv.org/pdf/2309.06299v1.pdf","comment":"28 pages, 2022 REU project at James Madison University"},{"id":"http://arxiv.org/abs/2308.11890v2","updated":"2023-09-12T15:02:43Z","published":"2023-08-23T03:23:07Z","title":"Shape-conditioned 3D Molecule Generation via Equivariant Diffusion\n Models","summary":" Ligand-based drug design aims to identify novel drug candidates of similar\nshapes with known active molecules. In this paper, we formulated an in silico\nshape-conditioned molecule generation problem to generate 3D molecule\nstructures conditioned on the shape of a given molecule. To address this\nproblem, we developed a translation- and rotation-equivariant shape-guided\ngenerative model ShapeMol. ShapeMol consists of an equivariant shape encoder\nthat maps molecular surface shapes into latent embeddings, and an equivariant\ndiffusion model that generates 3D molecules based on these embeddings.\nExperimental results show that ShapeMol can generate novel, diverse, drug-like\nmolecules that retain 3D molecular shapes similar to the given shape condition.\nThese results demonstrate the potential of ShapeMol in designing drug\ncandidates of desired 3D shapes binding to protein target pockets.\n","authors":["Ziqi Chen","Bo Peng","Srinivasan Parthasarathy","Xia Ning"],"pdf_url":"https://arxiv.org/pdf/2308.11890v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2203.02928v4","updated":"2023-09-12T15:00:10Z","published":"2022-03-06T10:14:09Z","title":"Fidelity of Interpretability Methods and Perturbation Artifacts in\n Neural Networks","summary":" Despite excellent performance of deep neural networks (DNNs) in image\nclassification, detection, and prediction, characterizing how DNNs make a given\ndecision remains an open problem, resulting in a number of interpretability\nmethods. Post-hoc interpretability methods primarily aim to quantify the\nimportance of input features with respect to the class probabilities. However,\ndue to the lack of ground truth and the existence of interpretability methods\nwith diverse operating characteristics, evaluating these methods is a crucial\nchallenge. A popular approach to evaluate interpretability methods is to\nperturb input features deemed important for a given prediction and observe the\ndecrease in accuracy. However, perturbation itself may introduce artifacts. We\npropose a method for estimating the impact of such artifacts on the fidelity\nestimation by utilizing model accuracy curves from perturbing input features\naccording to the Most Import First (MIF) and Least Import First (LIF) orders.\nUsing the ResNet-50 trained on the ImageNet, we demonstrate the proposed\nfidelity estimation of four popular post-hoc interpretability methods.\n","authors":["Lennart Brocki","Neo Christopher Chung"],"pdf_url":"https://arxiv.org/pdf/2203.02928v4.pdf","comment":"11 pages, 5 figures"},{"id":"http://arxiv.org/abs/2106.02466v3","updated":"2023-09-12T14:53:38Z","published":"2021-06-04T13:10:51Z","title":"Graph Barlow Twins: A self-supervised representation learning framework\n for graphs","summary":" The self-supervised learning (SSL) paradigm is an essential exploration area,\nwhich tries to eliminate the need for expensive data labeling. 
Despite the\ngreat success of SSL methods in computer vision and natural language\nprocessing, most of them employ contrastive learning objectives that require\nnegative samples, which are hard to define. This becomes even more challenging\nin the case of graphs and is a bottleneck for achieving robust representations.\nTo overcome such limitations, we propose a framework for self-supervised graph\nrepresentation learning - Graph Barlow Twins, which utilizes a\ncross-correlation-based loss function instead of negative samples. Moreover, it\ndoes not rely on non-symmetric neural network architectures - in contrast to\nstate-of-the-art self-supervised graph representation learning method BGRL. We\nshow that our method achieves as competitive results as the best\nself-supervised methods and fully supervised ones while requiring fewer\nhyperparameters and substantially shorter computation time (ca. 30 times faster\nthan BGRL).\n","authors":["Piotr Bielak","Tomasz Kajdanowicz","Nitesh V. Chawla"],"pdf_url":"https://arxiv.org/pdf/2106.02466v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.06286v1","updated":"2023-09-12T14:46:56Z","published":"2023-09-12T14:46:56Z","title":"Transferability analysis of data-driven additive manufacturing\n knowledge: a case study between powder bed fusion and directed energy\n deposition","summary":" Data-driven research in Additive Manufacturing (AM) has gained significant\nsuccess in recent years. This has led to a plethora of scientific literature to\nemerge. The knowledge in these works consists of AM and Artificial Intelligence\n(AI) contexts that have not been mined and formalized in an integrated way.\nMoreover, no tools or guidelines exist to support data-driven knowledge\ntransfer from one context to another. As a result, data-driven solutions using\nspecific AI techniques are being developed and validated only for specific AM\nprocess technologies. There is a potential to exploit the inherent similarities\nacross various AM technologies and adapt the existing solutions from one\nprocess or problem to another using AI, such as Transfer Learning. We propose a\nthree-step knowledge transferability analysis framework in AM to support\ndata-driven AM knowledge transfer. As a prerequisite to transferability\nanalysis, AM knowledge is featurized into identified knowledge components. The\nframework consists of pre-transfer, transfer, and post-transfer steps to\naccomplish knowledge transfer. A case study is conducted between flagship metal\nAM processes. Laser Powder Bed Fusion (LPBF) is the source of knowledge\nmotivated by its relative matureness in applying AI over Directed Energy\nDeposition (DED), which drives the need for knowledge transfer as the less\nexplored target process. We show successful transfer at different levels of the\ndata-driven solution, including data representation, model architecture, and\nmodel parameters. The pipeline of AM knowledge transfer can be automated in the\nfuture to allow efficient cross-context or cross-process knowledge exchange.\n","authors":["Mutahar Safdar","Jiarui Xie","Hyunwoong Ko","Yan Lu","Guy Lamouche","Yaoyao Fiona Zhao"],"pdf_url":"https://arxiv.org/pdf/2309.06286v1.pdf","comment":"11 pages, 7 figures. 
This paper has been accepted to be published in\n the proceedings of IDETC-CIE 2023"},{"id":"http://arxiv.org/abs/2209.09423v2","updated":"2023-09-12T14:46:24Z","published":"2022-09-20T02:41:17Z","title":"Fairness and robustness in anti-causal prediction","summary":" Robustness to distribution shift and fairness have independently emerged as\ntwo important desiderata required of modern machine learning models. While\nthese two desiderata seem related, the connection between them is often unclear\nin practice. Here, we discuss these connections through a causal lens, focusing\non anti-causal prediction tasks, where the input to a classifier (e.g., an\nimage) is assumed to be generated as a function of the target label and the\nprotected attribute. By taking this perspective, we draw explicit connections\nbetween a common fairness criterion - separation - and a common notion of\nrobustness - risk invariance. These connections provide new motivation for\napplying the separation criterion in anticausal settings, and inform old\ndiscussions regarding fairness-performance tradeoffs. In addition, our findings\nsuggest that robustness-motivated approaches can be used to enforce separation,\nand that they often work better in practice than methods designed to directly\nenforce separation. Using a medical dataset, we empirically validate our\nfindings on the task of detecting pneumonia from X-rays, in a setting where\ndifferences in prevalence across sex groups motivates a fairness mitigation.\nOur findings highlight the importance of considering causal structure when\nchoosing and enforcing fairness criteria.\n","authors":["Maggie Makar","Alexander D'Amour"],"pdf_url":"https://arxiv.org/pdf/2209.09423v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.06274v1","updated":"2023-09-12T14:36:13Z","published":"2023-09-12T14:36:13Z","title":"ELRA: Exponential learning rate adaption gradient descent optimization\n method","summary":" We present a novel, fast (exponential rate adaption), ab initio\n(hyper-parameter-free) gradient based optimizer algorithm. The main idea of the\nmethod is to adapt the learning rate $\\alpha$ by situational awareness, mainly\nstriving for orthogonal neighboring gradients. The method has a high success\nand fast convergence rate and does not rely on hand-tuned parameters giving it\ngreater universality. It can be applied to problems of any dimensions n and\nscales only linearly (of order O(n)) with the dimension of the problem. It\noptimizes convex and non-convex continuous landscapes providing some kind of\ngradient. In contrast to the Ada-family (AdaGrad, AdaMax, AdaDelta, Adam, etc.)\nthe method is rotation invariant: optimization path and performance are\nindependent of coordinate choices. The impressive performance is demonstrated\nby extensive experiments on the MNIST benchmark data-set against\nstate-of-the-art optimizers. We name this new class of optimizers after its\ncore idea Exponential Learning Rate Adaption - ELRA. We present it in two\nvariants c2min and p2min with slightly different control. 
The authors strongly\nbelieve that ELRA will open a completely new research direction for gradient\ndescent optimize.\n","authors":["Alexander Kleinsorge","Stefan Kupper","Alexander Fauck","Felix Rothe"],"pdf_url":"https://arxiv.org/pdf/2309.06274v1.pdf","comment":"9 pages, 11 figures"},{"id":"http://arxiv.org/abs/2301.04404v3","updated":"2023-09-12T14:34:49Z","published":"2023-01-11T11:10:32Z","title":"A prediction and behavioural analysis of machine learning methods for\n modelling travel mode choice","summary":" The emergence of a variety of Machine Learning (ML) approaches for travel\nmode choice prediction poses an interesting question to transport modellers:\nwhich models should be used for which applications? The answer to this question\ngoes beyond simple predictive performance, and is instead a balance of many\nfactors, including behavioural interpretability and explainability,\ncomputational complexity, and data efficiency. There is a growing body of\nresearch which attempts to compare the predictive performance of different ML\nclassifiers with classical random utility models. However, existing studies\ntypically analyse only the disaggregate predictive performance, ignoring other\naspects affecting model choice. Furthermore, many studies are affected by\ntechnical limitations, such as the use of inappropriate validation schemes,\nincorrect sampling for hierarchical data, lack of external validation, and the\nexclusive use of discrete metrics. We address these limitations by conducting a\nsystematic comparison of different modelling approaches, across multiple\nmodelling problems, in terms of the key factors likely to affect model choice\n(out-of-sample predictive performance, accuracy of predicted market shares,\nextraction of behavioural indicators, and computational efficiency). We combine\nseveral real world datasets with synthetic datasets, where the data generation\nfunction is known. The results indicate that the models with the highest\ndisaggregate predictive performance (namely extreme gradient boosting and\nrandom forests) provide poorer estimates of behavioural indicators and\naggregate mode shares, and are more expensive to estimate, than other models,\nincluding deep neural networks and Multinomial Logit (MNL). It is further\nobserved that the MNL model performs robustly in a variety of situations,\nthough ML techniques can improve the estimates of behavioural indices such as\nWillingness to Pay.\n","authors":["José Ángel Martín-Baos","Julio Alberto López-Gómez","Luis Rodriguez-Benitez","Tim Hillel","Ricardo García-Ródenas"],"pdf_url":"https://arxiv.org/pdf/2301.04404v3.pdf","comment":"44 pages and 13 figures"},{"id":"http://arxiv.org/abs/2309.06268v1","updated":"2023-09-12T14:31:33Z","published":"2023-09-12T14:31:33Z","title":"ssVERDICT: Self-Supervised VERDICT-MRI for Enhanced Prostate Tumour\n Characterisation","summary":" MRI is increasingly being used in the diagnosis of prostate cancer (PCa),\nwith diffusion MRI (dMRI) playing an integral role. When combined with\ncomputational models, dMRI can estimate microstructural information such as\ncell size. Conventionally, such models are fit with a nonlinear least squares\n(NLLS) curve fitting approach, associated with a high computational cost.\nSupervised deep neural networks (DNNs) are an efficient alternative, however\ntheir performance is significantly affected by the underlying distribution of\nthe synthetic training data. 
Self-supervised learning is an attractive\nalternative, where instead of using a separate training dataset, the network\nlearns the features of the input data itself. This approach has only been\napplied to fitting of trivial dMRI models thus far. Here, we introduce a\nself-supervised DNN to estimate the parameters of the VERDICT (Vascular,\nExtracellular and Restricted DIffusion for Cytometry in Tumours) model for\nprostate. We demonstrate, for the first time, fitting of a complex\nthree-compartment biophysical model with machine learning without the\nrequirement of explicit training labels. We compare the estimation performance\nto baseline NLLS and supervised DNN methods, observing improvement in\nestimation accuracy and reduction in bias with respect to ground truth values.\nOur approach also achieves a higher confidence level for discrimination between\ncancerous and benign prostate tissue in comparison to the other methods on a\ndataset of 20 PCa patients, indicating potential for accurate tumour\ncharacterisation.\n","authors":["Snigdha Sen","Saurabh Singh","Hayley Pye","Caroline Moore","Hayley Whitaker","Shonit Punwani","David Atkinson","Eleftheria Panagiotaki","Paddy J. Slator"],"pdf_url":"https://arxiv.org/pdf/2309.06268v1.pdf","comment":"12 pages, 5 figures. This work has been submitted to the IEEE for\n possible publication. Copyright may be transferred without notice, after\n which this version may no longer be accessible"},{"id":"http://arxiv.org/abs/2205.10456v3","updated":"2023-09-12T14:22:36Z","published":"2022-05-20T22:47:19Z","title":"PSO-Convolutional Neural Networks with Heterogeneous Learning Rate","summary":" Convolutional Neural Networks (ConvNets or CNNs) have been candidly deployed\nin the scope of computer vision and related fields. Nevertheless, the dynamics\nof training of these neural networks lie still elusive: it is hard and\ncomputationally expensive to train them. A myriad of architectures and training\nstrategies have been proposed to overcome this challenge and address several\nproblems in image processing such as speech, image and action recognition as\nwell as object detection. In this article, we propose a novel Particle Swarm\nOptimization (PSO) based training for ConvNets. In such framework, the vector\nof weights of each ConvNet is typically cast as the position of a particle in\nphase space whereby PSO collaborative dynamics intertwines with Stochastic\nGradient Descent (SGD) in order to boost training performance and\ngeneralization. Our approach goes as follows: i) [regular phase] each ConvNet\nis trained independently via SGD; ii) [collaborative phase] ConvNets share\namong themselves their current vector of weights (or particle-position) along\nwith their gradient estimates of the Loss function. Distinct step sizes are\ncoined by distinct ConvNets. By properly blending ConvNets with large (possibly\nrandom) step-sizes along with more conservative ones, we propose an algorithm\nwith competitive performance with respect to other PSO-based approaches on\nCifar-10 and Cifar-100 (accuracy of 98.31% and 87.48%). These accuracy levels\nare obtained by resorting to only four ConvNets -- such results are expected to\nscale with the number of collaborative ConvNets accordingly. 
We make our source\ncodes available for download https://github.com/leonlha/PSO-ConvNet-Dynamics.\n","authors":["Nguyen Huu Phong","Augusto Santos","Bernardete Ribeiro"],"pdf_url":"https://arxiv.org/pdf/2205.10456v3.pdf","comment":"20 pages"},{"id":"http://arxiv.org/abs/2309.06260v1","updated":"2023-09-12T14:20:12Z","published":"2023-09-12T14:20:12Z","title":"Toward Discretization-Consistent Closure Schemes for Large Eddy\n Simulation Using Reinforcement Learning","summary":" We propose a novel method for developing discretization-consistent closure\nschemes for implicitly filtered Large Eddy Simulation (LES). In implicitly\nfiltered LES, the induced filter kernel, and thus the closure terms, are\ndetermined by the properties of the grid and the discretization operator,\nleading to additional computational subgrid terms that are generally unknown in\na priori analysis. Therefore, the task of adapting the coefficients of LES\nclosure models is formulated as a Markov decision process and solved in an a\nposteriori manner with Reinforcement Learning (RL). This allows to adjust the\nmodel to the actual discretization as it also incorporates the interaction\nbetween the discretization and the model itself. This optimization framework is\napplied to both explicit and implicit closure models. An element-local eddy\nviscosity model is optimized as the explicit model. For the implicit modeling,\nRL is applied to identify an optimal blending strategy for a hybrid\ndiscontinuous Galerkin (DG) and finite volume scheme. All newly derived models\nachieve accurate and consistent results, either matching or outperforming\nclassical state-of-the-art models for different discretizations and\nresolutions. Moreover, the explicit model is demonstrated to adapt its\ndistribution of viscosity within the DG elements to the inhomogeneous\ndiscretization properties of the operator. In the implicit case, the optimized\nhybrid scheme renders itself as a viable modeling ansatz that could initiate a\nnew class of high order schemes for compressible turbulence. Overall, the\nresults demonstrate that the proposed RL optimization can provide\ndiscretization-consistent closures that could reduce the uncertainty in\nimplicitly filtered LES.\n","authors":["Andrea Beck","Marius Kurz"],"pdf_url":"https://arxiv.org/pdf/2309.06260v1.pdf","comment":"24 pages, 14 figures"},{"id":"http://arxiv.org/abs/2303.17727v4","updated":"2023-09-12T14:17:53Z","published":"2023-03-30T22:03:43Z","title":"BOLT: An Automated Deep Learning Framework for Training and Deploying\n Large-Scale Search and Recommendation Models on Commodity CPU Hardware","summary":" Efficient large-scale neural network training and inference on commodity CPU\nhardware is of immense practical significance in democratizing deep learning\n(DL) capabilities. Presently, the process of training massive models consisting\nof hundreds of millions to billions of parameters requires the extensive use of\nspecialized hardware accelerators, such as GPUs, which are only accessible to a\nlimited number of institutions with considerable financial resources. Moreover,\nthere is often an alarming carbon footprint associated with training and\ndeploying these models. In this paper, we take a step towards addressing these\nchallenges by introducing BOLT, a sparse deep learning library for training\nlarge-scale search and recommendation models on standard CPU hardware. 
BOLT\nprovides a flexible, high-level API for constructing models that will be\nfamiliar to users of existing popular DL frameworks. By automatically tuning\nspecialized hyperparameters, BOLT also abstracts away the algorithmic details\nof sparse network training. We evaluate BOLT on a number of information\nretrieval tasks including product recommendations, text classification, graph\nneural networks, and personalization. We find that our proposed system achieves\ncompetitive performance with state-of-the-art techniques at a fraction of the\ncost and energy consumption and an order-of-magnitude faster inference time.\nBOLT has also been successfully deployed by multiple businesses to address\ncritical problems, and we highlight one customer case study in the field of\ne-commerce.\n","authors":["Nicholas Meisburger","Vihan Lakshman","Benito Geordie","Joshua Engels","David Torres Ramos","Pratik Pranav","Benjamin Coleman","Benjamin Meisburger","Shubh Gupta","Yashwanth Adunukota","Tharun Medini","Anshumali Shrivastava"],"pdf_url":"https://arxiv.org/pdf/2303.17727v4.pdf","comment":"6 pages, 5 tables, 3 figures. CIKM 2023 (Applied Research Track)"},{"id":"http://arxiv.org/abs/2309.06256v1","updated":"2023-09-12T14:16:54Z","published":"2023-09-12T14:16:54Z","title":"Speciality vs Generality: An Empirical Study on Catastrophic Forgetting\n in Fine-tuning Foundation Models","summary":" Foundation models, including Vision Language Models (VLMs) and Large Language\nModels (LLMs), possess the $generality$ to handle diverse distributions and\ntasks, which stems from their extensive pre-training datasets. The fine-tuning\nof foundation models is a common practice to enhance task performance or align\nthe model's behavior with human expectations, allowing them to gain\n$speciality$. However, the small datasets used for fine-tuning may not\nadequately cover the diverse distributions and tasks encountered during\npre-training. Consequently, the pursuit of speciality during fine-tuning can\nlead to a loss of {generality} in the model, which is related to catastrophic\nforgetting (CF) in deep learning. In this study, we demonstrate this phenomenon\nin both VLMs and LLMs. For instance, fine-tuning VLMs like CLIP on ImageNet\nresults in a loss of generality in handling diverse distributions, and\nfine-tuning LLMs like Galactica in the medical domain leads to a loss in\nfollowing instructions and common sense.\n To address the trade-off between the speciality and generality, we\ninvestigate multiple regularization methods from continual learning, the weight\naveraging method (Wise-FT) from out-of-distributional (OOD) generalization,\nwhich interpolates parameters between pre-trained and fine-tuned models, and\nparameter-efficient fine-tuning methods like Low-Rank Adaptation (LoRA). 
Our\nfindings show that both continual learning and Wise-ft methods effectively\nmitigate the loss of generality, with Wise-FT exhibiting the strongest\nperformance in balancing speciality and generality.\n","authors":["Yong Lin","Lu Tan","Hangyu Lin","Zeming Zheng","Renjie Pi","Jipeng Zhang","Shizhe Diao","Haoxiang Wang","Han Zhao","Yuan Yao","Tong Zhang"],"pdf_url":"https://arxiv.org/pdf/2309.06256v1.pdf","comment":"30 Pages"},{"id":"http://arxiv.org/abs/2309.06255v1","updated":"2023-09-12T14:16:34Z","published":"2023-09-12T14:16:34Z","title":"Enhancing Multi-modal Cooperation via Fine-grained Modality Valuation","summary":" One primary topic of multi-modal learning is to jointly incorporate\nheterogeneous information from different modalities. However, most models often\nsuffer from unsatisfactory multi-modal cooperation, which could not jointly\nutilize all modalities well. Some methods are proposed to identify and enhance\nthe worse learnt modality, but are often hard to provide the fine-grained\nobservation of multi-modal cooperation at sample-level with theoretical\nsupport. Hence, it is essential to reasonably observe and improve the\nfine-grained cooperation between modalities, especially when facing realistic\nscenarios where the modality discrepancy could vary across different samples.\nTo this end, we introduce a fine-grained modality valuation metric to evaluate\nthe contribution of each modality at sample-level. Via modality valuation, we\nregretfully observe that the multi-modal model tends to rely on one specific\nmodality, resulting in other modalities being low-contributing. We further\nanalyze this issue and improve cooperation between modalities by enhancing the\ndiscriminative ability of low-contributing modalities in a targeted manner.\nOverall, our methods reasonably observe the fine-grained uni-modal contribution\nat sample-level and achieve considerable improvement on different multi-modal\nmodels.\n","authors":["Yake Wei","Ruoxuan Feng","Zihe Wang","Di Hu"],"pdf_url":"https://arxiv.org/pdf/2309.06255v1.pdf","comment":"7 pages"},{"id":"http://arxiv.org/abs/2309.06248v1","updated":"2023-09-12T14:04:12Z","published":"2023-09-12T14:04:12Z","title":"Rethinking Evaluation Metric for Probability Estimation Models Using\n Esports Data","summary":" Probability estimation models play an important role in various fields, such\nas weather forecasting, recommendation systems, and sports analysis. Among\nseveral models estimating probabilities, it is difficult to evaluate which\nmodel gives reliable probabilities since the ground-truth probabilities are not\navailable. The win probability estimation model for esports, which calculates\nthe win probability under a certain game state, is also one of the fields being\nactively studied in probability estimation. However, most of the previous works\nevaluated their models using accuracy, a metric that only can measure the\nperformance of discrimination. In this work, we firstly investigate the Brier\nscore and the Expected Calibration Error (ECE) as a replacement of accuracy\nused as a performance evaluation metric for win probability estimation models\nin esports field. Based on the analysis, we propose a novel metric called\nBalance score which is a simple yet effective metric in terms of six good\nproperties that probability estimation metric should have. 
Under the general\ncondition, we also found that the Balance score can be an effective\napproximation of the true expected calibration error which has been imperfectly\napproximated by ECE using the binning technique. Extensive evaluations using\nsimulation studies and real game snapshot data demonstrate the promising\npotential to adopt the proposed metric not only for the win probability\nestimation model for esports but also for evaluating general probability\nestimation models.\n","authors":["Euihyeon Choi","Jooyoung Kim","Wonkyung Lee"],"pdf_url":"https://arxiv.org/pdf/2309.06248v1.pdf","comment":"7 pages"},{"id":"http://arxiv.org/abs/2309.06240v1","updated":"2023-09-12T13:58:04Z","published":"2023-09-12T13:58:04Z","title":"Consistency and adaptivity are complementary targets for the validation\n of variance-based uncertainty quantification metrics in machine learning\n regression tasks","summary":" Reliable uncertainty quantification (UQ) in machine learning (ML) regression\ntasks is becoming the focus of many studies in materials and chemical science.\nIt is now well understood that average calibration is insufficient, and most\nstudies implement additional methods testing the conditional calibration with\nrespect to uncertainty, i.e. consistency. Consistency is assessed mostly by\nso-called reliability diagrams. There exists however another way beyond average\ncalibration, which is conditional calibration with respect to input features,\ni.e. adaptivity. In practice, adaptivity is the main concern of the final users\nof a ML-UQ method, seeking for the reliability of predictions and uncertainties\nfor any point in features space. This article aims to show that consistency and\nadaptivity are complementary validation targets, and that a good consistency\ndoes not imply a good adaptivity. Adapted validation methods are proposed and\nillustrated on a representative example.\n","authors":["Pascal Pernot"],"pdf_url":"https://arxiv.org/pdf/2309.06240v1.pdf","comment":"arXiv admin note: text overlap with arXiv:2303.07170"},{"id":"http://arxiv.org/abs/2309.06239v1","updated":"2023-09-12T13:55:01Z","published":"2023-09-12T13:55:01Z","title":"Risk-Aware Reinforcement Learning through Optimal Transport Theory","summary":" In the dynamic and uncertain environments where reinforcement learning (RL)\noperates, risk management becomes a crucial factor in ensuring reliable\ndecision-making. Traditional RL approaches, while effective in reward\noptimization, often overlook the landscape of potential risks. In response,\nthis paper pioneers the integration of Optimal Transport (OT) theory with RL to\ncreate a risk-aware framework. Our approach modifies the objective function,\nensuring that the resulting policy not only maximizes expected rewards but also\nrespects risk constraints dictated by OT distances between state visitation\ndistributions and the desired risk profiles. By leveraging the mathematical\nprecision of OT, we offer a formulation that elevates risk considerations\nalongside conventional RL objectives. Our contributions are substantiated with\na series of theorems, mapping the relationships between risk distributions,\noptimal value functions, and policy behaviors. 
Through the lens of OT, this\nwork illuminates a promising direction for RL, ensuring a balanced fusion of\nreward pursuit and risk awareness.\n","authors":["Ali Baheri"],"pdf_url":"https://arxiv.org/pdf/2309.06239v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2302.05587v2","updated":"2023-09-12T13:52:55Z","published":"2023-02-11T03:35:13Z","title":"Hierarchical Optimization-Derived Learning","summary":" In recent years, by utilizing optimization techniques to formulate the\npropagation of deep model, a variety of so-called Optimization-Derived Learning\n(ODL) approaches have been proposed to address diverse learning and vision\ntasks. Although having achieved relatively satisfying practical performance,\nthere still exist fundamental issues in existing ODL methods. In particular,\ncurrent ODL methods tend to consider model construction and learning as two\nseparate phases, and thus fail to formulate their underlying coupling and\ndepending relationship. In this work, we first establish a new framework, named\nHierarchical ODL (HODL), to simultaneously investigate the intrinsic behaviors\nof optimization-derived model construction and its corresponding learning\nprocess. Then we rigorously prove the joint convergence of these two sub-tasks,\nfrom the perspectives of both approximation quality and stationary analysis. To\nour best knowledge, this is the first theoretical guarantee for these two\ncoupled ODL components: optimization and learning. We further demonstrate the\nflexibility of our framework by applying HODL to challenging learning tasks,\nwhich have not been properly addressed by existing ODL methods. Finally, we\nconduct extensive experiments on both synthetic data and real applications in\nvision and other learning tasks to verify the theoretical properties and\npractical performance of HODL in various application scenarios.\n","authors":["Risheng Liu","Xuan Liu","Shangzhi Zeng","Jin Zhang","Yixuan Zhang"],"pdf_url":"https://arxiv.org/pdf/2302.05587v2.pdf","comment":"Accepted by IEEE TPAMI, 16 pages"},{"id":"http://arxiv.org/abs/2309.06236v1","updated":"2023-09-12T13:51:29Z","published":"2023-09-12T13:51:29Z","title":"The first step is the hardest: Pitfalls of Representing and Tokenizing\n Temporal Data for Large Language Models","summary":" Large Language Models (LLMs) have demonstrated remarkable generalization\nacross diverse tasks, leading individuals to increasingly use them as personal\nassistants and universal computing engines. Nevertheless, a notable obstacle\nemerges when feeding numerical/temporal data into these models, such as data\nsourced from wearables or electronic health records. LLMs employ tokenizers in\ntheir input that break down text into smaller units. However, tokenizers are\nnot designed to represent numerical values and might struggle to understand\nrepetitive patterns and context, treating consecutive values as separate tokens\nand disregarding their temporal relationships. Here, we discuss recent works\nthat employ LLMs for human-centric tasks such as in mobile health sensing and\npresent a case study showing that popular LLMs tokenize temporal data\nincorrectly. To address that, we highlight potential solutions such as prompt\ntuning with lightweight embedding layers as well as multimodal adapters, that\ncan help bridge this \"modality gap\". 
While the capability of language models to\ngeneralize to other modalities with minimal or no finetuning is exciting, this\npaper underscores the fact that their outputs cannot be meaningful if they\nstumble over input nuances.\n","authors":["Dimitris Spathis","Fahim Kawsar"],"pdf_url":"https://arxiv.org/pdf/2309.06236v1.pdf","comment":"Accepted at the Generative AI for Pervasive Computing Symposium\n (GenAI4PC) at UbiComp 2023"},{"id":"http://arxiv.org/abs/2309.06230v1","updated":"2023-09-12T13:48:06Z","published":"2023-09-12T13:48:06Z","title":"A Consistent and Scalable Algorithm for Best Subset Selection in Single\n Index Models","summary":" Analysis of high-dimensional data has led to increased interest in both\nsingle index models (SIMs) and best subset selection. SIMs provide an\ninterpretable and flexible modeling framework for high-dimensional data, while\nbest subset selection aims to find a sparse model from a large set of\npredictors. However, best subset selection in high-dimensional models is known\nto be computationally intractable. Existing methods tend to relax the\nselection, but do not yield the best subset solution. In this paper, we\ndirectly tackle the intractability by proposing the first provably scalable\nalgorithm for best subset selection in high-dimensional SIMs. Our algorithmic\nsolution enjoys the subset selection consistency and has the oracle property\nwith a high probability. The algorithm comprises a generalized information\ncriterion to determine the support size of the regression coefficients,\neliminating the model selection tuning. Moreover, our method does not assume an\nerror distribution or a specific link function and hence is flexible to apply.\nExtensive simulation results demonstrate that our method is not only\ncomputationally efficient but also able to exactly recover the best subset in\nvarious settings (e.g., linear regression, Poisson regression, heteroscedastic\nmodels).\n","authors":["Borui Tang","Jin Zhu","Junxian Zhu","Xueqin Wang","Heping Zhang"],"pdf_url":"https://arxiv.org/pdf/2309.06230v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2302.02070v2","updated":"2023-09-12T13:43:38Z","published":"2023-02-04T02:47:41Z","title":"Semantic-Guided Generative Image Augmentation Method with Diffusion\n Models for Image Classification","summary":" Existing image augmentation methods consist of two categories:\nperturbation-based methods and generative methods. Perturbation-based methods\napply pre-defined perturbations to augment an original image, but only locally\nvary the image, thus lacking image diversity. In contrast, generative methods\nbring more image diversity in the augmented images but may not preserve\nsemantic consistency, thus incorrectly changing the essential semantics of the\noriginal image. To balance image diversity and semantic consistency in\naugmented images, we propose SGID, a Semantic-guided Generative Image\naugmentation method with Diffusion models for image classification.\nSpecifically, SGID employs diffusion models to generate augmented images with\ngood image diversity. More importantly, SGID takes image labels and captions as\nguidance to maintain semantic consistency between the augmented and original\nimages. Experimental results show that SGID outperforms the best augmentation\nbaseline by 1.72% on ResNet-50 (from scratch), 0.33% on ViT (ImageNet-21k), and\n0.14% on CLIP-ViT (LAION-2B). Moreover, SGID can be combined with other image\naugmentation baselines and further improves the overall performance. 
We\ndemonstrate the semantic consistency and image diversity of SGID through\nquantitative human and automated evaluations, as well as qualitative case\nstudies.\n","authors":["Bohan Li","Xiao Xu","Xinghao Wang","Yutai Hou","Yunlong Feng","Feng Wang","Xuanliang Zhang","Qingfu Zhu","Wanxiang Che"],"pdf_url":"https://arxiv.org/pdf/2302.02070v2.pdf","comment":"17 pages, 13 figures, 8 tables"},{"id":"http://arxiv.org/abs/2309.06212v1","updated":"2023-09-12T13:28:06Z","published":"2023-09-12T13:28:06Z","title":"Long-term drought prediction using deep neural networks based on\n geospatial weather data","summary":" The accurate prediction of drought probability in specific regions is crucial\nfor informed decision-making in agricultural practices. It is important to make\npredictions one year in advance, particularly for long-term decisions. However,\nforecasting this probability presents challenges due to the complex interplay\nof various factors within the region of interest and neighboring areas. In this\nstudy, we propose an end-to-end solution to address this issue based on various\nspatiotemporal neural networks. The models considered focus on predicting the\ndrought intensity based on the Palmer Drought Severity Index (PDSI) for\nsubregions of interest, leveraging intrinsic factors and insights from climate\nmodels to enhance drought predictions.\n Comparative evaluations demonstrate the superior accuracy of Convolutional\nLSTM (ConvLSTM) and transformer models compared to baseline gradient boosting\nand logistic regression solutions. The two former models achieved impressive\nROC AUC scores from 0.90 to 0.70 for forecast horizons from one to six months,\noutperforming baseline models. The transformer showed superiority for shorter\nhorizons, while ConvLSTM did so for longer horizons. Thus, we recommend\nselecting the models accordingly for long-term drought forecasting.\n To ensure the broad applicability of the considered models, we conduct\nextensive validation across regions worldwide, considering different\nenvironmental conditions. We also run several ablation and sensitivity studies\nto challenge our findings and provide additional information on how to solve\nthe problem.\n","authors":["Vsevolod Grabar","Alexander Marusov","Alexey Zaytsev","Yury Maximov","Nazar Sotiriadi","Alexander Bulkin"],"pdf_url":"https://arxiv.org/pdf/2309.06212v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.06195v1","updated":"2023-09-12T13:03:47Z","published":"2023-09-12T13:03:47Z","title":"Optimization Guarantees of Unfolded ISTA and ADMM Networks With Smooth\n Soft-Thresholding","summary":" Solving linear inverse problems plays a crucial role in numerous\napplications. Algorithm unfolding based, model-aware data-driven approaches\nhave gained significant attention for effectively addressing these problems.\nLearned iterative soft-thresholding algorithm (LISTA) and alternating direction\nmethod of multipliers compressive sensing network (ADMM-CSNet) are two widely\nused such approaches, based on ISTA and ADMM algorithms, respectively. In this\nwork, we study optimization guarantees, i.e., achieving near-zero training loss\nwith the increase in the number of learning epochs, for finite-layer unfolded\nnetworks such as LISTA and ADMM-CSNet with smooth soft-thresholding in an\nover-parameterized (OP) regime. We achieve this by leveraging a modified\nversion of the Polyak-Lojasiewicz, denoted PL$^*$, condition. 
Satisfying the\nPL$^*$ condition within a specific region of the loss landscape ensures the\nexistence of a global minimum and exponential convergence from initialization\nusing gradient descent based methods. Hence, we provide conditions, in terms of\nthe network width and the number of training samples, on these unfolded\nnetworks for the PL$^*$ condition to hold. We achieve this by deriving the\nHessian spectral norm of these networks. Additionally, we show that the\nthreshold on the number of training samples increases with the increase in the\nnetwork width. Furthermore, we compare the threshold on training samples of\nunfolded networks with that of a standard fully-connected feed-forward network\n(FFNN) with smooth soft-thresholding non-linearity. We prove that unfolded\nnetworks have a higher threshold value than FFNN. Consequently, one can expect\na better expected error for unfolded networks than FFNN.\n","authors":["Shaik Basheeruddin Shah","Pradyumna Pradhan","Wei Pu","Ramunaidu Randhi","Miguel R. D. Rodrigues","Yonina C. Eldar"],"pdf_url":"https://arxiv.org/pdf/2309.06195v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.06183v1","updated":"2023-09-12T12:51:12Z","published":"2023-09-12T12:51:12Z","title":"Assessing the Generalization Gap of Learning-Based Speech Enhancement\n Systems in Noisy and Reverberant Environments","summary":" The acoustic variability of noisy and reverberant speech mixtures is\ninfluenced by multiple factors, such as the spectro-temporal characteristics of\nthe target speaker and the interfering noise, the signal-to-noise ratio (SNR)\nand the room characteristics. This large variability poses a major challenge\nfor learning-based speech enhancement systems, since a mismatch between the\ntraining and testing conditions can substantially reduce the performance of the\nsystem. Generalization to unseen conditions is typically assessed by testing\nthe system with a new speech, noise or binaural room impulse response (BRIR)\ndatabase different from the one used during training. However, the difficulty\nof the speech enhancement task can change across databases, which can\nsubstantially influence the results. The present study introduces a\ngeneralization assessment framework that uses a reference model trained on the\ntest condition, such that it can be used as a proxy for the difficulty of the\ntest condition. This allows to disentangle the effect of the change in task\ndifficulty from the effect of dealing with new data, and thus to define a new\nmeasure of generalization performance termed the generalization gap. The\nprocedure is repeated in a cross-validation fashion by cycling through multiple\nspeech, noise, and BRIR databases to accurately estimate the generalization\ngap. The proposed framework is applied to evaluate the generalization potential\nof a feedforward neural network (FFNN), Conv-TasNet, DCCRN and MANNER. We find\nthat for all models, the performance degrades the most in speech mismatches,\nwhile good noise and room generalization can be achieved by training on\nmultiple databases. 
Moreover, while recent models show higher performance in\nmatched conditions, their performance substantially decreases in mismatched\nconditions and can become inferior to that of the FFNN-based system.\n","authors":["Philippe Gonzalez","Tommy Sonne Alstrøm","Tobias May"],"pdf_url":"https://arxiv.org/pdf/2309.06183v1.pdf","comment":"This work has been submitted to the IEEE for possible publication.\n Copyright may be transferred without notice, after which this version may no\n longer be accessible"},{"id":"http://arxiv.org/abs/2309.06180v1","updated":"2023-09-12T12:50:04Z","published":"2023-09-12T12:50:04Z","title":"Efficient Memory Management for Large Language Model Serving with\n PagedAttention","summary":" High throughput serving of large language models (LLMs) requires batching\nsufficiently many requests at a time. However, existing systems struggle\nbecause the key-value cache (KV cache) memory for each request is huge and\ngrows and shrinks dynamically. When managed inefficiently, this memory can be\nsignificantly wasted by fragmentation and redundant duplication, limiting the\nbatch size. To address this problem, we propose PagedAttention, an attention\nalgorithm inspired by the classical virtual memory and paging techniques in\noperating systems. On top of it, we build vLLM, an LLM serving system that\nachieves (1) near-zero waste in KV cache memory and (2) flexible sharing of KV\ncache within and across requests to further reduce memory usage. Our\nevaluations show that vLLM improves the throughput of popular LLMs by\n2-4$\\times$ with the same level of latency compared to the state-of-the-art\nsystems, such as FasterTransformer and Orca. The improvement is more pronounced\nwith longer sequences, larger models, and more complex decoding algorithms.\nvLLM's source code is publicly available at\nhttps://github.com/vllm-project/vllm\n","authors":["Woosuk Kwon","Zhuohan Li","Siyuan Zhuang","Ying Sheng","Lianmin Zheng","Cody Hao Yu","Joseph E. Gonzalez","Hao Zhang","Ion Stoica"],"pdf_url":"https://arxiv.org/pdf/2309.06180v1.pdf","comment":"SOSP 2023"},{"id":"http://arxiv.org/abs/2306.10388v2","updated":"2023-09-12T12:29:14Z","published":"2023-06-17T16:17:48Z","title":"Breaking On-device Training Memory Wall: A Systematic Survey","summary":" On-device training has become an increasingly popular approach to machine\nlearning, enabling models to be trained directly on mobile and edge devices.\nHowever, a major challenge in this area is the limited memory available on\nthese devices, which can severely restrict the size and complexity of the\nmodels that can be trained. In this systematic survey, we aim to explore the\ncurrent state-of-the-art techniques for breaking on-device training memory\nwalls, focusing on methods that can enable larger and more complex models to be\ntrained on resource-constrained devices.\n Specifically, we first analyze the key factors that contribute to the\nphenomenon of memory walls encountered during on-device training. Then, we\npresent a comprehensive literature review of on-device training, which\naddresses the issue of memory limitations. 
Finally, we summarize on-device\ntraining and highlight the open problems for future research.\n By providing a comprehensive overview of these techniques and their\neffectiveness in breaking memory walls, we hope to help researchers and\npractitioners in this field navigate the rapidly evolving landscape of\non-device training.\n","authors":["Shitian Li","Chunlin Tian","Kahou Tam","Rui Ma","Li Li"],"pdf_url":"https://arxiv.org/pdf/2306.10388v2.pdf","comment":"8 pages, 3 figures"},{"id":"http://arxiv.org/abs/2309.06169v1","updated":"2023-09-12T12:27:17Z","published":"2023-09-12T12:27:17Z","title":"Elucidating the solution space of extended reverse-time SDE for\n diffusion models","summary":" Diffusion models (DMs) demonstrate potent image generation capabilities in\nvarious generative modeling tasks. Nevertheless, their primary limitation lies\nin slow sampling speed, requiring hundreds or thousands of sequential function\nevaluations through large neural networks to generate high-quality images.\nSampling from DMs can be seen as solving corresponding stochastic differential\nequations (SDEs) or ordinary differential equations (ODEs). In this work, we\nformulate the sampling process as an extended reverse-time SDE (ER SDE),\nunifying prior explorations into ODEs and SDEs. Leveraging the semi-linear\nstructure of ER SDE solutions, we offer exact solutions and arbitrarily\nhigh-order approximate solutions for VP SDE and VE SDE, respectively. Based on\nthe solution space of the ER SDE, we yield mathematical insights elucidating\nthe superior performance of ODE solvers over SDE solvers in terms of fast\nsampling. Additionally, we unveil that VP SDE solvers stand on par with their\nVE SDE counterparts. Finally, we devise fast and training-free samplers, ER-SDE\nSolvers, elevating the efficiency of stochastic samplers to unprecedented\nlevels. Experimental results demonstrate achieving 3.45 FID in 20 function\nevaluations and 2.24 FID in 50 function evaluations on the ImageNet\n64$\\times$64 dataset.\n","authors":["Qinpeng Cui","Xinyi Zhang","Zongqing Lu","Qingmin Liao"],"pdf_url":"https://arxiv.org/pdf/2309.06169v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.06166v1","updated":"2023-09-12T12:23:49Z","published":"2023-09-12T12:23:49Z","title":"Certified Robust Models with Slack Control and Large Lipschitz Constants","summary":" Despite recent success, state-of-the-art learning-based models remain highly\nvulnerable to input changes such as adversarial examples. In order to obtain\ncertifiable robustness against such perturbations, recent work considers\nLipschitz-based regularizers or constraints while at the same time increasing\nprediction margin. Unfortunately, this comes at the cost of significantly\ndecreased accuracy. In this paper, we propose a Calibrated Lipschitz-Margin\nLoss (CLL) that addresses this issue and improves certified robustness by\ntackling two problems: Firstly, commonly used margin losses do not adjust the\npenalties to the shrinking output distribution; caused by minimizing the\nLipschitz constant $K$. Secondly, and most importantly, we observe that\nminimization of $K$ can lead to overly smooth decision functions. This limits\nthe model's complexity and thus reduces accuracy. Our CLL addresses these\nissues by explicitly calibrating the loss w.r.t. margin and Lipschitz constant,\nthereby establishing full control over slack and improving robustness\ncertificates even with larger Lipschitz constants. 
On CIFAR-10, CIFAR-100 and\nTiny-ImageNet, our models consistently outperform losses that leave the\nconstant unattended. On CIFAR-100 and Tiny-ImageNet, CLL improves upon\nstate-of-the-art deterministic $L_2$ robust accuracies. In contrast to current\ntrends, we unlock potential of much smaller models without $K=1$ constraints.\n","authors":["Max Losch","David Stutz","Bernt Schiele","Mario Fritz"],"pdf_url":"https://arxiv.org/pdf/2309.06166v1.pdf","comment":"To be published at GCPR 2023"},{"id":"http://arxiv.org/abs/2305.06295v2","updated":"2023-09-12T12:13:18Z","published":"2023-05-10T16:36:54Z","title":"Extracting Diagnosis Pathways from Electronic Health Records Using Deep\n Reinforcement Learning","summary":" Clinical diagnosis guidelines aim at specifying the steps that may lead to a\ndiagnosis. Inspired by guidelines, we aim to learn the optimal sequence of\nactions to perform in order to obtain a correct diagnosis from electronic\nhealth records. We apply various deep reinforcement learning algorithms to this\ntask and experiment on a synthetic but realistic dataset to differentially\ndiagnose anemia and its subtypes and particularly evaluate the robustness of\nvarious approaches to noise and missing data. Experimental results show that\nthe deep reinforcement learning algorithms show competitive performance\ncompared to the state-of-the-art methods with the added advantage that they\nenable the progressive generation of a pathway to the suggested diagnosis,\nwhich can both guide and explain the decision process.\n","authors":["Lillian Muyama","Antoine Neuraz","Adrien Coulet"],"pdf_url":"https://arxiv.org/pdf/2305.06295v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.05812v2","updated":"2023-09-12T12:06:40Z","published":"2023-07-11T21:38:52Z","title":"Safe Reinforcement Learning for Strategic Bidding of Virtual Power\n Plants in Day-Ahead Markets","summary":" This paper presents a novel safe reinforcement learning algorithm for\nstrategic bidding of Virtual Power Plants (VPPs) in day-ahead electricity\nmarkets. The proposed algorithm utilizes the Deep Deterministic Policy Gradient\n(DDPG) method to learn competitive bidding policies without requiring an\naccurate market model. Furthermore, to account for the complex internal\nphysical constraints of VPPs we introduce two enhancements to the DDPG method.\nFirstly, a projection-based safety shield that restricts the agent's actions to\nthe feasible space defined by the non-linear power flow equations and operating\nconstraints of distributed energy resources is derived. Secondly, a penalty for\nthe shield activation in the reward function that incentivizes the agent to\nlearn a safer policy is introduced. A case study based on the IEEE 13-bus\nnetwork demonstrates the effectiveness of the proposed approach in enabling the\nagent to learn a highly competitive, safe strategic policy.\n","authors":["Ognjen Stanojev","Lesia Mitridati","Riccardo de Nardis di Prata","Gabriela Hug"],"pdf_url":"https://arxiv.org/pdf/2307.05812v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.06157v1","updated":"2023-09-12T11:58:53Z","published":"2023-09-12T11:58:53Z","title":"Robust-MBDL: A Robust Multi-branch Deep Learning Based Model for\n Remaining Useful Life Prediction and Operational Condition Identification of\n Rotating Machines","summary":" In this paper, a Robust Multi-branch Deep learning-based system for remaining\nuseful life (RUL) prediction and condition operations (CO) identification of\nrotating machines is proposed. 
In particular, the proposed system comprises\nmain components: (1) an LSTM-Autoencoder to denoise the vibration data; (2) a\nfeature extraction to generate time-domain, frequency-domain, and\ntime-frequency based features from the denoised data; (3) a novel and robust\nmulti-branch deep learning network architecture to exploit the multiple\nfeatures. The performance of our proposed system was evaluated and compared to\nthe state-of-the-art systems on two benchmark datasets of XJTU-SY and\nPRONOSTIA. The experimental results prove that our proposed system outperforms\nthe state-of-the-art systems and presents potential for real-life applications\non bearing machines.\n","authors":["Khoa Tran","Hai-Canh Vu","Lam Pham","Nassim Boudaoud"],"pdf_url":"https://arxiv.org/pdf/2309.06157v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2302.05319v3","updated":"2023-09-12T11:46:55Z","published":"2023-02-10T15:28:55Z","title":"Large Language Models for Code: Security Hardening and Adversarial\n Testing","summary":" Large language models (large LMs) are increasingly trained on massive\ncodebases and used to generate code. However, LMs lack awareness of security\nand are found to frequently produce unsafe code. This work studies the security\nof LMs along two important axes: (i) security hardening, which aims to enhance\nLMs' reliability in generating secure code, and (ii) adversarial testing, which\nseeks to evaluate LMs' security at an adversarial standpoint. We address both\nof these by formulating a new security task called controlled code generation.\nThe task is parametric and takes as input a binary property to guide the LM to\ngenerate secure or unsafe code, while preserving the LM's capability of\ngenerating functionally correct code. We propose a novel learning-based\napproach called SVEN to solve this task. SVEN leverages property-specific\ncontinuous vectors to guide program generation towards the given property,\nwithout modifying the LM's weights. Our training procedure optimizes these\ncontinuous vectors by enforcing specialized loss terms on different regions of\ncode, using a high-quality dataset carefully curated by us. Our extensive\nevaluation shows that SVEN is highly effective in achieving strong security\ncontrol. For instance, a state-of-the-art CodeGen LM with 2.7B parameters\ngenerates secure code for 59.1% of the time. When we employ SVEN to perform\nsecurity hardening (or adversarial testing) on this LM, the ratio is\nsignificantly boosted to 92.3% (or degraded to 36.8%). Importantly, SVEN\nclosely matches the original LMs in functional correctness.\n","authors":["Jingxuan He","Martin Vechev"],"pdf_url":"https://arxiv.org/pdf/2302.05319v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.06142v1","updated":"2023-09-12T11:29:12Z","published":"2023-09-12T11:29:12Z","title":"Towards Reliable Domain Generalization: A New Dataset and Evaluations","summary":" There are ubiquitous distribution shifts in the real world. However, deep\nneural networks (DNNs) are easily biased towards the training set, which causes\nsevere performance degradation when they receive out-of-distribution data. Many\nmethods are studied to train models that generalize under various distribution\nshifts in the literature of domain generalization (DG). 
However, the recent\nDomainBed and WILDS benchmarks challenged the effectiveness of these methods.\nAiming at the problems in the existing research, we propose a new domain\ngeneralization task for handwritten Chinese character recognition (HCCR) to\nenrich the application scenarios of DG method research. We evaluate eighteen DG\nmethods on the proposed PaHCC (Printed and Handwritten Chinese Characters)\ndataset and show that the performance of existing methods on this dataset is\nstill unsatisfactory. Besides, under a designed dynamic DG setting, we reveal\nmore properties of DG methods and argue that only the leave-one-domain-out\nprotocol is unreliable. We advocate that researchers in the DG community refer\nto dynamic performance of methods for more comprehensive and reliable\nevaluation. Our dataset and evaluations bring new perspectives to the community\nfor more substantial progress. We will make our dataset public with the article\npublished to facilitate the study of domain generalization.\n","authors":["Jiao Zhang","Xu-Yao Zhang","Cheng-Lin Liu"],"pdf_url":"https://arxiv.org/pdf/2309.06142v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2109.00172v3","updated":"2023-09-12T11:10:09Z","published":"2021-09-01T03:56:20Z","title":"Task-Oriented Communication for Multi-Device Cooperative Edge Inference","summary":" This paper investigates task-oriented communication for multi-device\ncooperative edge inference, where a group of distributed low-end edge devices\ntransmit the extracted features of local samples to a powerful edge server for\ninference. While cooperative edge inference can overcome the limited sensing\ncapability of a single device, it substantially increases the communication\noverhead and may incur excessive latency. To enable low-latency cooperative\ninference, we propose a learning-based communication scheme that optimizes\nlocal feature extraction and distributed feature encoding in a task-oriented\nmanner, i.e., to remove data redundancy and transmit information that is\nessential for the downstream inference task rather than reconstructing the data\nsamples at the edge server. Specifically, we leverage an information bottleneck\n(IB) principle to extract the task-relevant feature at each edge device and\nadopt a distributed information bottleneck (DIB) framework to formalize a\nsingle-letter characterization of the optimal rate-relevance tradeoff for\ndistributed feature encoding. To admit flexible control of the communication\noverhead, we extend the DIB framework to a distributed deterministic\ninformation bottleneck (DDIB) objective that explicitly incorporates the\nrepresentational costs of the encoded features. As the IB-based objectives are\ncomputationally prohibitive for high-dimensional data, we adopt variational\napproximations to make the optimization problems tractable. To compensate the\npotential performance loss due to the variational approximations, we also\ndevelop a selective retransmission (SR) mechanism to identify the redundancy in\nthe encoded features of multiple edge devices to attain additional\ncommunication overhead reduction. 
Extensive experiments evidence that the\nproposed task-oriented communication scheme achieves a better rate-relevance\ntradeoff than baseline methods.\n","authors":["Jiawei Shao","Yuyi Mao","Jun Zhang"],"pdf_url":"https://arxiv.org/pdf/2109.00172v3.pdf","comment":"This paper was accepted to IEEE Transactions on Wireless\n Communication"},{"id":"http://arxiv.org/abs/2309.06127v1","updated":"2023-09-12T11:06:01Z","published":"2023-09-12T11:06:01Z","title":"Accelerating Edge AI with Morpher: An Integrated Design, Compilation and\n Simulation Framework for CGRAs","summary":" Coarse-Grained Reconfigurable Arrays (CGRAs) hold great promise as\npower-efficient edge accelerator, offering versatility beyond AI applications.\nMorpher, an open-source, architecture-adaptive CGRA design framework, is\nspecifically designed to explore the vast design space of CGRAs. The\ncomprehensive ecosystem of Morpher includes a tailored compiler, simulator,\naccelerator synthesis, and validation framework. This study provides an\noverview of Morpher, highlighting its capabilities in automatically compiling\nAI application kernels onto user-defined CGRA architectures and verifying their\nfunctionality. Through the Morpher framework, the versatility of CGRAs is\nharnessed to facilitate efficient compilation and verification of edge AI\napplications, covering important kernels representative of a wide range of\nembedded AI workloads. Morpher is available online at\nhttps://github.com/ecolab-nus/morpher-v2.\n","authors":["Dhananjaya Wijerathne","Zhaoying Li","Tulika Mitra"],"pdf_url":"https://arxiv.org/pdf/2309.06127v1.pdf","comment":"This work was accepted by the Workshop on Compilers, Deployment, and\n Tooling for Edge AI (CODAI 2023), co-hosted at Embedded Systems Week on\n September 21st, 2023"},{"id":"http://arxiv.org/abs/2309.06126v1","updated":"2023-09-12T11:02:27Z","published":"2023-09-12T11:02:27Z","title":"AstroLLaMA: Towards Specialized Foundation Models in Astronomy","summary":" Large language models excel in many human-language tasks but often falter in\nhighly specialized domains like scholarly astronomy. To bridge this gap, we\nintroduce AstroLLaMA, a 7-billion-parameter model fine-tuned from LLaMA-2 using\nover 300,000 astronomy abstracts from arXiv. Optimized for traditional causal\nlanguage modeling, AstroLLaMA achieves a 30% lower perplexity than Llama-2,\nshowing marked domain adaptation. Our model generates more insightful and\nscientifically relevant text completions and embedding extraction than\nstate-of-the-arts foundation models despite having significantly fewer\nparameters. AstroLLaMA serves as a robust, domain-specific model with broad\nfine-tuning potential. Its public release aims to spur astronomy-focused\nresearch, including automatic paper summarization and conversational agent\ndevelopment.\n","authors":["Tuan Dung Nguyen","Yuan-Sen Ting","Ioana Ciucă","Charlie O'Neill","Ze-Chang Sun","Maja Jabłońska","Sandor Kruk","Ernest Perkowski","Jack Miller","Jason Li","Josh Peek","Kartheik Iyer","Tomasz Różański","Pranav Khetarpal","Sharaf Zaman","David Brodrick","Sergio J. Rodríguez Méndez","Thang Bui","Alyssa Goodman","Alberto Accomazzi","Jill Naiman","Jesse Cranney","Kevin Schawinski"," UniverseTBD"],"pdf_url":"https://arxiv.org/pdf/2309.06126v1.pdf","comment":"6 pages, 3 figures, submitted to IJCNLP-AACL 2023. Comments are\n welcome. 
The model can be found on Hugging Face -\n https://huggingface.co/universeTBD/astrollama"},{"id":"http://arxiv.org/abs/2309.03241v2","updated":"2023-09-12T11:01:25Z","published":"2023-09-06T06:18:16Z","title":"GPT Can Solve Mathematical Problems Without a Calculator","summary":" Previous studies have typically assumed that large language models are unable\nto accurately perform arithmetic operations, particularly multiplication of >8\ndigits, and operations involving decimals and fractions, without the use of\ncalculator tools. This paper aims to challenge this misconception. With\nsufficient training data, a 2 billion-parameter language model can accurately\nperform multi-digit arithmetic operations with almost 100% accuracy without\ndata leakage, significantly surpassing GPT-4 (whose multi-digit multiplication\naccuracy is only 4.3%). We also demonstrate that our MathGLM, fine-tuned from\nGLM-10B on a dataset with additional multi-step arithmetic operations and math\nproblems described in text, achieves similar performance to GPT-4 on a\n5,000-samples Chinese math problem test set. Our code and data are public at\nhttps://github.com/THUDM/MathGLM.\n","authors":["Zhen Yang","Ming Ding","Qingsong Lv","Zhihuan Jiang","Zehai He","Yuyi Guo","Jinfeng Bai","Jie Tang"],"pdf_url":"https://arxiv.org/pdf/2309.03241v2.pdf","comment":"26pages,14figures"},{"id":"http://arxiv.org/abs/2309.04616v2","updated":"2023-09-12T10:45:10Z","published":"2023-09-08T22:13:03Z","title":"Knowledge Distillation-Empowered Digital Twin for Anomaly Detection","summary":" Cyber-physical systems (CPSs), like train control and management systems\n(TCMS), are becoming ubiquitous in critical infrastructures. As safety-critical\nsystems, ensuring their dependability during operation is crucial. Digital\ntwins (DTs) have been increasingly studied for this purpose owing to their\ncapability of runtime monitoring and warning, prediction and detection of\nanomalies, etc. However, constructing a DT for anomaly detection in TCMS\nnecessitates sufficient training data and extracting both chronological and\ncontext features with high quality. Hence, in this paper, we propose a novel\nmethod named KDDT for TCMS anomaly detection. KDDT harnesses a language model\n(LM) and a long short-term memory (LSTM) network to extract contexts and\nchronological features, respectively. To enrich data volume, KDDT benefits from\nout-of-domain data with knowledge distillation (KD). We evaluated KDDT with two\ndatasets from our industry partner Alstom and obtained the F1 scores of 0.931\nand 0.915, respectively, demonstrating the effectiveness of KDDT. We also\nexplored individual contributions of the DT model, LM, and KD to the overall\nperformance of KDDT, via a comprehensive empirical study, and observed average\nF1 score improvements of 12.4%, 3%, and 6.05%, respectively.\n","authors":["Qinghua Xu","Shaukat Ali","Tao Yue","Zaimovic Nedim","Inderjeet Singh"],"pdf_url":"https://arxiv.org/pdf/2309.04616v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.06122v1","updated":"2023-09-12T10:44:15Z","published":"2023-09-12T10:44:15Z","title":"A robust synthetic data generation framework for machine learning in\n High-Resolution Transmission Electron Microscopy (HRTEM)","summary":" Machine learning techniques are attractive options for developing\nhighly-accurate automated analysis tools for nanomaterials characterization,\nincluding high-resolution transmission electron microscopy (HRTEM). 
However,\nsuccessfully implementing such machine learning tools can be difficult due to\nthe challenges in procuring sufficiently large, high-quality training datasets\nfrom experiments. In this work, we introduce Construction Zone, a Python\npackage for rapidly generating complex nanoscale atomic structures, and develop\nan end-to-end workflow for creating large simulated databases for training\nneural networks. Construction Zone enables fast, systematic sampling of\nrealistic nanomaterial structures, and can be used as a random structure\ngenerator for simulated databases, which is important for generating large,\ndiverse synthetic datasets. Using HRTEM imaging as an example, we train a\nseries of neural networks on various subsets of our simulated databases to\nsegment nanoparticles and holistically study the data curation process to\nunderstand how various aspects of the curated simulated data -- including\nsimulation fidelity, the distribution of atomic structures, and the\ndistribution of imaging conditions -- affect model performance across several\nexperimental benchmarks. Using our results, we are able to achieve\nstate-of-the-art segmentation performance on experimental HRTEM images of\nnanoparticles from several experimental benchmarks and, further, we discuss\nrobust strategies for consistently achieving high performance with machine\nlearning in experimental settings using purely synthetic data.\n","authors":["Luis Rangel DaCosta","Katherine Sytwu","Catherine Groschner","Mary Scott"],"pdf_url":"https://arxiv.org/pdf/2309.06122v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.03239v2","updated":"2023-09-12T10:19:45Z","published":"2023-09-06T02:51:24Z","title":"Spatio-Temporal Contrastive Self-Supervised Learning for POI-level Crowd\n Flow Inference","summary":" Accurate acquisition of crowd flow at Points of Interest (POIs) is pivotal\nfor effective traffic management, public service, and urban planning. Despite\nthis importance, due to the limitations of urban sensing techniques, the data\nquality from most sources is inadequate for monitoring crowd flow at each POI.\nThis renders the inference of accurate crowd flow from low-quality data a\ncritical and challenging task. The complexity is heightened by three key\nfactors: 1) The scarcity and rarity of labeled data, 2) The intricate\nspatio-temporal dependencies among POIs, and 3) The myriad correlations between\nprecise crowd flow and GPS reports.\n To address these challenges, we recast the crowd flow inference problem as a\nself-supervised attributed graph representation learning task and introduce a\nnovel Contrastive Self-learning framework for Spatio-Temporal data (CSST). Our\napproach initiates with the construction of a spatial adjacency graph founded\non the POIs and their respective distances. We then employ a contrastive\nlearning technique to exploit large volumes of unlabeled spatio-temporal data.\nWe adopt a swapped prediction approach to anticipate the representation of the\ntarget subgraph from similar instances. Following the pre-training phase, the\nmodel is fine-tuned with accurate crowd flow data. 
Our experiments, conducted\non two real-world datasets, demonstrate that the CSST pre-trained on extensive\nnoisy data consistently outperforms models trained from scratch.\n","authors":["Songyu Ke","Ting Li","Li Song","Yanping Sun","Qintian Sun","Junbo Zhang","Yu Zheng"],"pdf_url":"https://arxiv.org/pdf/2309.03239v2.pdf","comment":"18 pages; submitted to TKDD;"},{"id":"http://arxiv.org/abs/2309.06097v1","updated":"2023-09-12T10:03:32Z","published":"2023-09-12T10:03:32Z","title":"Fidelity-Induced Interpretable Policy Extraction for Reinforcement\n Learning","summary":" Deep Reinforcement Learning (DRL) has achieved remarkable success in\nsequential decision-making problems. However, existing DRL agents make\ndecisions in an opaque fashion, hindering the user from establishing trust and\nscrutinizing weaknesses of the agents. While recent research has developed\nInterpretable Policy Extraction (IPE) methods for explaining how an agent takes\nactions, their explanations are often inconsistent with the agent's behavior\nand thus frequently fail to explain. To tackle this issue, we propose a novel\nmethod, Fidelity-Induced Policy Extraction (FIPE). Specifically, we start by\nanalyzing the optimization mechanism of existing IPE methods, elaborating on\nthe issue of ignoring consistency while increasing cumulative rewards. We then\ndesign a fidelity-induced mechanism by integrating a fidelity measurement into\nthe reinforcement learning feedback. We conduct experiments in the complex\ncontrol environment of StarCraft II, an arena typically avoided by current IPE\nmethods. The experimental results demonstrate that FIPE outperforms the baselines\nin terms of interaction performance and consistency, while remaining easy to\nunderstand.\n","authors":["Xiao Liu","Wubing Chen","Mao Tan"],"pdf_url":"https://arxiv.org/pdf/2309.06097v1.pdf","comment":"10 pages, 3 figures, 2 tables"},{"id":"http://arxiv.org/abs/2302.03358v2","updated":"2023-09-12T09:49:37Z","published":"2023-02-07T10:04:52Z","title":"Deep-OSG: Deep Learning of Operators in Semigroup","summary":" This paper proposes a novel deep learning approach for learning operators in\nsemigroup, with applications to modeling unknown autonomous dynamical systems\nusing time series data collected at varied time lags. It is a sequel to the\nprevious flow map learning (FML) works [T. Qin, K. Wu, and D. Xiu, J. Comput.\nPhys., 395:620--635, 2019], [K. Wu and D. Xiu, J. Comput. Phys., 408:109307,\n2020], and [Z. Chen, V. Churchill, K. Wu, and D. Xiu, J. Comput. Phys.,\n449:110782, 2022], which focused on learning a single evolution operator with a\nfixed time step. This paper aims to learn a family of evolution operators with\nvariable time steps, which constitute a semigroup for an autonomous system. The\nsemigroup property is crucial and links the system's evolutionary\nbehaviors across varying time scales, but it was not considered in the previous\nworks. We propose for the first time a framework of embedding the semigroup\nproperty into the data-driven learning process, through a novel neural network\narchitecture and new loss functions. The framework is very feasible, can be\ncombined with any suitable neural networks, and is applicable to learning\ngeneral autonomous ODEs and PDEs. We present rigorous error estimates and\nvariance analysis to understand the prediction accuracy and robustness of our\napproach, showing the remarkable advantages of semigroup awareness in our\nmodel. 
Moreover, our approach allows one to arbitrarily choose the time steps\nfor prediction and ensures that the predicted results are well self-matched and\nconsistent. Extensive numerical experiments demonstrate that embedding the\nsemigroup property notably reduces the data dependency of deep learning models\nand greatly improves the accuracy, robustness, and stability for long-time\nprediction.\n","authors":["Junfeng Chen","Kailiang Wu"],"pdf_url":"https://arxiv.org/pdf/2302.03358v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2211.16691v3","updated":"2023-09-12T09:39:42Z","published":"2022-11-30T02:24:42Z","title":"Computationally Efficient Reinforcement Learning: Targeted Exploration\n leveraging Simple Rules","summary":" Model-free Reinforcement Learning (RL) generally suffers from poor sample\ncomplexity, mostly due to the need to exhaustively explore the state-action\nspace to find well-performing policies. On the other hand, we postulate that\nexpert knowledge of the system often allows us to design simple rules we expect\ngood policies to follow at all times. In this work, we hence propose a simple\nyet effective modification of continuous actor-critic frameworks to incorporate\nsuch rules and avoid regions of the state-action space that are known to be\nsuboptimal, thereby significantly accelerating the convergence of RL agents.\nConcretely, we saturate the actions chosen by the agent if they do not comply\nwith our intuition and, critically, modify the gradient update step of the\npolicy to ensure the learning process is not affected by the saturation step.\nOn a room temperature control case study, it allows agents to converge to\nwell-performing policies up to 6-7x faster than classical agents without\ncomputational overhead and while retaining good final performance.\n","authors":["Loris Di Natale","Bratislav Svetozarevic","Philipp Heer","Colin N. Jones"],"pdf_url":"https://arxiv.org/pdf/2211.16691v3.pdf","comment":"Accepted to CDC 2023"},{"id":"http://arxiv.org/abs/2308.16139v3","updated":"2023-09-12T09:37:47Z","published":"2023-08-30T16:52:20Z","title":"MedShapeNet -- A Large-Scale Dataset of 3D Medical Shapes for Computer\n Vision","summary":" We present MedShapeNet, a large collection of anatomical shapes (e.g., bones,\norgans, vessels) and 3D surgical instrument models. Prior to the deep learning\nera, the broad application of statistical shape models (SSMs) in medical image\nanalysis is evidence that shapes have been commonly used to describe medical\ndata. Nowadays, however, state-of-the-art (SOTA) deep learning algorithms in\nmedical imaging are predominantly voxel-based. In computer vision, on the\ncontrary, shapes (including, voxel occupancy grids, meshes, point clouds and\nimplicit surface models) are preferred data representations in 3D, as seen from\nthe numerous shape-related publications in premier vision conferences, such as\nthe IEEE/CVF Conference on Computer Vision and Pattern Recognition (CVPR), as\nwell as the increasing popularity of ShapeNet (about 51,300 models) and\nPrinceton ModelNet (127,915 models) in computer vision research. MedShapeNet is\ncreated as an alternative to these commonly used shape benchmarks to facilitate\nthe translation of data-driven vision algorithms to medical applications, and\nit extends the opportunities to adapt SOTA vision algorithms to solve critical\nmedical problems. 
Besides, the majority of the medical shapes in MedShapeNet\nare modeled directly on the imaging data of real patients, and therefore it\ncomplements well existing shape benchmarks comprising of computer-aided design\n(CAD) models. MedShapeNet currently includes more than 100,000 medical shapes,\nand provides annotations in the form of paired data. It is therefore also a\nfreely available repository of 3D models for extended reality (virtual reality\n- VR, augmented reality - AR, mixed reality - MR) and medical 3D printing. This\nwhite paper describes in detail the motivations behind MedShapeNet, the shape\nacquisition procedures, the use cases, as well as the usage of the online shape\nsearch portal: https://medshapenet.ikim.nrw/\n","authors":["Jianning Li","Antonio Pepe","Christina Gsaxner","Gijs Luijten","Yuan Jin","Narmada Ambigapathy","Enrico Nasca","Naida Solak","Gian Marco Melito","Viet Duc Vu","Afaque R. Memon","Xiaojun Chen","Jan Stefan Kirschke","Ezequiel de la Rosa","Patrick Ferdinand Christ","Hongwei Bran Li","David G. Ellis","Michele R. Aizenberg","Sergios Gatidis","Thomas Küstner","Nadya Shusharina","Nicholas Heller","Vincent Andrearczyk","Adrien Depeursinge","Mathieu Hatt","Anjany Sekuboyina","Maximilian Löffler","Hans Liebl","Reuben Dorent","Tom Vercauteren","Jonathan Shapey","Aaron Kujawa","Stefan Cornelissen","Patrick Langenhuizen","Achraf Ben-Hamadou","Ahmed Rekik","Sergi Pujades","Edmond Boyer","Federico Bolelli","Costantino Grana","Luca Lumetti","Hamidreza Salehi","Jun Ma","Yao Zhang","Ramtin Gharleghi","Susann Beier","Arcot Sowmya","Eduardo A. Garza-Villarreal","Thania Balducci","Diego Angeles-Valdez","Roberto Souza","Leticia Rittner","Richard Frayne","Yuanfeng Ji","Soumick Chatterjee","Florian Dubost","Stefanie Schreiber","Hendrik Mattern","Oliver Speck","Daniel Haehn","Christoph John","Andreas Nürnberger","João Pedrosa","Carlos Ferreira","Guilherme Aresta","António Cunha","Aurélio Campilho","Yannick Suter","Jose Garcia","Alain Lalande","Emmanuel Audenaert","Claudia Krebs","Timo Van Leeuwen","Evie Vereecke","Rainer Röhrig","Frank Hölzle","Vahid Badeli","Kathrin Krieger","Matthias Gunzer","Jianxu Chen","Amin Dada","Miriam Balzer","Jana Fragemann","Frederic Jonske","Moritz Rempe","Stanislav Malorodov","Fin H. Bahnsen","Constantin Seibold","Alexander Jaus","Ana Sofia Santos","Mariana Lindo","André Ferreira","Victor Alves","Michael Kamp","Amr Abourayya","Felix Nensa","Fabian Hörst","Alexander Brehmer","Lukas Heine","Lars E. Podleska","Matthias A. Fink","Julius Keyl","Konstantinos Tserpes","Moon-Sung Kim","Shireen Elhabian","Hans Lamecker","Dženan Zukić","Beatriz Paniagua","Christian Wachinger","Martin Urschler","Luc Duong","Jakob Wasserthal","Peter F. Hoyer","Oliver Basu","Thomas Maal","Max J. H. Witjes","Ti-chiun Chang","Seyed-Ahmad Ahmadi","Ping Luo","Bjoern Menze","Mauricio Reyes","Christos Davatzikos","Behrus Puladi","Jens Kleesiek","Jan Egger"],"pdf_url":"https://arxiv.org/pdf/2308.16139v3.pdf","comment":"21 pages"},{"id":"http://arxiv.org/abs/2309.06090v1","updated":"2023-09-12T09:37:26Z","published":"2023-09-12T09:37:26Z","title":"A General Verification Framework for Dynamical and Control Models via\n Certificate Synthesis","summary":" An emerging branch of control theory specialises in certificate learning,\nconcerning the specification of a desired (possibly complex) system behaviour\nfor an autonomous or control model, which is then analytically verified by\nmeans of a function-based proof. 
However, the synthesis of controllers abiding\nby these complex requirements is in general a non-trivial task and may elude\nthe most expert control engineers. This results in a need for automatic\ntechniques that are able to design controllers and to analyse a wide range of\nelaborate specifications. In this paper, we provide a general framework to\nencode system specifications and define corresponding certificates, and we\npresent an automated approach to formally synthesise controllers and\ncertificates. Our approach contributes to the broad field of safe learning for\ncontrol, exploiting the flexibility of neural networks to provide candidate\ncontrol and certificate functions, whilst using SMT-solvers to offer a formal\nguarantee of correctness. We test our framework by developing a prototype\nsoftware tool, and assess its efficacy at verification via control and\ncertificate synthesis over a large and varied suite of benchmarks.\n","authors":["Alec Edwards","Andrea Peruffo","Alessandro Abate"],"pdf_url":"https://arxiv.org/pdf/2309.06090v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.06089v1","updated":"2023-09-12T09:37:08Z","published":"2023-09-12T09:37:08Z","title":"Measuring Catastrophic Forgetting in Cross-Lingual Transfer Paradigms:\n Exploring Tuning Strategies","summary":" The cross-lingual transfer is a promising technique to solve tasks in\nless-resourced languages. In this empirical study, we compare two fine-tuning\napproaches combined with zero-shot and full-shot learning approaches for large\nlanguage models in a cross-lingual setting. As fine-tuning strategies, we\ncompare parameter-efficient adapter methods with fine-tuning of all parameters.\nAs cross-lingual transfer strategies, we compare the intermediate-training\n(\\textit{IT}) that uses each language sequentially and cross-lingual validation\n(\\textit{CLV}) that uses a target language already in the validation phase of\nfine-tuning. We assess the success of transfer and the extent of catastrophic\nforgetting in a source language due to cross-lingual transfer, i.e., how much\npreviously acquired knowledge is lost when we learn new information in a\ndifferent language. The results on two different classification problems, hate\nspeech detection and product reviews, each containing datasets in several\nlanguages, show that the \\textit{IT} cross-lingual strategy outperforms\n\\textit{CLV} for the target language. Our findings indicate that, in the\nmajority of cases, the \\textit{CLV} strategy demonstrates superior retention of\nknowledge in the base language (English) compared to the \\textit{IT} strategy,\nwhen evaluating catastrophic forgetting in multiple cross-lingual transfers.\n","authors":["Boshko Koloski","Blaž Škrlj","Marko Robnik-Šikonja","Senja Pollak"],"pdf_url":"https://arxiv.org/pdf/2309.06089v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.06086v1","updated":"2023-09-12T09:31:34Z","published":"2023-09-12T09:31:34Z","title":"Plasticity-Optimized Complementary Networks for Unsupervised Continual\n Learning","summary":" Continuous unsupervised representation learning (CURL) research has greatly\nbenefited from improvements in self-supervised learning (SSL) techniques. As a\nresult, existing CURL methods using SSL can learn high-quality representations\nwithout any labels, but with a notable performance drop when learning on a\nmany-tasks data stream. 
We hypothesize that this is caused by the\nregularization losses that are imposed to prevent forgetting, leading to a\nsuboptimal plasticity-stability trade-off: they either do not adapt fully to\nthe incoming data (low plasticity), or incur significant forgetting when\nallowed to fully adapt to a new SSL pretext-task (low stability). In this work,\nwe propose to train an expert network that is relieved of the duty of keeping\nthe previous knowledge and can focus on performing optimally on the new tasks\n(optimizing plasticity). In the second phase, we combine this new knowledge\nwith the previous network in an adaptation-retrospection phase to avoid\nforgetting and initialize a new expert with the knowledge of the old network.\nWe perform several experiments showing that our proposed approach outperforms\nother CURL exemplar-free methods in few- and many-task split settings.\nFurthermore, we show how to adapt our approach to semi-supervised continual\nlearning (Semi-SCL) and show that we surpass the accuracy of other\nexemplar-free Semi-SCL methods and reach the results of some others that use\nexemplars.\n","authors":["Alex Gomez-Villa","Bartlomiej Twardowski","Kai Wang","Joost van de Weijer"],"pdf_url":"https://arxiv.org/pdf/2309.06086v1.pdf","comment":"Accepted at WACV2024"},{"id":"http://arxiv.org/abs/2309.06082v1","updated":"2023-09-12T09:24:21Z","published":"2023-09-12T09:24:21Z","title":"A Machine Learning Framework to Deconstruct the Primary Drivers for\n Electricity Market Price Events","summary":" Power grids are moving towards 100% renewable energy source bulk power grids,\nand the overall dynamics of power system operations and electricity markets are\nchanging. The electricity markets are not only dispatching resources\neconomically but also taking into account various controllable actions like\nrenewable curtailment, transmission congestion mitigation, and energy storage\noptimization to ensure grid reliability. As a result, price formations in\nelectricity markets have become quite complex. Traditional root cause analysis\nand statistical approaches are rendered inapplicable to analyze and infer the\nmain drivers behind price formation in the modern grid and markets with\nvariable renewable energy (VRE). In this paper, we propose a machine\nlearning-based analysis framework to deconstruct the primary drivers for price\nspike events in modern electricity markets with high renewable energy. The\noutcomes can be utilized for various critical aspects of market design,\nrenewable dispatch and curtailment, operations, and cyber-security\napplications. The framework can be applied to any ISO or market data; however,\nin this paper, it is applied to open-source publicly available datasets from\nCalifornia Independent System Operator (CAISO) and ISO New England (ISO-NE).\n","authors":["Milan Jain","Xueqing Sun","Sohom Datta","Abhishek Somani"],"pdf_url":"https://arxiv.org/pdf/2309.06082v1.pdf","comment":"Published in IEEE PES GM 2023"},{"id":"http://arxiv.org/abs/2212.10229v4","updated":"2023-09-12T09:23:39Z","published":"2022-12-20T13:07:20Z","title":"StyleDomain: Efficient and Lightweight Parameterizations of StyleGAN for\n One-shot and Few-shot Domain Adaptation","summary":" Domain adaptation of GANs is a problem of fine-tuning GAN models pretrained\non a large dataset (e.g. StyleGAN) to a specific domain with few samples (e.g.\npainting faces, sketches, etc.). 
While there are many methods that tackle this\nproblem in different ways, there are still many important questions that remain\nunanswered. In this paper, we provide a systematic and in-depth analysis of the\ndomain adaptation problem of GANs, focusing on the StyleGAN model. We perform a\ndetailed exploration of the most important parts of StyleGAN that are\nresponsible for adapting the generator to a new domain depending on the\nsimilarity between the source and target domains. As a result of this study, we\npropose new efficient and lightweight parameterizations of StyleGAN for domain\nadaptation. Particularly, we show that there exist directions in StyleSpace\n(StyleDomain directions) that are sufficient for adapting to similar domains.\nFor dissimilar domains, we propose Affine+ and AffineLight+ parameterizations\nthat allows us to outperform existing baselines in few-shot adaptation while\nhaving significantly less training parameters. Finally, we examine StyleDomain\ndirections and discover their many surprising properties that we apply for\ndomain mixing and cross-domain image morphing. Source code can be found at\nhttps://github.com/AIRI-Institute/StyleDomain.\n","authors":["Aibek Alanov","Vadim Titov","Maksim Nakhodnov","Dmitry Vetrov"],"pdf_url":"https://arxiv.org/pdf/2212.10229v4.pdf","comment":"Accepted to ICCV 2023"},{"id":"http://arxiv.org/abs/2308.13991v2","updated":"2023-09-12T09:22:04Z","published":"2023-08-27T02:59:59Z","title":"JL-lemma derived Optimal Projections for Discriminative Dictionary\n Learning","summary":" To overcome difficulties in classifying large dimensionality data with a\nlarge number of classes, we propose a novel approach called JLSPCADL. This\npaper uses the Johnson-Lindenstrauss (JL) Lemma to select the dimensionality of\na transformed space in which a discriminative dictionary can be learned for\nsignal classification. Rather than reducing dimensionality via random\nprojections, as is often done with JL, we use a projection transformation\nmatrix derived from Modified Supervised PC Analysis (M-SPCA) with the\nJL-prescribed dimension.\n JLSPCADL provides a heuristic to deduce suitable distortion levels and the\ncorresponding Suitable Description Length (SDL) of dictionary atoms to derive\nan optimal feature space and thus the SDL of dictionary atoms for better\nclassification. Unlike state-of-the-art dimensionality reduction-based\ndictionary learning methods, a projection transformation matrix derived in a\nsingle step from M-SPCA provides maximum feature-label consistency of the\ntransformed space while preserving the cluster structure of the original data.\nDespite confusing pairs, the dictionary for the transformed space generates\ndiscriminative sparse coefficients, with fewer training samples.\nExperimentation demonstrates that JLSPCADL scales well with an increasing\nnumber of classes and dimensionality. Improved label consistency of features\ndue to M-SPCA helps to classify better. Further, the complexity of training a\ndiscriminative dictionary is significantly reduced by using SDL.\nExperimentation on OCR and face recognition datasets shows relatively better\nclassification performance than other supervised dictionary learning\nalgorithms.\n","authors":["G. Madhuri","Atul Negi","Kaluri V. 
Rangarao"],"pdf_url":"https://arxiv.org/pdf/2308.13991v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.06081v1","updated":"2023-09-12T09:18:12Z","published":"2023-09-12T09:18:12Z","title":"Information Flow in Graph Neural Networks: A Clinical Triage Use Case","summary":" Graph Neural Networks (GNNs) have gained popularity in healthcare and other\ndomains due to their ability to process multi-modal and multi-relational\ngraphs. However, efficient training of GNNs remains challenging, with several\nopen research questions. In this paper, we investigate how the flow of\nembedding information within GNNs affects the prediction of links in Knowledge\nGraphs (KGs). Specifically, we propose a mathematical model that decouples the\nGNN connectivity from the connectivity of the graph data and evaluate the\nperformance of GNNs in a clinical triage use case. Our results demonstrate that\nincorporating domain knowledge into the GNN connectivity leads to better\nperformance than using the same connectivity as the KG or allowing\nunconstrained embedding propagation. Moreover, we show that negative edges play\na crucial role in achieving good predictions, and that using too many GNN\nlayers can degrade performance.\n","authors":["Víctor Valls","Mykhaylo Zayats","Alessandra Pascale"],"pdf_url":"https://arxiv.org/pdf/2309.06081v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2212.08568v2","updated":"2023-09-12T09:16:46Z","published":"2022-12-16T16:44:46Z","title":"Biomedical image analysis competitions: The state of current\n participation practice","summary":" The number of international benchmarking competitions is steadily increasing\nin various fields of machine learning (ML) research and practice. So far,\nhowever, little is known about the common practice as well as bottlenecks faced\nby the community in tackling the research questions posed. To shed light on the\nstatus quo of algorithm development in the specific field of biomedical imaging\nanalysis, we designed an international survey that was issued to all\nparticipants of challenges conducted in conjunction with the IEEE ISBI 2021 and\nMICCAI 2021 conferences (80 competitions in total). The survey covered\nparticipants' expertise and working environments, their chosen strategies, as\nwell as algorithm characteristics. A median of 72% challenge participants took\npart in the survey. According to our results, knowledge exchange was the\nprimary incentive (70%) for participation, while the reception of prize money\nplayed only a minor role (16%). While a median of 80 working hours was spent on\nmethod development, a large portion of participants stated that they did not\nhave enough time for method development (32%). 25% perceived the infrastructure\nto be a bottleneck. Overall, 94% of all solutions were deep learning-based. Of\nthese, 84% were based on standard architectures. 43% of the respondents\nreported that the data samples (e.g., images) were too large to be processed at\nonce. This was most commonly addressed by patch-based training (69%),\ndownsampling (37%), and solving 3D analysis tasks as a series of 2D tasks.\nK-fold cross-validation on the training set was performed by only 37% of the\nparticipants and only 50% of the participants performed ensembling based on\nmultiple identical models (61%) or heterogeneous models (39%). 48% of the\nrespondents applied postprocessing steps.\n","authors":["Matthias Eisenmann","Annika Reinke","Vivienn Weru","Minu Dietlinde Tizabi","Fabian Isensee","Tim J. 
Adler","Patrick Godau","Veronika Cheplygina","Michal Kozubek","Sharib Ali","Anubha Gupta","Jan Kybic","Alison Noble","Carlos Ortiz de Solórzano","Samiksha Pachade","Caroline Petitjean","Daniel Sage","Donglai Wei","Elizabeth Wilden","Deepak Alapatt","Vincent Andrearczyk","Ujjwal Baid","Spyridon Bakas","Niranjan Balu","Sophia Bano","Vivek Singh Bawa","Jorge Bernal","Sebastian Bodenstedt","Alessandro Casella","Jinwook Choi","Olivier Commowick","Marie Daum","Adrien Depeursinge","Reuben Dorent","Jan Egger","Hannah Eichhorn","Sandy Engelhardt","Melanie Ganz","Gabriel Girard","Lasse Hansen","Mattias Heinrich","Nicholas Heller","Alessa Hering","Arnaud Huaulmé","Hyunjeong Kim","Bennett Landman","Hongwei Bran Li","Jianning Li","Jun Ma","Anne Martel","Carlos Martín-Isla","Bjoern Menze","Chinedu Innocent Nwoye","Valentin Oreiller","Nicolas Padoy","Sarthak Pati","Kelly Payette","Carole Sudre","Kimberlin van Wijnen","Armine Vardazaryan","Tom Vercauteren","Martin Wagner","Chuanbo Wang","Moi Hoon Yap","Zeyun Yu","Chun Yuan","Maximilian Zenk","Aneeq Zia","David Zimmerer","Rina Bao","Chanyeol Choi","Andrew Cohen","Oleh Dzyubachyk","Adrian Galdran","Tianyuan Gan","Tianqi Guo","Pradyumna Gupta","Mahmood Haithami","Edward Ho","Ikbeom Jang","Zhili Li","Zhengbo Luo","Filip Lux","Sokratis Makrogiannis","Dominik Müller","Young-tack Oh","Subeen Pang","Constantin Pape","Gorkem Polat","Charlotte Rosalie Reed","Kanghyun Ryu","Tim Scherr","Vajira Thambawita","Haoyu Wang","Xinliang Wang","Kele Xu","Hung Yeh","Doyeob Yeo","Yixuan Yuan","Yan Zeng","Xin Zhao","Julian Abbing","Jannes Adam","Nagesh Adluru","Niklas Agethen","Salman Ahmed","Yasmina Al Khalil","Mireia Alenyà","Esa Alhoniemi","Chengyang An","Talha Anwar","Tewodros Weldebirhan Arega","Netanell Avisdris","Dogu Baran Aydogan","Yingbin Bai","Maria Baldeon Calisto","Berke Doga Basaran","Marcel Beetz","Cheng Bian","Hao Bian","Kevin Blansit","Louise Bloch","Robert Bohnsack","Sara Bosticardo","Jack Breen","Mikael Brudfors","Raphael Brüngel","Mariano Cabezas","Alberto Cacciola","Zhiwei Chen","Yucong Chen","Daniel Tianming Chen","Minjeong Cho","Min-Kook Choi","Chuantao Xie Chuantao Xie","Dana Cobzas","Julien Cohen-Adad","Jorge Corral Acero","Sujit Kumar Das","Marcela de Oliveira","Hanqiu Deng","Guiming Dong","Lars Doorenbos","Cory Efird","Sergio Escalera","Di Fan","Mehdi Fatan Serj","Alexandre Fenneteau","Lucas Fidon","Patryk Filipiak","René Finzel","Nuno R. Freitas","Christoph M. Friedrich","Mitchell Fulton","Finn Gaida","Francesco Galati","Christoforos Galazis","Chang Hee Gan","Zheyao Gao","Shengbo Gao","Matej Gazda","Beerend Gerats","Neil Getty","Adam Gibicar","Ryan Gifford","Sajan Gohil","Maria Grammatikopoulou","Daniel Grzech","Orhun Güley","Timo Günnemann","Chunxu Guo","Sylvain Guy","Heonjin Ha","Luyi Han","Il Song Han","Ali Hatamizadeh","Tian He","Jimin Heo","Sebastian Hitziger","SeulGi Hong","SeungBum Hong","Rian Huang","Ziyan Huang","Markus Huellebrand","Stephan Huschauer","Mustaffa Hussain","Tomoo Inubushi","Ece Isik Polat","Mojtaba Jafaritadi","SeongHun Jeong","Bailiang Jian","Yuanhong Jiang","Zhifan Jiang","Yueming Jin","Smriti Joshi","Abdolrahim Kadkhodamohammadi","Reda Abdellah Kamraoui","Inha Kang","Junghwa Kang","Davood Karimi","April Khademi","Muhammad Irfan Khan","Suleiman A. 
Khan","Rishab Khantwal","Kwang-Ju Kim","Timothy Kline","Satoshi Kondo","Elina Kontio","Adrian Krenzer","Artem Kroviakov","Hugo Kuijf","Satyadwyoom Kumar","Francesco La Rosa","Abhi Lad","Doohee Lee","Minho Lee","Chiara Lena","Hao Li","Ling Li","Xingyu Li","Fuyuan Liao","KuanLun Liao","Arlindo Limede Oliveira","Chaonan Lin","Shan Lin","Akis Linardos","Marius George Linguraru","Han Liu","Tao Liu","Di Liu","Yanling Liu","João Lourenço-Silva","Jingpei Lu","Jiangshan Lu","Imanol Luengo","Christina B. Lund","Huan Minh Luu","Yi Lv","Yi Lv","Uzay Macar","Leon Maechler","Sina Mansour L.","Kenji Marshall","Moona Mazher","Richard McKinley","Alfonso Medela","Felix Meissen","Mingyuan Meng","Dylan Miller","Seyed Hossein Mirjahanmardi","Arnab Mishra","Samir Mitha","Hassan Mohy-ud-Din","Tony Chi Wing Mok","Gowtham Krishnan Murugesan","Enamundram Naga Karthik","Sahil Nalawade","Jakub Nalepa","Mohamed Naser","Ramin Nateghi","Hammad Naveed","Quang-Minh Nguyen","Cuong Nguyen Quoc","Brennan Nichyporuk","Bruno Oliveira","David Owen","Jimut Bahan Pal","Junwen Pan","Wentao Pan","Winnie Pang","Bogyu Park","Vivek Pawar","Kamlesh Pawar","Michael Peven","Lena Philipp","Tomasz Pieciak","Szymon Plotka","Marcel Plutat","Fattaneh Pourakpour","Domen Preložnik","Kumaradevan Punithakumar","Abdul Qayyum","Sandro Queirós","Arman Rahmim","Salar Razavi","Jintao Ren","Mina Rezaei","Jonathan Adam Rico","ZunHyan Rieu","Markus Rink","Johannes Roth","Yusely Ruiz-Gonzalez","Numan Saeed","Anindo Saha","Mostafa Salem","Ricardo Sanchez-Matilla","Kurt Schilling","Wei Shao","Zhiqiang Shen","Ruize Shi","Pengcheng Shi","Daniel Sobotka","Théodore Soulier","Bella Specktor Fadida","Danail Stoyanov","Timothy Sum Hon Mun","Xiaowu Sun","Rong Tao","Franz Thaler","Antoine Théberge","Felix Thielke","Helena Torres","Kareem A. Wahid","Jiacheng Wang","YiFei Wang","Wei Wang","Xiong Wang","Jianhui Wen","Ning Wen","Marek Wodzinski","Ye Wu","Fangfang Xia","Tianqi Xiang","Chen Xiaofei","Lizhan Xu","Tingting Xue","Yuxuan Yang","Lin Yang","Kai Yao","Huifeng Yao","Amirsaeed Yazdani","Michael Yip","Hwanseung Yoo","Fereshteh Yousefirizi","Shunkai Yu","Lei Yu","Jonathan Zamora","Ramy Ashraf Zeineldin","Dewen Zeng","Jianpeng Zhang","Bokai Zhang","Jiapeng Zhang","Fan Zhang","Huahong Zhang","Zhongchen Zhao","Zixuan Zhao","Jiachen Zhao","Can Zhao","Qingshuo Zheng","Yuheng Zhi","Ziqi Zhou","Baosheng Zou","Klaus Maier-Hein","Paul F. Jäger","Annette Kopp-Schneider","Lena Maier-Hein"],"pdf_url":"https://arxiv.org/pdf/2212.08568v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.06075v1","updated":"2023-09-12T09:12:37Z","published":"2023-09-12T09:12:37Z","title":"A2V: A Semi-Supervised Domain Adaptation Framework for Brain Vessel\n Segmentation via Two-Phase Training Angiography-to-Venography Translation","summary":" We present a semi-supervised domain adaptation framework for brain vessel\nsegmentation from different image modalities. Existing state-of-the-art methods\nfocus on a single modality, despite the wide range of available cerebrovascular\nimaging techniques. This can lead to significant distribution shifts that\nnegatively impact the generalization across modalities. By relying on annotated\nangiographies and a limited number of annotated venographies, our framework\naccomplishes image-to-image translation and semantic segmentation, leveraging a\ndisentangled and semantically rich latent space to represent heterogeneous data\nand perform image-level adaptation from source to target domains. 
Moreover, we\nreduce the typical complexity of cycle-based architectures and minimize the use\nof adversarial training, which allows us to build an efficient and intuitive\nmodel with stable training. We evaluate our method on magnetic resonance\nangiographies and venographies. While achieving state-of-the-art performance in\nthe source domain, our method attains a Dice score coefficient in the target\ndomain that is only 8.9% lower, highlighting its promising potential for robust\ncerebrovascular image segmentation across different modalities.\n","authors":["Francesco Galati","Daniele Falcetta","Rosa Cortese","Barbara Casolla","Ferran Prados","Ninon Burgos","Maria A. Zuluaga"],"pdf_url":"https://arxiv.org/pdf/2309.06075v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.06062v1","updated":"2023-09-12T09:00:17Z","published":"2023-09-12T09:00:17Z","title":"Selection of contributing factors for predicting landslide\n susceptibility using machine learning and deep learning models","summary":" Landslides are a common natural disaster that can cause casualties, property\nsafety threats and economic losses. Therefore, it is important to understand or\npredict the probability of landslide occurrence at potentially risky sites. A\ncommonly used means is to carry out a landslide susceptibility assessment based\non a landslide inventory and a set of landslide contributing factors. This can\nbe readily achieved using machine learning (ML) models such as logistic\nregression (LR), support vector machine (SVM), random forest (RF), extreme\ngradient boosting (Xgboost), or deep learning (DL) models such as convolutional\nneural network (CNN) and long short time memory (LSTM). As the input data for\nthese models, landslide contributing factors have varying influences on\nlandslide occurrence. Therefore, it is logically feasible to select more\nimportant contributing factors and eliminate less relevant ones, with the aim\nof increasing the prediction accuracy of these models. However, selecting more\nimportant factors is still a challenging task and there is no generally\naccepted method. Furthermore, the effects of factor selection using various\nmethods on the prediction accuracy of ML and DL models are unclear. In this\nstudy, the impact of the selection of contributing factors on the accuracy of\nlandslide susceptibility predictions using ML and DL models was investigated.\nFour methods for selecting contributing factors were considered for all the\naforementioned ML and DL models, which included Information Gain Ratio (IGR),\nRecursive Feature Elimination (RFE), Particle Swarm Optimization (PSO), Least\nAbsolute Shrinkage and Selection Operators (LASSO) and Harris Hawk Optimization\n(HHO). In addition, autoencoder-based factor selection methods for DL models\nwere also investigated. To assess their performances, an exhaustive approach\nwas adopted,...\n","authors":["Cheng Chen","Lei Fan"],"pdf_url":"https://arxiv.org/pdf/2309.06062v1.pdf","comment":"Stochastic Environmental Research and Risk Assessment"},{"id":"http://arxiv.org/abs/2309.06061v1","updated":"2023-09-12T09:00:03Z","published":"2023-09-12T09:00:03Z","title":"Verifiable Fairness: Privacy-preserving Computation of Fairness for\n Machine Learning Systems","summary":" Fair machine learning is a thriving and vibrant research topic. In this\npaper, we propose Fairness as a Service (FaaS), a secure, verifiable and\nprivacy-preserving protocol to computes and verify the fairness of any machine\nlearning (ML) model. 
In the design of FaaS, the data and outcomes are\nrepresented through cryptograms to ensure privacy. Also, zero-knowledge proofs\nguarantee the well-formedness of the cryptograms and underlying data. FaaS is\nmodel-agnostic and can support various fairness metrics; hence, it can be used\nas a service to audit the fairness of any ML model. Our solution requires no\ntrusted third party or private channels for the computation of the fairness\nmetric. The security guarantees and commitments are implemented in a way that\nevery step is securely transparent and verifiable from the start to the end of\nthe process. The cryptograms of all input data are publicly available for\neveryone, e.g., auditors, social activists and experts, to verify the\ncorrectness of the process. We implemented FaaS to investigate performance and\ndemonstrate the successful use of FaaS for a publicly available data set with\nthousands of entries.\n","authors":["Ehsan Toreini","Maryam Mehrnezhad","Aad van Moorsel"],"pdf_url":"https://arxiv.org/pdf/2309.06061v1.pdf","comment":"accepted in International Workshop on Private, Secure, and\n Trustworthy AI (PriST-AI), ESORICS'23 workshop"},{"id":"http://arxiv.org/abs/2309.06054v1","updated":"2023-09-12T08:45:25Z","published":"2023-09-12T08:45:25Z","title":"How does representation impact in-context learning: A exploration on a\n synthetic task","summary":" In-context learning, i.e., learning from in-context samples, is an impressive\nability of Transformers. However, the mechanism driving in-context learning\nis not yet fully understood. In this study, we investigate it from the\nunderexplored perspective of representation learning. The representation is\nmore complex in the in-context learning scenario, where the representation can be\nimpacted by both model weights and in-context samples. We refer to these two\nconceptual aspects of representation as the in-weight component and the in-context\ncomponent, respectively. To study how the two components affect in-context\nlearning capabilities, we construct a novel synthetic task, making it possible\nto devise two probes, an in-weights probe and an in-context probe, to evaluate the\ntwo components, respectively. We demonstrate that the goodness of the in-context\ncomponent is highly related to in-context learning performance, which\nindicates the entanglement between in-context learning and representation\nlearning. Furthermore, we find that a good in-weights component can actually\nbenefit the learning of the in-context component, indicating that in-weights\nlearning should be the foundation of in-context learning. To further understand\nthe in-context learning mechanism and the importance of the in-weights\ncomponent, we prove by construction that a simple Transformer, which uses\npattern matching and a copy-paste mechanism to perform in-context learning, can\nmatch the in-context learning performance of a more complex, best-tuned\nTransformer under the perfect in-weights component assumption. 
In short, these\ndiscoveries from the representation learning perspective shed light on new\napproaches to improve the in-context capacity.\n","authors":["Jingwen Fu","Tao Yang","Yuwang Wang","Yan Lu","Nanning Zheng"],"pdf_url":"https://arxiv.org/pdf/2309.06054v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.06049v1","updated":"2023-09-12T08:35:24Z","published":"2023-09-12T08:35:24Z","title":"A Perceptron-based Fine Approximation Technique for Linear Separation","summary":" This paper presents a novel online learning method that aims at finding a\nseparator hyperplane between data points labelled as either positive or\nnegative. Since the weights and biases of artificial neurons can be directly\nrelated to hyperplanes in high-dimensional spaces, the technique is applicable\nto training perceptron-based binary classifiers in machine learning. In the case of\nlarge or imbalanced data sets, the use of analytical or gradient-based solutions\ncan become prohibitive and impractical, whereas heuristics and approximation\ntechniques remain applicable. The proposed method is based on the Perceptron\nalgorithm; however, it tunes neuron weights only to the extent necessary while\nsearching for the separator hyperplane. Due to an appropriate transformation of the\ninitial data set, we need to consider neither the data labels nor the bias term,\nreducing separability to a one-class classification problem. The\npresented method has proven convergence; empirical results show that it can be\nmore efficient than the Perceptron algorithm, especially when the size of the\ndata set exceeds the data dimensionality.\n","authors":["Ákos Hajnal"],"pdf_url":"https://arxiv.org/pdf/2309.06049v1.pdf","comment":"12 pages, 5 figures"},{"id":"http://arxiv.org/abs/2211.12875v4","updated":"2023-09-12T08:34:18Z","published":"2022-11-23T11:31:11Z","title":"A Survey of Deep Graph Clustering: Taxonomy, Challenge, Application, and\n Open Resource","summary":" Graph clustering, which aims to divide the nodes of a graph into several\ndistinct clusters, is a fundamental yet challenging task. Benefiting from the\npowerful representation capability of deep learning, deep graph clustering\nmethods have achieved great success in recent years. However, corresponding\nsurvey papers are relatively scarce, and a summary of this\nfield is overdue. Motivated by this, we conduct a comprehensive survey of deep graph\nclustering. Firstly, we introduce the formal definition, evaluation, and\ndevelopment of this field. Secondly, the taxonomy of deep graph clustering\nmethods is presented based on four different criteria, including graph type,\nnetwork architecture, learning paradigm, and clustering method. Thirdly, we\ncarefully analyze the existing methods via extensive experiments and summarize\nthe challenges and opportunities from five perspectives, including graph data\nquality, stability, scalability, discriminative capability, and unknown cluster\nnumber. Besides, the applications of deep graph clustering methods in six\ndomains, including computer vision, natural language processing, recommendation\nsystems, social network analysis, bioinformatics, and medical science, are\npresented. 
Last but not least, this paper provides open resource supports,\nincluding 1) a collection\n(\\url{https://github.com/yueliu1999/Awesome-Deep-Graph-Clustering}) of\nstate-of-the-art deep graph clustering methods (papers, codes, and datasets)\nand 2) a unified framework\n(\\url{https://github.com/Marigoldwu/A-Unified-Framework-for-Deep-Attribute-Graph-Clustering})\nof deep graph clustering. We hope this work can serve as a quick guide and help\nresearchers overcome challenges in this vibrant field.\n","authors":["Yue Liu","Jun Xia","Sihang Zhou","Xihong Yang","Ke Liang","Chenchen Fan","Yan Zhuang","Stan Z. Li","Xinwang Liu","Kunlun He"],"pdf_url":"https://arxiv.org/pdf/2211.12875v4.pdf","comment":"20 pages, 14 figures"},{"id":"http://arxiv.org/abs/2309.06046v1","updated":"2023-09-12T08:30:35Z","published":"2023-09-12T08:30:35Z","title":"BatMan-CLR: Making Few-shots Meta-Learners Resilient Against Label Noise","summary":" The negative impact of label noise is well studied in classical supervised\nlearning yet remains an open research question in meta-learning. Meta-learners\naim to adapt to unseen learning tasks by learning a good initial model in\nmeta-training and consecutively fine-tuning it according to new tasks during\nmeta-testing. In this paper, we present the first extensive analysis of the\nimpact of varying levels of label noise on the performance of state-of-the-art\nmeta-learners, specifically gradient-based $N$-way $K$-shot learners. We show\nthat the accuracy of Reptile, iMAML, and foMAML drops by up to 42% on the\nOmniglot and CifarFS datasets when meta-training is affected by label noise. To\nstrengthen the resilience against label noise, we propose two sampling\ntechniques, namely manifold (Man) and batch manifold (BatMan), which transform\nthe noisy supervised learners into semi-supervised ones to increase the utility\nof noisy labels. We first construct manifold samples of $N$-way\n$2$-contrastive-shot tasks through augmentation, learning the embedding via a\ncontrastive loss in meta-training, and then perform classification through\nzeroing on the embedding in meta-testing. We show that our approach can\neffectively mitigate the impact of meta-training label noise. Even with 60%\nwrong labels \\batman and \\man can limit the meta-testing accuracy drop to\n${2.5}$, ${9.4}$, ${1.1}$ percent points, respectively, with existing\nmeta-learners across the Omniglot, CifarFS, and MiniImagenet datasets.\n","authors":["Jeroen M. Galjaard","Robert Birke","Juan Perez","Lydia Y. Chen"],"pdf_url":"https://arxiv.org/pdf/2309.06046v1.pdf","comment":"10 pages,3 figures"},{"id":"http://arxiv.org/abs/2105.08511v2","updated":"2023-09-12T08:17:11Z","published":"2021-05-14T15:21:13Z","title":"Privacy-Preserving Constrained Domain Generalization for Medical Image\n Classification","summary":" Deep neural networks (DNN) have demonstrated unprecedented success for\nmedical imaging applications. However, due to the issue of limited dataset\navailability and the strict legal and ethical requirements for patient privacy\nprotection, the broad applications of medical imaging classification driven by\nDNN with large-scale training data have been largely hindered. For example,\nwhen training the DNN from one domain (e.g., with data only from one hospital),\nthe generalization capability to another domain (e.g., data from another\nhospital) could be largely lacking. 
In this paper, we aim to tackle this\nproblem by developing the privacy-preserving constrained domain generalization\nmethod, aiming to improve the generalization capability under the\nprivacy-preserving condition. In particular, We propose to improve the\ninformation aggregation process on the centralized server-side with a novel\ngradient alignment loss, expecting that the trained model can be better\ngeneralized to the \"unseen\" but related medical images. The rationale and\neffectiveness of our proposed method can be explained by connecting our\nproposed method with the Maximum Mean Discrepancy (MMD) which has been widely\nadopted as the distribution distance measurement. Experimental results on two\nchallenging medical imaging classification tasks indicate that our method can\nachieve better cross-domain generalization capability compared to the\nstate-of-the-art federated learning methods.\n","authors":["Chris Xing Tian","Haoliang Li","Yufei Wang","Shiqi Wang"],"pdf_url":"https://arxiv.org/pdf/2105.08511v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.06034v1","updated":"2023-09-12T08:06:04Z","published":"2023-09-12T08:06:04Z","title":"Normality Learning-based Graph Anomaly Detection via Multi-Scale\n Contrastive Learning","summary":" Graph anomaly detection (GAD) has attracted increasing attention in machine\nlearning and data mining. Recent works have mainly focused on how to capture\nricher information to improve the quality of node embeddings for GAD. Despite\ntheir significant advances in detection performance, there is still a relative\ndearth of research on the properties of the task. GAD aims to discern the\nanomalies that deviate from most nodes. However, the model is prone to learn\nthe pattern of normal samples which make up the majority of samples. Meanwhile,\nanomalies can be easily detected when their behaviors differ from normality.\nTherefore, the performance can be further improved by enhancing the ability to\nlearn the normal pattern. To this end, we propose a normality learning-based\nGAD framework via multi-scale contrastive learning networks (NLGAD for\nabbreviation). Specifically, we first initialize the model with the contrastive\nnetworks on different scales. To provide sufficient and reliable normal nodes\nfor normality learning, we design an effective hybrid strategy for normality\nselection. Finally, the model is refined with the only input of reliable normal\nnodes and learns a more accurate estimate of normality so that anomalous nodes\ncan be more easily distinguished. Eventually, extensive experiments on six\nbenchmark graph datasets demonstrate the effectiveness of our normality\nlearning-based scheme on GAD. Notably, the proposed algorithm improves the\ndetection performance (up to 5.89% AUC gain) compared with the state-of-the-art\nmethods. The source code is released at https://github.com/FelixDJC/NLGAD.\n","authors":["Jingcan Duan","Pei Zhang","Siwei Wang","Jingtao Hu","Hu Jin","Jiaxin Zhang","Haifang Zhou","Haifang Zhou"],"pdf_url":"https://arxiv.org/pdf/2309.06034v1.pdf","comment":"10 pages, 7 figures, accepted by ACM MM 2023"},{"id":"http://arxiv.org/abs/2309.06033v1","updated":"2023-09-12T08:05:39Z","published":"2023-09-12T08:05:39Z","title":"Energy-Aware Federated Learning with Distributed User Sampling and\n Multichannel ALOHA","summary":" Distributed learning on edge devices has attracted increased attention with\nthe advent of federated learning (FL). 
Notably, edge devices often have limited\nbattery and heterogeneous energy availability, while multiple rounds are\nrequired in FL for convergence, intensifying the need for energy efficiency.\nEnergy depletion may hinder the training process and the efficient utilization\nof the trained model. To solve these problems, this letter considers the\nintegration of energy harvesting (EH) devices into a FL network with\nmulti-channel ALOHA, while proposing a method to ensure both low energy outage\nprobability and successful execution of future tasks. Numerical results\ndemonstrate the effectiveness of this method, particularly in critical setups\nwhere the average energy income fails to cover the iteration cost. The method\noutperforms a norm based solution in terms of convergence time and battery\nlevel.\n","authors":["Rafael Valente da Silva","Onel L. Alcaraz López","Richard Demo Souza"],"pdf_url":"https://arxiv.org/pdf/2309.06033v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.06021v1","updated":"2023-09-12T07:40:53Z","published":"2023-09-12T07:40:53Z","title":"Emergent Communication in Multi-Agent Reinforcement Learning for Future\n Wireless Networks","summary":" In different wireless network scenarios, multiple network entities need to\ncooperate in order to achieve a common task with minimum delay and energy\nconsumption. Future wireless networks mandate exchanging high dimensional data\nin dynamic and uncertain environments, therefore implementing communication\ncontrol tasks becomes challenging and highly complex. Multi-agent reinforcement\nlearning with emergent communication (EC-MARL) is a promising solution to\naddress high dimensional continuous control problems with partially observable\nstates in a cooperative fashion where agents build an emergent communication\nprotocol to solve complex tasks. This paper articulates the importance of\nEC-MARL within the context of future 6G wireless networks, which imbues\nautonomous decision-making capabilities into network entities to solve complex\ntasks such as autonomous driving, robot navigation, flying base stations\nnetwork planning, and smart city applications. An overview of EC-MARL\nalgorithms and their design criteria are provided while presenting use cases\nand research opportunities on this emerging topic.\n","authors":["Marwa Chafii","Salmane Naoumi","Reda Alami","Ebtesam Almazrouei","Mehdi Bennis","Merouane Debbah"],"pdf_url":"https://arxiv.org/pdf/2309.06021v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.06015v1","updated":"2023-09-12T07:29:47Z","published":"2023-09-12T07:29:47Z","title":"Interpolation, Approximation and Controllability of Deep Neural Networks","summary":" We investigate the expressive power of deep residual neural networks\nidealized as continuous dynamical systems through control theory. Specifically,\nwe consider two properties that arise from supervised learning, namely\nuniversal interpolation - the ability to match arbitrary input and target\ntraining samples - and the closely related notion of universal approximation -\nthe ability to approximate input-target functional relationships via flow maps.\nUnder the assumption of affine invariance of the control family, we give a\ncharacterisation of universal interpolation, showing that it holds for\nessentially any architecture with non-linearity. 
Furthermore, we elucidate the\nrelationship between universal interpolation and universal approximation in the\ncontext of general control systems, showing that the two properties cannot be\ndeduced from each other. At the same time, we identify conditions on the\ncontrol family and the target function that ensures the equivalence of the two\nnotions.\n","authors":["Jingpu Cheng","Qianxiao Li","Ting Lin","Zuowei Shen"],"pdf_url":"https://arxiv.org/pdf/2309.06015v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.09544v2","updated":"2023-09-12T07:25:49Z","published":"2023-08-18T13:22:59Z","title":"Adapt Your Teacher: Improving Knowledge Distillation for Exemplar-free\n Continual Learning","summary":" In this work, we investigate exemplar-free class incremental learning (CIL)\nwith knowledge distillation (KD) as a regularization strategy, aiming to\nprevent forgetting. KD-based methods are successfully used in CIL, but they\noften struggle to regularize the model without access to exemplars of the\ntraining data from previous tasks. Our analysis reveals that this issue\noriginates from substantial representation shifts in the teacher network when\ndealing with out-of-distribution data. This causes large errors in the KD loss\ncomponent, leading to performance degradation in CIL models. Inspired by recent\ntest-time adaptation methods, we introduce Teacher Adaptation (TA), a method\nthat concurrently updates the teacher and the main models during incremental\ntraining. Our method seamlessly integrates with KD-based CIL approaches and\nallows for consistent enhancement of their performance across multiple\nexemplar-free CIL benchmarks.\n","authors":["Filip Szatkowski","Mateusz Pyla","Marcin Przewięźlikowski","Sebastian Cygert","Bartłomiej Twardowski","Tomasz Trzciński"],"pdf_url":"https://arxiv.org/pdf/2308.09544v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.10705v4","updated":"2023-09-12T07:21:05Z","published":"2023-07-20T08:53:47Z","title":"TwinLiteNet: An Efficient and Lightweight Model for Driveable Area and\n Lane Segmentation in Self-Driving Cars","summary":" Semantic segmentation is a common task in autonomous driving to understand\nthe surrounding environment. Driveable Area Segmentation and Lane Detection are\nparticularly important for safe and efficient navigation on the road. However,\noriginal semantic segmentation models are computationally expensive and require\nhigh-end hardware, which is not feasible for embedded systems in autonomous\nvehicles. This paper proposes a lightweight model for the driveable area and\nlane line segmentation. TwinLiteNet is designed cheaply but achieves accurate\nand efficient segmentation results. We evaluate TwinLiteNet on the BDD100K\ndataset and compare it with modern models. Experimental results show that our\nTwinLiteNet performs similarly to existing approaches, requiring significantly\nfewer computational resources. Specifically, TwinLiteNet achieves a mIoU score\nof 91.3% for the Drivable Area task and 31.08% IoU for the Lane Detection task\nwith only 0.4 million parameters and achieves 415 FPS on GPU RTX A5000.\nFurthermore, TwinLiteNet can run in real-time on embedded devices with limited\ncomputing power, especially since it achieves 60FPS on Jetson Xavier NX, making\nit an ideal solution for self-driving vehicles. 
Code is available at:\nhttps://github.com/chequanghuy/TwinLiteNet.\n","authors":["Quang Huy Che","Dinh Phuc Nguyen","Minh Quan Pham","Duc Khai Lam"],"pdf_url":"https://arxiv.org/pdf/2307.10705v4.pdf","comment":"Accepted by MAPR 2023"},{"id":"http://arxiv.org/abs/2309.05994v1","updated":"2023-09-12T06:49:56Z","published":"2023-09-12T06:49:56Z","title":"ATTA: Anomaly-aware Test-Time Adaptation for Out-of-Distribution\n Detection in Segmentation","summary":" Recent advancements in dense out-of-distribution (OOD) detection have\nprimarily focused on scenarios where the training and testing datasets share a\nsimilar domain, with the assumption that no domain shift exists between them.\nHowever, in real-world situations, domain shift often exists and significantly\naffects the accuracy of existing out-of-distribution (OOD) detection models. In\nthis work, we propose a dual-level OOD detection framework to handle domain\nshift and semantic shift jointly. The first level distinguishes whether domain\nshift exists in the image by leveraging global low-level features, while the\nsecond level identifies pixels with semantic shift by utilizing dense\nhigh-level feature maps. In this way, we can selectively adapt the model to\nunseen domains as well as enhance the model's capacity in detecting novel classes.\nWe validate the efficacy of our proposed method on several OOD segmentation\nbenchmarks, including those with significant domain shifts and those without,\nobserving consistent performance improvements across various baseline models.\n","authors":["Zhitong Gao","Shipeng Yan","Xuming He"],"pdf_url":"https://arxiv.org/pdf/2309.05994v1.pdf","comment":"In submission"},{"id":"http://arxiv.org/abs/2306.12774v3","updated":"2023-09-12T06:40:00Z","published":"2023-06-22T10:00:33Z","title":"Pure Exploration in Bandits with Linear Constraints","summary":" We address the problem of identifying the optimal policy with a fixed\nconfidence level in a multi-armed bandit setup, when \emph{the arms are subject\nto linear constraints}. Unlike the standard best-arm identification problem\nwhich is well studied, the optimal policy in this case may not be deterministic\nand could mix between several arms. This changes the geometry of the problem\nwhich we characterize via an information-theoretic lower bound. We introduce\ntwo asymptotically optimal algorithms for this setting, one based on the\nTrack-and-Stop method and the other based on a game-theoretic approach. Both\nthese algorithms try to track an optimal allocation based on the lower bound\nand computed by a weighted projection onto the boundary of a normal cone.\nFinally, we provide empirical results that validate our bounds and visualize\nhow constraints change the hardness of the problem.\n","authors":["Emil Carlsson","Debabrota Basu","Fredrik D. Johansson","Devdatt Dubhashi"],"pdf_url":"https://arxiv.org/pdf/2306.12774v3.pdf","comment":"EWRL16"},{"id":"http://arxiv.org/abs/2309.05981v1","updated":"2023-09-12T06:20:34Z","published":"2023-09-12T06:20:34Z","title":"Learning Unbiased News Article Representations: A Knowledge-Infused\n Approach","summary":" Quantification of the political leaning of online news articles can aid in\nunderstanding the dynamics of political ideology in social groups and measures\nto mitigate them. However, predicting the accurate political leaning of a\nnews article with machine learning models is a challenging task. 
This is due to\n(i) the political ideology of a news article is defined by several factors, and\n(ii) the innate nature of existing learning models to be biased with the\npolitical bias of the news publisher during the model training. There is only a\nlimited number of methods to study the political leaning of news articles which\nalso do not consider the algorithmic political bias which lowers the\ngeneralization of machine learning models to predict the political leaning of\nnews articles published by any new news publishers. In this work, we propose a\nknowledge-infused deep learning model that utilizes relatively reliable\nexternal data resources to learn unbiased representations of news articles\nusing their global and local contexts. We evaluate the proposed model by\nsetting the data in such a way that news domains or news publishers in the test\nset are completely unseen during the training phase. With this setup we show\nthat the proposed model mitigates algorithmic political bias and outperforms\nbaseline methods to predict the political leaning of news articles with up to\n73% accuracy.\n","authors":["Sadia Kamal","Jimmy Hartford","Jeremy Willis","Arunkumar Bagavathi"],"pdf_url":"https://arxiv.org/pdf/2309.05981v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2208.06573v2","updated":"2023-09-12T06:09:54Z","published":"2022-08-13T05:16:40Z","title":"GEDI: A Graph-based End-to-end Data Imputation Framework","summary":" Data imputation is an effective way to handle missing data, which is common\nin practical applications. In this study, we propose and test a novel data\nimputation process that achieve two important goals: (1) preserve the row-wise\nsimilarities among observations and column-wise contextual relationships among\nfeatures in the feature matrix, and (2) tailor the imputation process to\nspecific downstream label prediction task. The proposed imputation process uses\nTransformer network and graph structure learning to iteratively refine the\ncontextual relationships among features and similarities among observations.\nMoreover, it uses a meta-learning framework to select features that are\ninfluential to the downstream prediction task of interest. We conduct\nexperiments on real-world large data sets, and show that the proposed\nimputation process consistently improves imputation and label prediction\nperformance over a variety of benchmark methods.\n","authors":["Katrina Chen","Xiuqin Liang","Zheng Ma","Zhibin Zhang"],"pdf_url":"https://arxiv.org/pdf/2208.06573v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.05975v1","updated":"2023-09-12T05:55:41Z","published":"2023-09-12T05:55:41Z","title":"CleanUNet 2: A Hybrid Speech Denoising Model on Waveform and Spectrogram","summary":" In this work, we present CleanUNet 2, a speech denoising model that combines\nthe advantages of waveform denoiser and spectrogram denoiser and achieves the\nbest of both worlds. CleanUNet 2 uses a two-stage framework inspired by popular\nspeech synthesis methods that consist of a waveform model and a spectrogram\nmodel. Specifically, CleanUNet 2 builds upon CleanUNet, the state-of-the-art\nwaveform denoiser, and further boosts its performance by taking predicted\nspectrograms from a spectrogram denoiser as the input. 
We demonstrate that\nCleanUNet 2 outperforms previous methods in terms of various objective and\nsubjective evaluations.\n","authors":["Zhifeng Kong","Wei Ping","Ambrish Dantrey","Bryan Catanzaro"],"pdf_url":"https://arxiv.org/pdf/2309.05975v1.pdf","comment":"INTERSPEECH 2023"},{"id":"http://arxiv.org/abs/2309.05973v1","updated":"2023-09-12T05:51:56Z","published":"2023-09-12T05:51:56Z","title":"Circuit Breaking: Removing Model Behaviors with Targeted Ablation","summary":" Language models often exhibit behaviors that improve performance on a\npre-training objective but harm performance on downstream tasks. We propose a\nnovel approach to removing undesirable behaviors by ablating a small number of\ncausal pathways between model components, with the intention of disabling the\ncomputational circuit responsible for the bad behavior. Given a small dataset\nof inputs where the model behaves poorly, we learn to ablate a small number of\nimportant causal pathways. In the setting of reducing GPT-2 toxic language\ngeneration, we find ablating just 12 of the 11.6K causal edges mitigates toxic\ngeneration with minimal degradation of performance on other inputs.\n","authors":["Maximilian Li","Xander Davies","Max Nadeau"],"pdf_url":"https://arxiv.org/pdf/2309.05973v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.05968v1","updated":"2023-09-12T05:36:08Z","published":"2023-09-12T05:36:08Z","title":"Neural Network Layer Matrix Decomposition reveals Latent Manifold\n Encoding and Memory Capacity","summary":" We prove the converse of the universal approximation theorem, i.e. a neural\nnetwork (NN) encoding theorem which shows that for every stably converged NN of\ncontinuous activation functions, its weight matrix actually encodes a\ncontinuous function that approximates its training dataset to within a finite\nmargin of error over a bounded domain. We further show that using the\nEckart-Young theorem for truncated singular value decomposition of the weight\nmatrix for every NN layer, we can illuminate the nature of the latent space\nmanifold of the training dataset encoded and represented by every NN layer, and\nthe geometric nature of the mathematical operations performed by each NN layer.\nOur results have implications for understanding how NNs break the curse of\ndimensionality by harnessing memory capacity for expressivity, and that the two\nare complementary. This Layer Matrix Decomposition (LMD) further suggests a\nclose relationship between eigen-decomposition of NN layers and the latest\nadvances in conceptualizations of Hopfield networks and Transformer NN models.\n","authors":["Ng Shyh-Chang","A-Li Luo","Bo Qiu"],"pdf_url":"https://arxiv.org/pdf/2309.05968v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.04570v2","updated":"2023-09-12T05:11:40Z","published":"2023-07-10T14:02:31Z","title":"A Call to Reflect on Evaluation Practices for Age Estimation:\n Comparative Analysis of the State-of-the-Art and a Unified Benchmark","summary":" Comparing different age estimation methods poses a challenge due to the\nunreliability of published results stemming from inconsistencies in the\nbenchmarking process. Previous studies have reported continuous performance\nimprovements over the past decade using specialized methods; however, our\nfindings challenge these claims. This paper identifies two trivial, yet\npersistent issues with the currently used evaluation protocol and describes how\nto resolve them. 
We describe our evaluation protocol in detail and provide\nspecific examples of how the protocol should be used. We utilize the protocol\nto offer an extensive comparative analysis for state-of-the-art facial age\nestimation methods. Surprisingly, we find that the performance differences\nbetween the methods are negligible compared to the effect of other factors,\nsuch as facial alignment, facial coverage, image resolution, model\narchitecture, or the amount of data used for pretraining. We use the gained\ninsights to propose using FaRL as the backbone model and demonstrate its\nefficiency. The results emphasize the importance of consistent data\npreprocessing practices for reliable and meaningful comparisons. We make our\nsource code public at\nhttps://github.com/paplhjak/Facial-Age-Estimation-Benchmark.\n","authors":["Jakub Paplham","Vojtech Franc"],"pdf_url":"https://arxiv.org/pdf/2307.04570v2.pdf","comment":"Revised version"},{"id":"http://arxiv.org/abs/2309.05961v1","updated":"2023-09-12T05:03:28Z","published":"2023-09-12T05:03:28Z","title":"Evaluating the Ebb and Flow: An In-depth Analysis of Question-Answering\n Trends across Diverse Platforms","summary":" Community Question Answering (CQA) platforms steadily gain popularity as they\nprovide users with fast responses to their queries. The swiftness of these\nresponses is contingent on a mixture of query-specific and user-related\nelements. This paper scrutinizes these contributing factors within the context\nof six highly popular CQA platforms, identified through their standout\nanswering speed. Our investigation reveals a correlation between the time taken\nto yield the first response to a question and several variables: the metadata,\nthe formulation of the questions, and the level of interaction among users.\nAdditionally, by employing conventional machine learning models to analyze\nthese metadata and patterns of user interaction, we endeavor to predict which\nqueries will receive their initial responses promptly.\n","authors":["Rima Hazra","Agnik Saha","Somnath Banerjee","Animesh Mukherjee"],"pdf_url":"https://arxiv.org/pdf/2309.05961v1.pdf","comment":"Under review"},{"id":"http://arxiv.org/abs/2206.00979v4","updated":"2023-09-12T05:02:43Z","published":"2022-06-02T10:50:46Z","title":"Multi-scale Wasserstein Shortest-path Graph Kernels for Graph\n Classification","summary":" Graph kernels are conventional methods for computing graph similarities.\nHowever, most of the R-convolution graph kernels face two challenges: 1) They\ncannot compare graphs at multiple different scales, and 2) they do not consider\nthe distributions of substructures when computing the kernel matrix. These two\nchallenges limit their performances. To mitigate the two challenges, we propose\na novel graph kernel called the Multi-scale Wasserstein Shortest-Path graph\nkernel (MWSP), at the heart of which is the multi-scale shortest-path node\nfeature map, of which each element denotes the number of occurrences of a\nshortest path around a node. A shortest path is represented by the\nconcatenation of all the labels of nodes in it. Since the shortest-path node\nfeature map can only compare graphs at local scales, we incorporate into it the\nmultiple different scales of the graph structure, which are captured by the\ntruncated BFS trees of different depths rooted at each node in a graph. 
We use\nthe Wasserstein distance to compute the similarity between the multi-scale\nshortest-path node feature maps of two graphs, considering the distributions of\nshortest paths. We empirically validate MWSP on various benchmark graph\ndatasets and demonstrate that it achieves state-of-the-art performance on most\ndatasets.\n","authors":["Wei Ye","Hao Tian","Qijun Chen"],"pdf_url":"https://arxiv.org/pdf/2206.00979v4.pdf","comment":"9 pages"},{"id":"http://arxiv.org/abs/1912.06845v4","updated":"2023-09-12T04:42:46Z","published":"2019-12-14T13:38:02Z","title":"Empirical and Instance-Dependent Estimation of Markov Chain and Mixing\n Time","summary":" We address the problem of estimating the mixing time of a Markov chain from a\nsingle trajectory of observations. Unlike most previous works which employed\nHilbert space methods to estimate spectral gaps, we opt for an approach based\non contraction with respect to total variation. Specifically, we estimate the\ncontraction coefficient introduced in Wolfer [2020], inspired from Dobrushin's.\nThis quantity, unlike the spectral gap, controls the mixing time up to strong\nuniversal constants and remains applicable to non-reversible chains. We improve\nexisting fully data-dependent confidence intervals around this contraction\ncoefficient, which are both easier to compute and thinner than spectral\ncounterparts. Furthermore, we introduce a novel analysis beyond the worst-case\nscenario by leveraging additional information about the transition matrix. This\nallows us to derive instance-dependent rates for estimating the matrix with\nrespect to the induced uniform norm, and some of its mixing properties.\n","authors":["Geoffrey Wolfer"],"pdf_url":"https://arxiv.org/pdf/1912.06845v4.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.05953v1","updated":"2023-09-12T04:21:30Z","published":"2023-09-12T04:21:30Z","title":"GLAD: Content-aware Dynamic Graphs For Log Anomaly Detection","summary":" Logs play a crucial role in system monitoring and debugging by recording\nvaluable system information, including events and states. Although various\nmethods have been proposed to detect anomalies in log sequences, they often\noverlook the significance of considering relations among system components,\nsuch as services and users, which can be identified from log contents.\nUnderstanding these relations is vital for detecting anomalies and their\nunderlying causes. To address this issue, we introduce GLAD, a Graph-based Log\nAnomaly Detection framework designed to detect relational anomalies in system\nlogs. GLAD incorporates log semantics, relational patterns, and sequential\npatterns into a unified framework for anomaly detection. Specifically, GLAD\nfirst introduces a field extraction module that utilizes prompt-based few-shot\nlearning to identify essential fields from log contents. Then GLAD constructs\ndynamic log graphs for sliding windows by interconnecting extracted fields and\nlog events parsed from the log parser. These graphs represent events and fields\nas nodes and their relations as edges. Subsequently, GLAD utilizes a\ntemporal-attentive graph edge anomaly detection model for identifying anomalous\nrelations in these dynamic log graphs. This model employs a Graph Neural\nNetwork (GNN)-based encoder enhanced with transformers to capture content,\nstructural and temporal features. 
We evaluate our proposed method on three\ndatasets, and the results demonstrate the effectiveness of GLAD in detecting\nanomalies indicated by varying relational patterns.\n","authors":["Yufei Li","Yanchi Liu","Haoyu Wang","Zhengzhang Chen","Wei Cheng","Yuncong Chen","Wenchao Yu","Haifeng Chen","Cong Liu"],"pdf_url":"https://arxiv.org/pdf/2309.05953v1.pdf","comment":"Accepted by ICKG 2023"},{"id":"http://arxiv.org/abs/2309.05950v1","updated":"2023-09-12T04:03:41Z","published":"2023-09-12T04:03:41Z","title":"Language Models as Black-Box Optimizers for Vision-Language Models","summary":" Vision-language models (VLMs) pre-trained on web-scale datasets have\ndemonstrated remarkable capabilities across a variety of vision and multimodal\ntasks. Currently, fine-tuning methods for VLMs mainly operate in a white-box\nsetting, requiring access to model parameters for backpropagation. However,\nmany VLMs rely on proprietary data and are not open-source, which restricts the\nuse of white-box approaches for fine-tuning. Given that popular private large\nlanguage models (LLMs) like ChatGPT still offer a language-based user\ninterface, we aim to develop a novel fine-tuning approach for VLMs through\nnatural language prompts, thereby avoiding the need to access model parameters,\nfeature embeddings, or output logits. In this setup, we propose employing\nchat-based LLMs as black-box optimizers to search for the best text prompt on\nthe illustrative task of few-shot image classification using CLIP.\nSpecifically, we adopt an automatic \"hill-climbing\" procedure that converges on\nan effective prompt by evaluating the accuracy of current prompts and asking\nLLMs to refine them based on textual feedback, all within a conversational\nprocess without human-in-the-loop. In a challenging 1-shot learning setup, our\nsimple approach surpasses the white-box continuous prompting method CoOp by an\naverage of 1.5% across 11 datasets including ImageNet. Our approach also\noutperforms OpenAI's manually crafted prompts and is more efficient than other\nblack-box methods like iterative APE. Additionally, we highlight the advantage\nof conversational feedback incorporating both positive and negative prompts,\nsuggesting that LLMs can utilize the implicit \"gradient\" direction in textual\nfeedback for a more efficient search. Lastly, we find that the text prompts\ngenerated through our strategy are not only more interpretable but also\ntransfer well across different CLIP architectures in a black-box manner.\n","authors":["Samuel Yu","Shihong Liu","Zhiqiu Lin","Deepak Pathak","Deva Ramanan"],"pdf_url":"https://arxiv.org/pdf/2309.05950v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.15930v2","updated":"2023-09-12T03:41:35Z","published":"2023-08-30T10:12:39Z","title":"LLaSM: Large Language and Speech Model","summary":" Multi-modal large language models have garnered significant interest\nrecently. Though, most of the works focus on vision-language multi-modal models\nproviding strong capabilities in following vision-and-language instructions.\nHowever, we claim that speech is also an important modality through which\nhumans interact with the world. Hence, it is crucial for a general-purpose\nassistant to be able to follow multi-modal speech-and-language instructions. In\nthis work, we propose Large Language and Speech Model (LLaSM). LLaSM is an\nend-to-end trained large multi-modal speech-language model with cross-modal\nconversational abilities, capable of following speech-and-language\ninstructions. 
Our early experiments show that LLaSM demonstrates a more\nconvenient and natural way for humans to interact with artificial intelligence.\nSpecifically, we also release a large Speech Instruction Following dataset\nLLaSM-Audio-Instructions. Code and demo are available at\nhttps://github.com/LinkSoul-AI/LLaSM and\nhttps://huggingface.co/spaces/LinkSoul/LLaSM. The LLaSM-Audio-Instructions\ndataset is available at\nhttps://huggingface.co/datasets/LinkSoul/LLaSM-Audio-Instructions.\n","authors":["Yu Shu","Siwei Dong","Guangyao Chen","Wenhao Huang","Ruihua Zhang","Daochen Shi","Qiqi Xiang","Yemin Shi"],"pdf_url":"https://arxiv.org/pdf/2308.15930v2.pdf","comment":null},{"id":"http://arxiv.org/abs/1908.01135v3","updated":"2023-09-12T03:34:50Z","published":"2019-08-03T08:20:54Z","title":"Multiplayer Bandit Learning, from Competition to Cooperation","summary":" The stochastic multi-armed bandit model captures the tradeoff between\nexploration and exploitation. We study the effects of competition and\ncooperation on this tradeoff. Suppose there are $k$ arms and two players, Alice\nand Bob. In every round, each player pulls an arm, receives the resulting\nreward, and observes the choice of the other player but not their reward.\nAlice's utility is $\\Gamma_A + \\lambda \\Gamma_B$ (and similarly for Bob), where\n$\\Gamma_A$ is Alice's total reward and $\\lambda \\in [-1, 1]$ is a cooperation\nparameter. At $\\lambda = -1$ the players are competing in a zero-sum game, at\n$\\lambda = 1$, they are fully cooperating, and at $\\lambda = 0$, they are\nneutral: each player's utility is their own reward. The model is related to the\neconomics literature on strategic experimentation, where usually players\nobserve each other's rewards.\n With discount factor $\\beta$, the Gittins index reduces the one-player\nproblem to the comparison between a risky arm, with a prior $\\mu$, and a\npredictable arm, with success probability $p$. The value of $p$ where the\nplayer is indifferent between the arms is the Gittins index $g = g(\\mu,\\beta) >\nm$, where $m$ is the mean of the risky arm.\n We show that competing players explore less than a single player: there is\n$p^* \\in (m, g)$ so that for all $p > p^*$, the players stay at the predictable\narm. However, the players are not myopic: they still explore for some $p > m$.\nOn the other hand, cooperating players explore more than a single player. We\nalso show that neutral players learn from each other, receiving strictly higher\ntotal rewards than they would playing alone, for all $ p\\in (p^*, g)$, where\n$p^*$ is the threshold from the competing case.\n Finally, we show that competing and neutral players eventually settle on the\nsame arm in every Nash equilibrium, while this can fail for cooperating\nplayers.\n","authors":["Simina Brânzei","Yuval Peres"],"pdf_url":"https://arxiv.org/pdf/1908.01135v3.pdf","comment":"Improved version with a few corrections. 57 pages, 5 figures"},{"id":"http://arxiv.org/abs/2309.03224v3","updated":"2023-09-12T03:03:00Z","published":"2023-09-01T13:10:54Z","title":"No Train Still Gain. Unleash Mathematical Reasoning of Large Language\n Models with Monte Carlo Tree Search Guided by Energy Function","summary":" Large language models (LLMs) demonstrate impressive language understanding\nand contextual learning abilities, making them suitable for natural language\nprocessing (NLP) tasks and complex mathematical reasoning. 
However, when\napplied to mathematical reasoning tasks, LLMs often struggle to generate\ncorrect reasoning steps and answers despite having high probabilities for the\nsolutions. To overcome this limitation and enhance the mathematical reasoning\ncapabilities of fine-tuned LLMs without additional fine-tuning steps, we\npropose a method that incorporates Monte Carlo Tree Search (MCTS) and a\nlightweight energy function to rank decision steps and enable immediate\nreaction and precise reasoning. Specifically, we re-formulate the fine-tuned\nLLMs into a Residual-based Energy Model (Residual-EBM) and employ noise\ncontrastive estimation to estimate the energy function's parameters. We then\nutilize MCTS with the energy function as a path verifier to search the output\nspace and evaluate the reasoning path. Through extensive experiments on two\nmathematical reasoning benchmarks, GSM8k and AQUA-RAT, we demonstrate the\nexceptional capabilities of our method, which significantly improves the pass@1\nmetric of the fine-tuned model without requiring additional fine-tuning or\nreinforcement learning with human feedback alignment.\n","authors":["Haotian Xu"],"pdf_url":"https://arxiv.org/pdf/2309.03224v3.pdf","comment":"still in progress"},{"id":"http://arxiv.org/abs/2309.05665v2","updated":"2023-09-12T03:01:55Z","published":"2023-09-11T17:59:17Z","title":"Robot Parkour Learning","summary":" Parkour is a grand challenge for legged locomotion that requires robots to\novercome various obstacles rapidly in complex environments. Existing methods\ncan generate either diverse but blind locomotion skills or vision-based but\nspecialized skills by using reference animal data or complex rewards. However,\nautonomous parkour requires robots to learn generalizable skills that are both\nvision-based and diverse to perceive and react to various scenarios. In this\nwork, we propose a system for learning a single end-to-end vision-based parkour\npolicy of diverse parkour skills using a simple reward without any reference\nmotion data. We develop a reinforcement learning method inspired by direct\ncollocation to generate parkour skills, including climbing over high obstacles,\nleaping over large gaps, crawling beneath low barriers, squeezing through thin\nslits, and running. We distill these skills into a single vision-based parkour\npolicy and transfer it to a quadrupedal robot using its egocentric depth\ncamera. We demonstrate that our system can empower two different low-cost\nrobots to autonomously select and execute appropriate parkour skills to\ntraverse challenging real-world environments.\n","authors":["Ziwen Zhuang","Zipeng Fu","Jianren Wang","Christopher Atkeson","Soeren Schwertfeger","Chelsea Finn","Hang Zhao"],"pdf_url":"https://arxiv.org/pdf/2309.05665v2.pdf","comment":"CoRL 2023 (Oral). Project website at https://robot-parkour.github.io"},{"id":"http://arxiv.org/abs/2309.05927v1","updated":"2023-09-12T02:59:26Z","published":"2023-09-12T02:59:26Z","title":"Frequency-Aware Masked Autoencoders for Multimodal Pretraining on\n Biosignals","summary":" Leveraging multimodal information from biosignals is vital for building a\ncomprehensive representation of people's physical and mental states. However,\nmultimodal biosignals often exhibit substantial distributional shifts between\npretraining and inference datasets, stemming from changes in task specification\nor variations in modality compositions. 
To achieve effective pretraining in the\npresence of potential distributional shifts, we propose a frequency-aware\nmasked autoencoder ($\\texttt{bio}$FAME) that learns to parameterize the\nrepresentation of biosignals in the frequency space. $\\texttt{bio}$FAME\nincorporates a frequency-aware transformer, which leverages a fixed-size\nFourier-based operator for global token mixing, independent of the length and\nsampling rate of inputs. To maintain the frequency components within each input\nchannel, we further employ a frequency-maintain pretraining strategy that\nperforms masked autoencoding in the latent space. The resulting architecture\neffectively utilizes multimodal information during pretraining, and can be\nseamlessly adapted to diverse tasks and modalities at test time, regardless of\ninput size and order. We evaluated our approach on a diverse set of transfer\nexperiments on unimodal time series, achieving an average of $\\uparrow$5.5%\nimprovement in classification accuracy over the previous state-of-the-art.\nFurthermore, we demonstrated that our architecture is robust in modality\nmismatch scenarios, including unpredicted modality dropout or substitution,\nproving its practical utility in real-world applications. Code will be\navailable soon.\n","authors":["Ran Liu","Ellen L. Zippi","Hadi Pouransari","Chris Sandino","Jingping Nie","Hanlin Goh","Erdrin Azemi","Ali Moin"],"pdf_url":"https://arxiv.org/pdf/2309.05927v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.05925v1","updated":"2023-09-12T02:52:40Z","published":"2023-09-12T02:52:40Z","title":"On Regularized Sparse Logistic Regression","summary":" Sparse logistic regression aims to perform classification and feature\nselection simultaneously for high-dimensional data. Although many studies have\nbeen done to solve $\\ell_1$-regularized logistic regression, there is no\nequivalently abundant literature about solving sparse logistic regression\nassociated with nonconvex penalties. In this paper, we propose to solve\n$\\ell_1$-regularized sparse logistic regression and some nonconvex\npenalties-regularized sparse logistic regression, when the nonconvex penalties\nsatisfy some prerequisites, with similar optimization frameworks. In the\nproposed optimization frameworks, we utilize different line search criteria to\nguarantee good convergence performance for different regularization terms.\nEmpirical experiments on binary classification tasks with real-world datasets\ndemonstrate our proposed algorithms are capable of performing classification\nand feature selection effectively with a lower computational cost.\n","authors":["Mengyuan Zhang","Kai Liu"],"pdf_url":"https://arxiv.org/pdf/2309.05925v1.pdf","comment":"Accepted to ICDM2023"},{"id":"http://arxiv.org/abs/2308.00264v2","updated":"2023-09-12T02:40:08Z","published":"2023-08-01T03:54:27Z","title":"Multi-Modality Multi-Loss Fusion Network","summary":" In this work we investigate the optimal selection and fusion of features\nacross multiple modalities and combine these in a neural network to improve\nemotion detection. We compare different fusion methods and examine the impact\nof multi-loss training within the multi-modality fusion network, identifying\nuseful findings relating to subnet performance. Our best model achieves\nstate-of-the-art performance for three datasets (CMU-MOSI, CMU-MOSEI and\nCH-SIMS), and outperforms the other methods in most metrics. 
We have found that\ntraining on multimodal features improves single modality testing and designing\nfusion methods based on dataset annotation schema enhances model performance.\nThese results suggest a roadmap towards an optimized feature selection and\nfusion approach for enhancing emotion detection in neural networks.\n","authors":["Zehui Wu","Ziwei Gong","Jaywon Koo","Julia Hirschberg"],"pdf_url":"https://arxiv.org/pdf/2308.00264v2.pdf","comment":"First two authors contributed equally to the paper"},{"id":"http://arxiv.org/abs/2309.02530v2","updated":"2023-09-12T02:16:45Z","published":"2023-09-05T18:52:35Z","title":"Diffusion on the Probability Simplex","summary":" Diffusion models learn to reverse the progressive noising of a data\ndistribution to create a generative model. However, the desired continuous\nnature of the noising process can be at odds with discrete data. To deal with\nthis tension between continuous and discrete objects, we propose a method of\nperforming diffusion on the probability simplex. Using the probability simplex\nnaturally creates an interpretation where points correspond to categorical\nprobability distributions. Our method uses the softmax function applied to an\nOrnstein-Uhlenbeck Process, a well-known stochastic differential equation. We\nfind that our methodology also naturally extends to include diffusion on the\nunit cube which has applications for bounded image generation.\n","authors":["Griffin Floto","Thorsteinn Jonsson","Mihai Nica","Scott Sanner","Eric Zhengyu Zhu"],"pdf_url":"https://arxiv.org/pdf/2309.02530v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.05915v1","updated":"2023-09-12T02:05:43Z","published":"2023-09-12T02:05:43Z","title":"ACT: Empowering Decision Transformer with Dynamic Programming via\n Advantage Conditioning","summary":" Decision Transformer (DT), which employs expressive sequence modeling\ntechniques to perform action generation, has emerged as a promising approach to\noffline policy optimization. However, DT generates actions conditioned on a\ndesired future return, which is known to bear some weaknesses such as the\nsusceptibility to environmental stochasticity. To overcome DT's weaknesses, we\npropose to empower DT with dynamic programming. Our method comprises three\nsteps. First, we employ in-sample value iteration to obtain approximated value\nfunctions, which involves dynamic programming over the MDP structure. Second,\nwe evaluate action quality in context with estimated advantages. We introduce\ntwo types of advantage estimators, IAE and GAE, which are suitable for\ndifferent tasks. Third, we train an Advantage-Conditioned Transformer (ACT) to\ngenerate actions conditioned on the estimated advantages. Finally, during\ntesting, ACT generates actions conditioned on a desired advantage. Our\nevaluation results validate that, by leveraging the power of dynamic\nprogramming, ACT demonstrates effective trajectory stitching and robust action\ngeneration in spite of the environmental stochasticity, outperforming baseline\nmethods across various benchmarks. 
Additionally, we conduct an in-depth\nanalysis of ACT's various design choices through ablation studies.\n","authors":["Chenxiao Gao","Chenyang Wu","Mingjun Cao","Rui Kong","Zongzhang Zhang","Yang Yu"],"pdf_url":"https://arxiv.org/pdf/2309.05915v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2304.05257v3","updated":"2023-09-12T02:00:14Z","published":"2023-04-11T14:46:38Z","title":"Multi-granulariy Time-based Transformer for Knowledge Tracing","summary":" In this paper, we present a transformer architecture for predicting student\nperformance on standardized tests. Specifically, we leverage students\nhistorical data, including their past test scores, study habits, and other\nrelevant information, to create a personalized model for each student. We then\nuse these models to predict their future performance on a given test. Applying\nthis model to the RIIID dataset, we demonstrate that using multiple\ngranularities for temporal features as the decoder input significantly improve\nmodel performance. Our results also show the effectiveness of our approach,\nwith substantial improvements over the LightGBM method. Our work contributes to\nthe growing field of AI in education, providing a scalable and accurate tool\nfor predicting student outcomes.\n","authors":["Tong Zhou"],"pdf_url":"https://arxiv.org/pdf/2304.05257v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2304.01347v4","updated":"2023-09-12T01:20:11Z","published":"2023-03-31T02:54:01Z","title":"Temporal Dynamic Synchronous Functional Brain Network for Schizophrenia\n Diagnosis and Lateralization Analysis","summary":" The available evidence suggests that dynamic functional connectivity (dFC)\ncan capture time-varying abnormalities in brain activity in resting-state\ncerebral functional magnetic resonance imaging (rs-fMRI) data and has a natural\nadvantage in uncovering mechanisms of abnormal brain activity in\nschizophrenia(SZ) patients. Hence, an advanced dynamic brain network analysis\nmodel called the temporal brain category graph convolutional network\n(Temporal-BCGCN) was employed. Firstly, a unique dynamic brain network analysis\nmodule, DSF-BrainNet, was designed to construct dynamic synchronization\nfeatures. Subsequently, a revolutionary graph convolution method, TemporalConv,\nwas proposed, based on the synchronous temporal properties of feature. Finally,\nthe first modular abnormal hemispherical lateralization test tool in deep\nlearning based on rs-fMRI data, named CategoryPool, was proposed. This study\nwas validated on COBRE and UCLA datasets and achieved 83.62% and 89.71% average\naccuracies, respectively, outperforming the baseline model and other\nstate-of-the-art methods. The ablation results also demonstrate the advantages\nof TemporalConv over the traditional edge feature graph convolution approach\nand the improvement of CategoryPool over the classical graph pooling approach.\nInterestingly, this study showed that the lower order perceptual system and\nhigher order network regions in the left hemisphere are more severely\ndysfunctional than in the right hemisphere in SZ and reaffirms the importance\nof the left medial superior frontal gyrus in SZ. 
Our core code is available at:\nhttps://github.com/swfen/Temporal-BCGCN.\n","authors":["Cheng Zhu","Ying Tan","Shuqi Yang","Jiaqing Miao","Jiayi Zhu","Huan Huang","Dezhong Yao","Cheng Luo"],"pdf_url":"https://arxiv.org/pdf/2304.01347v4.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.05900v1","updated":"2023-09-12T01:03:43Z","published":"2023-09-12T01:03:43Z","title":"Adversarial Attacks Assessment of Salient Object Detection via Symbolic\n Learning","summary":" Machine learning is at the center of mainstream technology and outperforms\nclassical approaches to handcrafted feature design. Aside from its learning\nprocess for artificial feature extraction, it has an end-to-end paradigm from\ninput to output, reaching outstandingly accurate results. However, security\nconcerns about its robustness to malicious and imperceptible perturbations have\ndrawn attention since its prediction can be changed entirely. Salient object\ndetection is a research area where deep convolutional neural networks have\nproven effective but whose trustworthiness represents a significant issue\nrequiring analysis and solutions to hackers' attacks. Brain programming is a\nkind of symbolic learning in the vein of good old-fashioned artificial\nintelligence. This work provides evidence that symbolic learning robustness is\ncrucial in designing reliable visual attention systems since it can withstand\neven the most intense perturbations. We test this evolutionary computation\nmethodology against several adversarial attacks and noise perturbations using\nstandard databases and a real-world problem of a shorebird called the Snowy\nPlover portraying a visual attention task. We compare our methodology with five\ndifferent deep learning approaches, proving that they do not match the symbolic\nparadigm regarding robustness. All neural networks suffer significant\nperformance losses, while brain programming stands its ground and remains\nunaffected. Also, by studying the Snowy Plover, we remark on the importance of\nsecurity in surveillance activities regarding wildlife protection and\nconservation.\n","authors":["Gustavo Olague","Roberto Pineda","Gerardo Ibarra-Vazquez","Matthieu Olague","Axel Martinez","Sambit Bakshi","Jonathan Vargas","Isnardo Reducindo"],"pdf_url":"https://arxiv.org/pdf/2309.05900v1.pdf","comment":"14 pages, 8 figures, 6 tables, IEEE Transactions on Emerging Topics\n in Computing, Accepted for publication"},{"id":"http://arxiv.org/abs/2309.05883v1","updated":"2023-09-12T00:07:08Z","published":"2023-09-12T00:07:08Z","title":"Hierarchical Conditional Semi-Paired Image-to-Image Translation For\n Multi-Task Image Defect Correction On Shopping Websites","summary":" On shopping websites, product images of low quality negatively affect\ncustomer experience. Although there are plenty of work in detecting images with\ndifferent defects, few efforts have been dedicated to correct those defects at\nscale. A major challenge is that there are thousands of product types and each\nhas specific defects, therefore building defect specific models is unscalable.\nIn this paper, we propose a unified Image-to-Image (I2I) translation model to\ncorrect multiple defects across different product types. Our model leverages an\nattention mechanism to hierarchically incorporate high-level defect groups and\nspecific defect types to guide the network to focus on defect-related image\nregions. 
Evaluated on eight public datasets, our model reduces the Frechet\nInception Distance (FID) by 24.6% in average compared with MoNCE, the\nstate-of-the-art I2I method. Unlike public data, another practical challenge on\nshopping websites is that some paired images are of low quality. Therefore we\ndesign our model to be semi-paired by combining the L1 loss of paired data with\nthe cycle loss of unpaired data. Tested on a shopping website dataset to\ncorrect three image defects, our model reduces (FID) by 63.2% in average\ncompared with WS-I2I, the state-of-the art semi-paired I2I method.\n","authors":["Moyan Li","Jinmiao Fu","Shaoyuan Xu","Huidong Liu","Jia Liu","Bryan Wang"],"pdf_url":"https://arxiv.org/pdf/2309.05883v1.pdf","comment":"6 pages, 6 figures, 3 tables. To be published in ICIP 2023"},{"id":"http://arxiv.org/abs/2309.05879v1","updated":"2023-09-12T00:00:24Z","published":"2023-09-12T00:00:24Z","title":"Generalized Attacks on Face Verification Systems","summary":" Face verification (FV) using deep neural network models has made tremendous\nprogress in recent years, surpassing human accuracy and seeing deployment in\nvarious applications such as border control and smartphone unlocking. However,\nFV systems are vulnerable to Adversarial Attacks, which manipulate input images\nto deceive these systems in ways usually unnoticeable to humans. This paper\nprovides an in-depth study of attacks on FV systems. We introduce the\nDodgePersonation Attack that formulates the creation of face images that\nimpersonate a set of given identities while avoiding being identified as any of\nthe identities in a separate, disjoint set. A taxonomy is proposed to provide a\nunified view of different types of Adversarial Attacks against FV systems,\nincluding Dodging Attacks, Impersonation Attacks, and Master Face Attacks.\nFinally, we propose the ''One Face to Rule Them All'' Attack which implements\nthe DodgePersonation Attack with state-of-the-art performance on a well-known\nscenario (Master Face Attack) and which can also be used for the new scenarios\nintroduced in this paper. While the state-of-the-art Master Face Attack can\nproduce a set of 9 images to cover 43.82% of the identities in their test\ndatabase, with 9 images our attack can cover 57.27% to 58.5% of these\nidentifies while giving the attacker the choice of the identity to use to\ncreate the impersonation. Moreover, the 9 generated attack images appear\nidentical to a casual observer.\n","authors":["Ehsan Nazari","Paula Branco","Guy-Vincent Jourdan"],"pdf_url":"https://arxiv.org/pdf/2309.05879v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.06645v1","updated":"2023-09-12T23:54:24Z","published":"2023-09-12T23:54:24Z","title":"Bregman Graph Neural Network","summary":" Numerous recent research on graph neural networks (GNNs) has focused on\nformulating GNN architectures as an optimization problem with the smoothness\nassumption. However, in node classification tasks, the smoothing effect induced\nby GNNs tends to assimilate representations and over-homogenize labels of\nconnected nodes, leading to adverse effects such as over-smoothing and\nmisclassification. In this paper, we propose a novel bilevel optimization\nframework for GNNs inspired by the notion of Bregman distance. We demonstrate\nthat the GNN layer proposed accordingly can effectively mitigate the\nover-smoothing issue by introducing a mechanism reminiscent of the \"skip\nconnection\". 
We validate our theoretical results through comprehensive\nempirical studies in which Bregman-enhanced GNNs outperform their original\ncounterparts in both homophilic and heterophilic graphs. Furthermore, our\nexperiments also show that Bregman GNNs can produce more robust learning\naccuracy even when the number of layers is high, suggesting the effectiveness\nof the proposed method in alleviating the over-smoothing issue.\n","authors":["Jiayu Zhai","Lequan Lin","Dai Shi","Junbin Gao"],"pdf_url":"https://arxiv.org/pdf/2309.06645v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.06642v1","updated":"2023-09-12T23:41:29Z","published":"2023-09-12T23:41:29Z","title":"Adapt and Diffuse: Sample-adaptive Reconstruction via Latent Diffusion\n Models","summary":" Inverse problems arise in a multitude of applications, where the goal is to\nrecover a clean signal from noisy and possibly (non)linear observations. The\ndifficulty of a reconstruction problem depends on multiple factors, such as the\nstructure of the ground truth signal, the severity of the degradation, the\nimplicit bias of the reconstruction model and the complex interactions between\nthe above factors. This results in natural sample-by-sample variation in the\ndifficulty of a reconstruction task, which is often overlooked by contemporary\ntechniques. Recently, diffusion-based inverse problem solvers have established\nnew state-of-the-art in various reconstruction tasks. However, they have the\ndrawback of being computationally prohibitive. Our key observation in this\npaper is that most existing solvers lack the ability to adapt their compute\npower to the difficulty of the reconstruction task, resulting in long inference\ntimes, subpar performance and wasteful resource allocation. We propose a novel\nmethod that we call severity encoding, to estimate the degradation severity of\nnoisy, degraded signals in the latent space of an autoencoder. We show that the\nestimated severity has strong correlation with the true corruption level and\ncan give useful hints at the difficulty of reconstruction problems on a\nsample-by-sample basis. Furthermore, we propose a reconstruction method based\non latent diffusion models that leverages the predicted degradation severities\nto fine-tune the reverse diffusion sampling trajectory and thus achieve\nsample-adaptive inference times. We utilize latent diffusion posterior sampling\nto maintain data consistency with observations. We perform experiments on both\nlinear and nonlinear inverse problems and demonstrate that our technique\nachieves performance comparable to state-of-the-art diffusion-based techniques,\nwith significant improvements in computational efficiency.\n","authors":["Zalan Fabian","Berk Tinaz","Mahdi Soltanolkotabi"],"pdf_url":"https://arxiv.org/pdf/2309.06642v1.pdf","comment":"14 pages, 6 figures, preliminary version"},{"id":"http://arxiv.org/abs/2205.08340v4","updated":"2023-09-12T23:36:23Z","published":"2022-05-17T13:34:45Z","title":"A unified framework for dataset shift diagnostics","summary":" Supervised learning techniques typically assume training data originates from\nthe target population. Yet, in reality, dataset shift frequently arises, which,\nif not adequately taken into account, may decrease the performance of their\npredictors. 
In this work, we propose a novel and flexible framework called\nDetectShift that quantifies and tests for multiple dataset shifts, encompassing\nshifts in the distributions of $(X, Y)$, $X$, $Y$, $X|Y$, and $Y|X$.\nDetectShift equips practitioners with insights into data shifts, facilitating\nthe adaptation or retraining of predictors using both source and target data.\nThis proves extremely valuable when labeled samples in the target domain are\nlimited. The framework utilizes test statistics with the same nature to\nquantify the magnitude of the various shifts, making results more\ninterpretable. It is versatile, suitable for regression and classification\ntasks, and accommodates diverse data forms - tabular, text, or image.\nExperimental results demonstrate the effectiveness of DetectShift in detecting\ndataset shifts even in higher dimensions.\n","authors":["Felipe Maia Polo","Rafael Izbicki","Evanildo Gomes Lacerda Jr","Juan Pablo Ibieta-Jimenez","Renato Vicente"],"pdf_url":"https://arxiv.org/pdf/2205.08340v4.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.06641v1","updated":"2023-09-12T23:24:38Z","published":"2023-09-12T23:24:38Z","title":"Quantum Data Center: Perspectives","summary":" A quantum version of data centers might be significant in the quantum era. In\nthis paper, we introduce Quantum Data Center (QDC), a quantum version of\nexisting classical data centers, with a specific emphasis on combining Quantum\nRandom Access Memory (QRAM) and quantum networks. We argue that QDC will\nprovide significant benefits to customers in terms of efficiency, security, and\nprecision, and will be helpful for quantum computing, communication, and\nsensing. We investigate potential scientific and business opportunities along\nthis novel research direction through hardware realization and possible\nspecific applications. We show the possible impacts of QDCs in business and\nscience, especially the machine learning and big data industries.\n","authors":["Junyu Liu","Liang Jiang"],"pdf_url":"https://arxiv.org/pdf/2309.06641v1.pdf","comment":"9 pages, many figures. This is a perspective papers introducing the\n ideas and impacts of quantum data centers in arXiv:2207.14336"},{"id":"http://arxiv.org/abs/2202.09518v4","updated":"2023-09-12T23:16:07Z","published":"2022-02-19T03:49:21Z","title":"Distributed Out-of-Memory NMF on CPU/GPU Architectures","summary":" We propose an efficient distributed out-of-memory implementation of the\nNon-negative Matrix Factorization (NMF) algorithm for heterogeneous\nhigh-performance-computing (HPC) systems. The proposed implementation is based\non prior work on NMFk, which can perform automatic model selection and extract\nlatent variables and patterns from data. In this work, we extend NMFk by adding\nsupport for dense and sparse matrix operation on multi-node, multi-GPU systems.\nThe resulting algorithm is optimized for out-of-memory (OOM) problems where the\nmemory required to factorize a given matrix is greater than the available GPU\nmemory. Memory complexity is reduced by batching/tiling strategies, and sparse\nand dense matrix operations are significantly accelerated with GPU cores (or\ntensor cores when available). 
Input/Output (I/O) latency associated with batch\ncopies between host and device is hidden using CUDA streams to overlap data\ntransfers and compute asynchronously, and latency associated with collective\ncommunications (both intra-node and inter-node) is reduced using optimized\nNVIDIA Collective Communication Library NCCL based communicators. Benchmark\nresults show significant improvement, from 32X to 76x speedup, with the new\nimplementation using GPUs over the CPU-based NMFk. Good weak scaling was\ndemonstrated on up to 4096 multi-GPU cluster nodes with approximately 25,000\nGPUs when decomposing a dense 340 Terabyte-size matrix and an 11 Exabyte-size\nsparse matrix of density 10e-6.\n","authors":["Ismael Boureima","Manish Bhattarai","Maksim Eren","Erik Skau","Philip Romero","Stephan Eidenbenz","Boian Alexandrov"],"pdf_url":"https://arxiv.org/pdf/2202.09518v4.pdf","comment":"Accepted at Journal of Supercomputing"},{"id":"http://arxiv.org/abs/2309.06634v1","updated":"2023-09-12T22:51:16Z","published":"2023-09-12T22:51:16Z","title":"$G$-Mapper: Learning a Cover in the Mapper Construction","summary":" The Mapper algorithm is a visualization technique in topological data\nanalysis (TDA) that outputs a graph reflecting the structure of a given\ndataset. The Mapper algorithm requires tuning several parameters in order to\ngenerate a \"nice\" Mapper graph. The paper focuses on selecting the cover\nparameter. We present an algorithm that optimizes the cover of a Mapper graph\nby splitting a cover repeatedly according to a statistical test for normality.\nOur algorithm is based on $G$-means clustering which searches for the optimal\nnumber of clusters in $k$-means by conducting iteratively the Anderson-Darling\ntest. Our splitting procedure employs a Gaussian mixture model in order to\nchoose carefully the cover based on the distribution of a given data.\nExperiments for synthetic and real-world datasets demonstrate that our\nalgorithm generates covers so that the Mapper graphs retain the essence of the\ndatasets.\n","authors":["Enrique Alvarado","Robin Belton","Emily Fischer","Kang-Ju Lee","Sourabh Palande","Sarah Percival","Emilie Purvine"],"pdf_url":"https://arxiv.org/pdf/2309.06634v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2207.14336v3","updated":"2023-09-12T22:43:31Z","published":"2022-07-28T18:34:36Z","title":"Data centers with quantum random access memory and quantum networks","summary":" In this paper, we propose the Quantum Data Center (QDC), an architecture\ncombining Quantum Random Access Memory (QRAM) and quantum networks. We give a\nprecise definition of QDC, and discuss its possible realizations and\nextensions. We discuss applications of QDC in quantum computation, quantum\ncommunication, and quantum sensing, with a primary focus on QDC for $T$-gate\nresources, QDC for multi-party private quantum communication, and QDC for\ndistributed sensing through data compression. We show that QDC will provide\nefficient, private, and fast services as a future version of data centers.\n","authors":["Junyu Liu","Connor T. 
Hann","Liang Jiang"],"pdf_url":"https://arxiv.org/pdf/2207.14336v3.pdf","comment":"23 pages, many figures"},{"id":"http://arxiv.org/abs/2309.06628v1","updated":"2023-09-12T22:34:34Z","published":"2023-09-12T22:34:34Z","title":"Epistemic Modeling Uncertainty of Rapid Neural Network Ensembles for\n Adaptive Learning","summary":" Emulator embedded neural networks, which are a type of physics informed\nneural network, leverage multi-fidelity data sources for efficient design\nexploration of aerospace engineering systems. Multiple realizations of the\nneural network models are trained with different random initializations. The\nensemble of model realizations is used to assess epistemic modeling uncertainty\ncaused due to lack of training samples. This uncertainty estimation is crucial\ninformation for successful goal-oriented adaptive learning in an aerospace\nsystem design exploration. However, the costs of training the ensemble models\noften become prohibitive and pose a computational challenge, especially when\nthe models are not trained in parallel during adaptive learning. In this work,\na new type of emulator embedded neural network is presented using the rapid\nneural network paradigm. Unlike the conventional neural network training that\noptimizes the weights and biases of all the network layers by using\ngradient-based backpropagation, rapid neural network training adjusts only the\nlast layer connection weights by applying a linear regression technique. It is\nfound that the proposed emulator embedded neural network trains\nnear-instantaneously, typically without loss of prediction accuracy. The\nproposed method is demonstrated on multiple analytical examples, as well as an\naerospace flight parameter study of a generic hypersonic vehicle.\n","authors":["Atticus Beachy","Harok Bae","Jose Camberos","Ramana Grandhi"],"pdf_url":"https://arxiv.org/pdf/2309.06628v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.06627v1","updated":"2023-09-12T22:31:57Z","published":"2023-09-12T22:31:57Z","title":"A Sequentially Fair Mechanism for Multiple Sensitive Attributes","summary":" In the standard use case of Algorithmic Fairness, the goal is to eliminate\nthe relationship between a sensitive variable and a corresponding score.\nThroughout recent years, the scientific community has developed a host of\ndefinitions and tools to solve this task, which work well in many practical\napplications. However, the applicability and effectivity of these tools and\ndefinitions becomes less straightfoward in the case of multiple sensitive\nattributes. To tackle this issue, we propose a sequential framework, which\nallows to progressively achieve fairness across a set of sensitive features. We\naccomplish this by leveraging multi-marginal Wasserstein barycenters, which\nextends the standard notion of Strong Demographic Parity to the case with\nmultiple sensitive characteristics. This method also provides a closed-form\nsolution for the optimal, sequentially fair predictor, permitting a clear\ninterpretation of inter-sensitive feature correlations. Our approach seamlessly\nextends to approximate fairness, enveloping a framework accommodating the\ntrade-off between risk and unfairness. This extension permits a targeted\nprioritization of fairness improvements for a specific attribute within a set\nof sensitive attributes, allowing for a case specific adaptation. 
A data-driven\nestimation procedure for the derived solution is developed, and comprehensive\nnumerical experiments are conducted on both synthetic and real datasets. Our\nempirical findings decisively underscore the practical efficacy of our\npost-processing approach in fostering fair decision-making.\n","authors":["François Hu","Philipp Ratz","Arthur Charpentier"],"pdf_url":"https://arxiv.org/pdf/2309.06627v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.06626v1","updated":"2023-09-12T22:28:53Z","published":"2023-09-12T22:28:53Z","title":"Accelerating Deep Neural Networks via Semi-Structured Activation\n Sparsity","summary":" The demand for efficient processing of deep neural networks (DNNs) on\nembedded devices is a significant challenge limiting their deployment.\nExploiting sparsity in the network's feature maps is one of the ways to reduce\nits inference latency. It is known that unstructured sparsity results in lower\naccuracy degradation with respect to structured sparsity but the former needs\nextensive inference engine changes to get latency benefits. To tackle this\nchallenge, we propose a solution to induce semi-structured activation sparsity\nexploitable through minor runtime modifications. To attain high speedup levels\nat inference time, we design a sparse training procedure with awareness of the\nfinal position of the activations while computing the General Matrix\nMultiplication (GEMM). We extensively evaluate the proposed solution across\nvarious models for image classification and object detection tasks. Remarkably,\nour approach yields a speed improvement of $1.25 \\times$ with a minimal\naccuracy drop of $1.1\\%$ for the ResNet18 model on the ImageNet dataset.\nFurthermore, when combined with a state-of-the-art structured pruning method,\nthe resulting models provide a good latency-accuracy trade-off, outperforming\nmodels that solely employ structured pruning techniques.\n","authors":["Matteo Grimaldi","Darshan C. Ganji","Ivan Lazarevich","Sudhakar Sah"],"pdf_url":"https://arxiv.org/pdf/2309.06626v1.pdf","comment":"Code is available at http://github.com/Deeplite/activ-sparse"},{"id":"http://arxiv.org/abs/2309.06622v1","updated":"2023-09-12T22:24:05Z","published":"2023-09-12T22:24:05Z","title":"On the Contraction Coefficient of the Schrödinger Bridge for\n Stochastic Linear Systems","summary":" Schr\\\"{o}dinger bridge is a stochastic optimal control problem to steer a\ngiven initial state density to another, subject to controlled diffusion and\ndeadline constraints. A popular method to numerically solve the Schr\\\"{o}dinger\nbridge problems, in both classical and in the linear system settings, is via\ncontractive fixed point recursions. These recursions can be seen as dynamic\nversions of the well-known Sinkhorn iterations, and under mild assumptions,\nthey solve the so-called Schr\\\"{o}dinger systems with guaranteed linear\nconvergence. In this work, we study a priori estimates for the contraction\ncoefficients associated with the convergence of respective Schr\\\"{o}dinger\nsystems. We provide new geometric and control-theoretic interpretations for the\nsame. Building on these newfound interpretations, we point out the possibility\nof improved computation for the worst-case contraction coefficients of linear\nSBPs by preconditioning the endpoint support sets.\n","authors":["Alexis M. H. 
Teter","Yongxin Chen","Abhishek Halder"],"pdf_url":"https://arxiv.org/pdf/2309.06622v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.06619v1","updated":"2023-09-12T22:22:10Z","published":"2023-09-12T22:22:10Z","title":"RT-LM: Uncertainty-Aware Resource Management for Real-Time Inference of\n Language Models","summary":" Recent advancements in language models (LMs) have gained substantial\nattentions on their capability to generate human-like responses. Though\nexhibiting a promising future for various applications such as conversation AI,\nthese LMs face deployment challenges on various devices due to their extreme\ncomputational cost and unpredictable inference latency. Such varied inference\nlatency, identified as a consequence of uncertainty intrinsic to the nature of\nlanguage, can lead to computational inefficiency and degrade the overall\nperformance of LMs, especially under high-traffic workloads. Unfortunately, the\nbandwidth of these uncertainty sources is extensive, complicating the\nprediction of latency and the effects emanating from such uncertainties. To\nunderstand and mitigate the impact of uncertainty on real-time\nresponse-demanding systems, we take the first step to comprehend, quantify and\noptimize these uncertainty-induced latency performance variations in LMs.\nSpecifically, we present RT-LM, an uncertainty-aware resource management\necosystem for real-time inference of LMs. RT-LM innovatively quantifies how\nspecific input uncertainties, adversely affect latency, often leading to an\nincreased output length. Exploiting these insights, we devise a lightweight yet\neffective method to dynamically correlate input text uncertainties with output\nlength at runtime. Utilizing this quantification as a latency heuristic, we\nintegrate the uncertainty information into a system-level scheduler which\nexplores several uncertainty-induced optimization opportunities, including\nuncertainty-aware prioritization, dynamic consolidation, and strategic CPU\noffloading. Quantitative experiments across five state-of-the-art LMs on two\nhardware platforms demonstrates that RT-LM can significantly reduce the average\nresponse time and improve throughput while incurring a rather small runtime\noverhead.\n","authors":["Yufei Li","Zexin Li","Wei Yang","Cong Liu"],"pdf_url":"https://arxiv.org/pdf/2309.06619v1.pdf","comment":"Accepted by RTSS 2023"},{"id":"http://arxiv.org/abs/2208.08464v2","updated":"2023-09-12T22:19:00Z","published":"2022-08-17T18:09:19Z","title":"CTRL: Clustering Training Losses for Label Error Detection","summary":" In supervised machine learning, use of correct labels is extremely important\nto ensure high accuracy. Unfortunately, most datasets contain corrupted labels.\nMachine learning models trained on such datasets do not generalize well. Thus,\ndetecting their label errors can significantly increase their efficacy. We\npropose a novel framework, called CTRL (Clustering TRaining Losses for label\nerror detection), to detect label errors in multi-class datasets. It detects\nlabel errors in two steps based on the observation that models learn clean and\nnoisy labels in different ways. First, we train a neural network using the\nnoisy training dataset and obtain the loss curve for each sample. Then, we\napply clustering algorithms to the training losses to group samples into two\ncategories: cleanly-labeled and noisily-labeled. After label error detection,\nwe remove samples with noisy labels and retrain the model. 
Our experimental\nresults demonstrate state-of-the-art error detection accuracy on both image\n(CIFAR-10 and CIFAR-100) and tabular datasets under simulated noise. We also\nuse a theoretical analysis to provide insights into why CTRL performs so well.\n","authors":["Chang Yue","Niraj K. Jha"],"pdf_url":"https://arxiv.org/pdf/2208.08464v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.06613v1","updated":"2023-09-12T21:45:33Z","published":"2023-09-12T21:45:33Z","title":"Unsupervised Learning of Nanoindentation Data to Infer Microstructural\n Details of Complex Materials","summary":" In this study, Cu-Cr composites were studied by nanoindentation. Arrays of\nindents were placed over large areas of the samples resulting in datasets\nconsisting of several hundred measurements of Young's modulus and hardness at\nvarying indentation depths. The unsupervised learning technique, Gaussian\nmixture model, was employed to analyze the data, which helped to determine the\nnumber of \"mechanical phases\" and the respective mechanical properties.\nAdditionally, a cross-validation approach was introduced to infer whether the\ndata quantity was adequate and to suggest the amount of data required for\nreliable predictions -- one of the often encountered but difficult to resolve\nissues in machine learning of materials science problems.\n","authors":["Chen Zhang","Clémence Bos","Stefan Sandfeld","Ruth Schwaiger"],"pdf_url":"https://arxiv.org/pdf/2309.06613v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.06612v1","updated":"2023-09-12T21:37:26Z","published":"2023-09-12T21:37:26Z","title":"Harmonic-NAS: Hardware-Aware Multimodal Neural Architecture Search on\n Resource-constrained Devices","summary":" The recent surge of interest surrounding Multimodal Neural Networks (MM-NN)\nis attributed to their ability to effectively process and integrate information\nfrom diverse data sources. In MM-NN, features are extracted and fused from\nmultiple modalities using adequate unimodal backbones and specific fusion\nnetworks. Although this helps strengthen the multimodal information\nrepresentation, designing such networks is labor-intensive. It requires tuning\nthe architectural parameters of the unimodal backbones, choosing the fusing\npoint, and selecting the operations for fusion. Furthermore, multimodality AI\nis emerging as a cutting-edge option in Internet of Things (IoT) systems where\ninference latency and energy consumption are critical metrics in addition to\naccuracy. In this paper, we propose Harmonic-NAS, a framework for the joint\noptimization of unimodal backbones and multimodal fusion networks with hardware\nawareness on resource-constrained devices. Harmonic-NAS involves a two-tier\noptimization approach for the unimodal backbone architectures and fusion\nstrategy and operators. 
By incorporating the hardware dimension into the\noptimization, evaluation results on various devices and multimodal datasets\nhave demonstrated the superiority of Harmonic-NAS over state-of-the-art\napproaches achieving up to 10.9% accuracy improvement, 1.91x latency reduction,\nand 2.14x energy efficiency gain.\n","authors":["Mohamed Imed Eddine Ghebriout","Halima Bouzidi","Smail Niar","Hamza Ouarnoughi"],"pdf_url":"https://arxiv.org/pdf/2309.06612v1.pdf","comment":"Accepted to the 15th Asian Conference on Machine Learning (ACML 2023)"},{"id":"http://arxiv.org/abs/2309.06604v1","updated":"2023-09-12T21:07:23Z","published":"2023-09-12T21:07:23Z","title":"Hybrid Algorithm Selection and Hyperparameter Tuning on Distributed\n Machine Learning Resources: A Hierarchical Agent-based Approach","summary":" Algorithm selection and hyperparameter tuning are critical steps in both\nacademic and applied machine learning. On the other hand, these steps are\nbecoming ever increasingly delicate due to the extensive rise in the number,\ndiversity, and distributedness of machine learning resources. Multi-agent\nsystems, when applied to the design of machine learning platforms, bring about\nseveral distinctive characteristics such as scalability, flexibility, and\nrobustness, just to name a few. This paper proposes a fully automatic and\ncollaborative agent-based mechanism for selecting distributedly organized\nmachine learning algorithms and simultaneously tuning their hyperparameters.\nOur method builds upon an existing agent-based hierarchical machine-learning\nplatform and augments its query structure to support the aforementioned\nfunctionalities without being limited to specific learning, selection, and\ntuning mechanisms. We have conducted theoretical assessments, formal\nverification, and analytical study to demonstrate the correctness, resource\nutilization, and computational efficiency of our technique. According to the\nresults, our solution is totally correct and exhibits linear time and space\ncomplexity in relation to the size of available resources. To provide concrete\nexamples of how the proposed methodologies can effectively adapt and perform\nacross a range of algorithmic options and datasets, we have also conducted a\nseries of experiments using a system comprised of 24 algorithms and 9 datasets.\n","authors":["Ahmad Esmaeili","Eric T. Matson","Julia T. Rayz"],"pdf_url":"https://arxiv.org/pdf/2309.06604v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.06599v1","updated":"2023-09-12T20:58:21Z","published":"2023-09-12T20:58:21Z","title":"Reasoning with Latent Diffusion in Offline Reinforcement Learning","summary":" Offline reinforcement learning (RL) holds promise as a means to learn\nhigh-reward policies from a static dataset, without the need for further\nenvironment interactions. However, a key challenge in offline RL lies in\neffectively stitching portions of suboptimal trajectories from the static\ndataset while avoiding extrapolation errors arising due to a lack of support in\nthe dataset. Existing approaches use conservative methods that are tricky to\ntune and struggle with multi-modal data (as we show) or rely on noisy Monte\nCarlo return-to-go samples for reward conditioning. In this work, we propose a\nnovel approach that leverages the expressiveness of latent diffusion to model\nin-support trajectory sequences as compressed latent skills. This facilitates\nlearning a Q-function while avoiding extrapolation error via\nbatch-constraining. 
The latent space is also expressive and gracefully copes\nwith multi-modal data. We show that the learned temporally-abstract latent\nspace encodes richer task-specific information for offline RL tasks as compared\nto raw state-actions. This improves credit assignment and facilitates faster\nreward propagation during Q-learning. Our method demonstrates state-of-the-art\nperformance on the D4RL benchmarks, particularly excelling in long-horizon,\nsparse-reward tasks.\n","authors":["Siddarth Venkatraman","Shivesh Khaitan","Ravi Tej Akella","John Dolan","Jeff Schneider","Glen Berseth"],"pdf_url":"https://arxiv.org/pdf/2309.06599v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.15321v3","updated":"2023-09-12T20:51:09Z","published":"2023-08-29T14:16:09Z","title":"Elucidating the Exposure Bias in Diffusion Models","summary":" Diffusion models have demonstrated impressive generative capabilities, but\ntheir 'exposure bias' problem, described as the input mismatch between training\nand sampling, lacks in-depth exploration. In this paper, we systematically\ninvestigate the exposure bias problem in diffusion models by first analytically\nmodelling the sampling distribution, based on which we then identify the\nprediction error at each sampling step as the root cause of the exposure bias\nissue. Furthermore, we discuss potential solutions to this issue and propose an\nintuitive metric for it. Along with the elucidation of exposure bias, we\npropose a simple, yet effective, training-free method called Epsilon Scaling to\nalleviate the exposure bias. We show that Epsilon Scaling explicitly moves the\nsampling trajectory closer to the vector field learned in the training phase by\nscaling down the network output (Epsilon), mitigating the input mismatch\nbetween training and sampling. Experiments on various diffusion frameworks\n(ADM, DDPM/DDIM, EDM, LDM), unconditional and conditional settings, and\ndeterministic vs. stochastic sampling verify the effectiveness of our method.\nThe code is available at https://github.com/forever208/ADM-ES;\nhttps://github.com/forever208/EDM-ES\n","authors":["Mang Ning","Mingxiao Li","Jianlin Su","Albert Ali Salah","Itir Onal Ertugrul"],"pdf_url":"https://arxiv.org/pdf/2308.15321v3.pdf","comment":"8 pages"},{"id":"http://arxiv.org/abs/2309.06597v1","updated":"2023-09-12T20:51:07Z","published":"2023-09-12T20:51:07Z","title":"Rank2Tell: A Multimodal Driving Dataset for Joint Importance Ranking and\n Reasoning","summary":" The widespread adoption of commercial autonomous vehicles (AVs) and advanced\ndriver assistance systems (ADAS) may largely depend on their acceptance by\nsociety, for which their perceived trustworthiness and interpretability to\nriders are crucial. In general, this task is challenging because modern\nautonomous systems software relies heavily on black-box artificial intelligence\nmodels. Towards this goal, this paper introduces a novel dataset, Rank2Tell, a\nmulti-modal ego-centric dataset for Ranking the importance level and Telling\nthe reason for the importance. Using various closed and open-ended visual\nquestion answering, the dataset provides dense annotations of various semantic,\nspatial, temporal, and relational attributes of various important objects in\ncomplex traffic scenarios. The dense annotations and unique attributes of the\ndataset make it a valuable resource for researchers working on visual scene\nunderstanding and related fields. 
Further, we introduce a joint model for\nimportance level ranking and natural language caption generation to benchmark\nour dataset and demonstrate performance with quantitative evaluations.\n","authors":["Enna Sachdeva","Nakul Agarwal","Suhas Chundi","Sean Roelofs","Jiachen Li","Behzad Dariush","Chiho Choi","Mykel Kochenderfer"],"pdf_url":"https://arxiv.org/pdf/2309.06597v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.06588v1","updated":"2023-09-12T20:24:37Z","published":"2023-09-12T20:24:37Z","title":"Convergence of Gradient-based MAML in LQR","summary":" The main objective of this research paper is to investigate the local\nconvergence characteristics of Model-agnostic Meta-learning (MAML) when applied\nto linear system quadratic optimal control (LQR). MAML and its variations have\nbecome popular techniques for quickly adapting to new tasks by leveraging\nprevious learning knowledge in areas like regression, classification, and\nreinforcement learning. However, its theoretical guarantees remain unknown due\nto non-convexity and its structure, making it even more challenging to ensure\nstability in the dynamic system setting. This study focuses on exploring MAML\nin the LQR setting, providing its local convergence guarantees while\nmaintaining the stability of the dynamical system. The paper also presents\nsimple numerical results to demonstrate the convergence properties of MAML in\nLQR tasks.\n","authors":["Negin Musavi","Geir E. Dullerud"],"pdf_url":"https://arxiv.org/pdf/2309.06588v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.05153v2","updated":"2023-09-12T20:23:34Z","published":"2023-09-10T22:05:24Z","title":"Learning Energy-Based Models by Cooperative Diffusion Recovery\n Likelihood","summary":" Training energy-based models (EBMs) with maximum likelihood estimation on\nhigh-dimensional data can be both challenging and time-consuming. As a result,\nthere is a noticeable gap in sample quality between EBMs and other generative\nframeworks like GANs and diffusion models. To close this gap, inspired by the\nrecent efforts of learning EBMs by maximizing diffusion recovery likelihood\n(DRL), we propose cooperative diffusion recovery likelihood (CDRL), an\neffective approach to tractably learn and sample from a series of EBMs defined\non increasingly noisy versions of a dataset, paired with an initializer model\nfor each EBM. At each noise level, the initializer model learns to amortize the\nsampling process of the EBM, and the two models are jointly estimated within a\ncooperative training framework. Samples from the initializer serve as starting\npoints that are refined by a few sampling steps from the EBM. With the refined\nsamples, the EBM is optimized by maximizing recovery likelihood, while the\ninitializer is optimized by learning from the difference between the refined\nsamples and the initial samples. We develop a new noise schedule and a variance\nreduction technique to further improve the sample quality. Combining these\nadvances, we significantly boost the FID scores compared to existing EBM\nmethods on CIFAR-10 and ImageNet 32x32, with a 2x speedup over DRL. 
In\naddition, we extend our method to compositional generation and image inpainting\ntasks, and showcase the compatibility of CDRL with classifier-free guidance for\nconditional generation, achieving similar trade-offs between sample quality and\nsample diversity as in diffusion models.\n","authors":["Yaxuan Zhu","Jianwen Xie","Yingnian Wu","Ruiqi Gao"],"pdf_url":"https://arxiv.org/pdf/2309.05153v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.05605v2","updated":"2023-09-12T20:18:20Z","published":"2023-09-11T16:39:30Z","title":"Memory Injections: Correcting Multi-Hop Reasoning Failures during\n Inference in Transformer-Based Language Models","summary":" Answering multi-hop reasoning questions requires retrieving and synthesizing\ninformation from diverse sources. Large Language Models (LLMs) struggle to\nperform such reasoning consistently. Here we propose an approach to pinpoint\nand rectify multi-hop reasoning failures through targeted memory injections on\nLLM attention heads. First, we analyze the per-layer activations of GPT-2\nmodels in response to single and multi-hop prompts. We then propose a mechanism\nthat allows users to inject pertinent prompt-specific information, which we\nrefer to as \"memories,\" at critical LLM locations during inference. By thus\nenabling the LLM to incorporate additional relevant information during\ninference, we enhance the quality of multi-hop prompt completions. We show\nempirically that a simple, efficient, and targeted memory injection into a key\nattention layer can often increase the probability of the desired next token in\nmulti-hop tasks, by up to 424%.\n","authors":["Mansi Sakarvadia","Aswathy Ajith","Arham Khan","Daniel Grzenda","Nathaniel Hudson","André Bauer","Kyle Chard","Ian Foster"],"pdf_url":"https://arxiv.org/pdf/2309.05605v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.01516v2","updated":"2023-09-12T20:16:04Z","published":"2023-09-04T10:48:29Z","title":"MultiWay-Adapater: Adapting large-scale multi-modal models for scalable\n image-text retrieval","summary":" As the size of Large Multi-Modal Models (LMMs) increases consistently, the\nadaptation of these pre-trained models to specialized tasks has become a\ncomputationally and memory-intensive challenge. Traditional fine-tuning methods\nrequire isolated, exhaustive retuning for each new task, limiting the models'\nversatility. Moreover, current efficient adaptation techniques often overlook\nmodality alignment, focusing only on the knowledge extraction of new tasks. To\ntackle these issues, we introduce Multiway-Adapter, an innovative framework\nincorporating an 'Alignment Enhancer' to deepen modality alignment, enabling\nhigh transferability without tuning pre-trained parameters. Our method adds\nfewer than 1.25\\% of additional parameters to LMMs, exemplified by the BEiT-3\nmodel in our study. This leads to superior zero-shot image-text retrieval\nperformance compared to fully fine-tuned models, while achieving up to a 57\\%\nreduction in fine-tuning time. Our approach offers a resource-efficient and\neffective adaptation pathway for LMMs, broadening their applicability. 
The\nsource code is publicly available at:\n\url{https://github.com/longkukuhi/MultiWay-Adapter}.\n","authors":["Zijun Long","George Killick","Richard McCreadie","Gerardo Aragon Camarasa"],"pdf_url":"https://arxiv.org/pdf/2309.01516v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.06584v1","updated":"2023-09-12T20:12:08Z","published":"2023-09-12T20:12:08Z","title":"Explainable Graph Neural Network for Alzheimer's Disease And Related\n Dementias Risk Prediction","summary":" Alzheimer's disease and related dementias (ADRD) ranks as the sixth leading\ncause of death in the US, underlining the importance of accurate ADRD risk\nprediction. While recent advancements in ADRD risk prediction have primarily\nrelied on imaging analysis, not all patients undergo medical imaging before\nan ADRD diagnosis. Merging machine learning with claims data can reveal\nadditional risk factors and uncover interconnections among diverse medical\ncodes. Our goal is to utilize Graph Neural Networks (GNNs) with claims data for\nADRD risk prediction. Addressing the lack of human-interpretable reasons behind\nthese predictions, we introduce an innovative method to evaluate relationship\nimportance and its influence on ADRD risk prediction, ensuring comprehensive\ninterpretation.\n We employed Variationally Regularized Encoder-decoder Graph Neural Network\n(VGNN) for estimating ADRD likelihood. We created three scenarios to assess the\nmodel's efficiency, using Random Forest and Light Gradient Boost Machine as\nbaselines. We further used our relation importance method to clarify the key\nrelationships for ADRD risk prediction. VGNN surpassed other baseline models by\n10% in the area under the receiver operating characteristic curve. The integration of\nthe GNN model and relation importance interpretation could potentially play an\nessential role in providing valuable insight into factors that may contribute\nto or delay ADRD progression.\n Employing a GNN approach with claims data enhances ADRD risk prediction and\nprovides insights into the impact of interconnected medical code relationships.\nThis methodology not only enables ADRD risk modeling but also shows potential\nfor other image analysis predictions using claims data.\n","authors":["Xinyue Hu","Zenan Sun","Yi Nian","Yifang Dang","Fang Li","Jingna Feng","Evan Yu","Cui Tao"],"pdf_url":"https://arxiv.org/pdf/2309.06584v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.06582v1","updated":"2023-09-12T20:09:59Z","published":"2023-09-12T20:09:59Z","title":"Electron Energy Regression in the CMS High-Granularity Calorimeter\n Prototype","summary":" We present a new publicly available dataset that contains simulated data of a\nnovel calorimeter to be installed at the CERN Large Hadron Collider. This\ndetector will have more than six-million channels with each channel capable of\nposition, ionisation and precision time measurement. Reconstructing these\nevents in an efficient way poses an immense challenge which is being addressed\nwith the latest machine learning techniques. As part of this development a\nlarge prototype with 12,000 channels was built and a beam of high-energy\nelectrons was incident on it. Using machine learning methods we have reconstructed\nthe energy of incident electrons from the energies of three-dimensional hits,\nwhich is known to some precision. 
By releasing this data publicly we hope to\nencourage experts in the application of machine learning to develop efficient\nand accurate image reconstruction of these electrons.\n","authors":["Roger Rusack","Bhargav Joshi","Alpana Alpana","Seema Sharma","Thomas Vadnais"],"pdf_url":"https://arxiv.org/pdf/2309.06582v1.pdf","comment":"7 pages, 6 figures"},{"id":"http://arxiv.org/abs/2302.05185v4","updated":"2023-09-12T20:09:08Z","published":"2023-02-10T11:30:19Z","title":"On Penalty-based Bilevel Gradient Descent Method","summary":" Bilevel optimization enjoys a wide range of applications in hyper-parameter\noptimization, meta-learning and reinforcement learning. However, bilevel\noptimization problems are difficult to solve. Recent progress on scalable\nbilevel algorithms mainly focuses on bilevel optimization problems where the\nlower-level objective is either strongly convex or unconstrained. In this work,\nwe tackle the bilevel problem through the lens of the penalty method. We show\nthat under certain conditions, the penalty reformulation recovers the solutions\nof the original bilevel problem. Further, we propose the penalty-based bilevel\ngradient descent (PBGD) algorithm and establish its finite-time convergence for\nthe constrained bilevel problem without lower-level strong convexity.\nExperiments showcase the efficiency of the proposed PBGD algorithm.\n","authors":["Han Shen","Quan Xiao","Tianyi Chen"],"pdf_url":"https://arxiv.org/pdf/2302.05185v4.pdf","comment":"Improved Section 4 by removing a critical assumption; Added Section 5\n and citations"},{"id":"http://arxiv.org/abs/2309.06569v1","updated":"2023-09-12T20:04:16Z","published":"2023-09-12T20:04:16Z","title":"Promises of Deep Kernel Learning for Control Synthesis","summary":" Deep Kernel Learning (DKL) combines the representational power of neural\nnetworks with the uncertainty quantification of Gaussian Processes. Hence, it\nis potentially a promising tool to learn and control complex dynamical systems.\nIn this work, we develop a scalable abstraction-based framework that enables\nthe use of DKL for control synthesis of stochastic dynamical systems against\ncomplex specifications. Specifically, we consider temporal logic specifications\nand create an end-to-end framework that uses DKL to learn an unknown system\nfrom data and formally abstracts the DKL model into an Interval Markov Decision\nProcess (IMDP) to perform control synthesis with correctness guarantees.\nFurthermore, we identify a deep architecture that enables accurate learning and\nefficient abstraction computation. The effectiveness of our approach is\nillustrated on various benchmarks, including a 5-D nonlinear stochastic system,\nshowing how control synthesis with DKL can substantially outperform\nstate-of-the-art competitive methods.\n","authors":["Robert Reed","Luca Laurenti","Morteza Lahijanian"],"pdf_url":"https://arxiv.org/pdf/2309.06569v1.pdf","comment":"9 pages, 4 figures, 3 tables"},{"id":"http://arxiv.org/abs/2309.06551v1","updated":"2023-09-12T19:52:27Z","published":"2023-09-12T19:52:27Z","title":"Commands as AI Conversations","summary":" Developers and data scientists often struggle to write command-line inputs,\neven though graphical interfaces or tools like ChatGPT can assist. The\nsolution? \"ai-cli,\" an open-source system inspired by GitHub Copilot that\nconverts natural language prompts into executable commands for various Linux\ncommand-line tools. 
By tapping into OpenAI's API, which allows interaction\nthrough JSON HTTP requests, \"ai-cli\" transforms user queries into actionable\ncommand-line instructions. However, integrating AI assistance across multiple\ncommand-line tools, especially in open source settings, can be complex.\nHistorically, operating systems could mediate, but individual tool\nfunctionality and the lack of a unified approach have made centralized\nintegration challenging. The \"ai-cli\" tool, by bridging this gap through\ndynamic loading and linking with each program's Readline library API, makes\ncommand-line interfaces smarter and more user-friendly, opening avenues for\nfurther enhancement and cross-platform applicability.\n","authors":["Diomidis Spinellis"],"pdf_url":"https://arxiv.org/pdf/2309.06551v1.pdf","comment":"5 pages"},{"id":"http://arxiv.org/abs/2309.06534v1","updated":"2023-09-12T19:18:52Z","published":"2023-09-12T19:18:52Z","title":"Distributionally Robust Transfer Learning","summary":" Many existing transfer learning methods rely on leveraging information from\nsource data that closely resembles the target data. However, this approach\noften overlooks valuable knowledge that may be present in different yet\npotentially related auxiliary samples. When dealing with a limited amount of\ntarget data and a diverse range of source models, our paper introduces a novel\napproach, Distributionally Robust Optimization for Transfer Learning\n(TransDRO), that breaks free from strict similarity constraints. TransDRO is\ndesigned to optimize the most adversarial loss within an uncertainty set,\ndefined as a collection of target populations generated as a convex combination\nof source distributions that guarantee excellent prediction performances for\nthe target data. TransDRO effectively bridges the realms of transfer learning\nand distributional robustness prediction models. We establish the\nidentifiability of TransDRO and its interpretation as a weighted average of\nsource models closest to the baseline model. We also show that TransDRO\nachieves a faster convergence rate than the model fitted with the target data.\nOur comprehensive numerical studies and analysis of multi-institutional\nelectronic health records data using TransDRO further substantiate the\nrobustness and accuracy of TransDRO, highlighting its potential as a powerful\ntool in transfer learning applications.\n","authors":["Xin Xiong","Zijian Guo","Tianxi Cai"],"pdf_url":"https://arxiv.org/pdf/2309.06534v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.06533v1","updated":"2023-09-12T19:11:34Z","published":"2023-09-12T19:11:34Z","title":"Hierarchical Multi-Task Learning Framework for Session-based\n Recommendations","summary":" While session-based recommender systems (SBRSs) have shown superior\nrecommendation performance, multi-task learning (MTL) has been adopted by SBRSs\nto enhance their prediction accuracy and generalizability further. Hierarchical\nMTL (H-MTL) sets a hierarchical structure between prediction tasks and feeds\noutputs from auxiliary tasks to main tasks. This hierarchy leads to richer\ninput features for main tasks and higher interpretability of predictions,\ncompared to existing MTL frameworks. However, the H-MTL framework has not been\ninvestigated in SBRSs yet. In this paper, we propose HierSRec which\nincorporates the H-MTL architecture into SBRSs. HierSRec encodes a given\nsession with a metadata-aware Transformer and performs next-category prediction\n(i.e., auxiliary task) with the session encoding. 
Next, HierSRec conducts\nnext-item prediction (i.e., main task) with the category prediction result and\nsession encoding. For scalable inference, HierSRec creates a compact set of\ncandidate items (e.g., 4% of total items) per test example using the category\nprediction. Experiments show that HierSRec outperforms existing SBRSs as per\nnext-item prediction accuracy on two session-based recommendation datasets. The\naccuracy of HierSRec measured with the carefully-curated candidate items aligns\nwith the accuracy of HierSRec calculated with all items, which validates the\nusefulness of our candidate generation scheme via H-MTL.\n","authors":["Sejoon Oh","Walid Shalaby","Amir Afsharinejad","Xiquan Cui"],"pdf_url":"https://arxiv.org/pdf/2309.06533v1.pdf","comment":"Accepted at the 6th Workshop on Online Recommender Systems and User\n Modeling @ ACM RecSys 2023"},{"id":"http://arxiv.org/abs/2309.06526v1","updated":"2023-09-12T19:08:26Z","published":"2023-09-12T19:08:26Z","title":"Exploring the Benefits of Differentially Private Pre-training and\n Parameter-Efficient Fine-tuning for Table Transformers","summary":" For machine learning with tabular data, Table Transformer (TabTransformer) is\na state-of-the-art neural network model, while Differential Privacy (DP) is an\nessential component to ensure data privacy. In this paper, we explore the\nbenefits of combining these two aspects together in the scenario of transfer\nlearning -- differentially private pre-training and fine-tuning of\nTabTransformers with a variety of parameter-efficient fine-tuning (PEFT)\nmethods, including Adapter, LoRA, and Prompt Tuning. Our extensive experiments\non the ACSIncome dataset show that these PEFT methods outperform traditional\napproaches in terms of the accuracy of the downstream task and the number of\ntrainable parameters, thus achieving an improved trade-off among parameter\nefficiency, privacy, and accuracy. Our code is available at\ngithub.com/IBM/DP-TabTransformer.\n","authors":["Xilong Wang","Chia-Mu Yu","Pin-Yu Chen"],"pdf_url":"https://arxiv.org/pdf/2309.06526v1.pdf","comment":"submitted to ICASSP 2024"},{"id":"http://arxiv.org/abs/2306.10313v2","updated":"2023-09-12T19:05:59Z","published":"2023-06-17T10:32:02Z","title":"Adversaries with Limited Information in the Friedkin--Johnsen Model","summary":" In recent years, online social networks have been the target of adversaries\nwho seek to introduce discord into societies, to undermine democracies and to\ndestabilize communities. Often the goal is not to favor a certain side of a\nconflict but to increase disagreement and polarization. To get a mathematical\nunderstanding of such attacks, researchers use opinion-formation models from\nsociology, such as the Friedkin--Johnsen model, and formally study how much\ndiscord the adversary can produce when altering the opinions for only a small\nset of users. In this line of work, it is commonly assumed that the adversary\nhas full knowledge about the network topology and the opinions of all users.\nHowever, the latter assumption is often unrealistic in practice, where user\nopinions are not available or simply difficult to estimate accurately.\n To address this concern, we raise the following question: Can an attacker sow\ndiscord in a social network, even when only the network topology is known? We\nanswer this question affirmatively. We present approximation algorithms for\ndetecting a small set of users who are highly influential for the disagreement\nand polarization in the network. 
We show that if the adversary radicalizes\nthese users and the initial disagreement/polarization in the network is not\nvery high, then our method gives a constant-factor approximation relative to the setting\nin which the user opinions are known. To find the set of influential users, we\nprovide a novel approximation algorithm for a variant of MaxCut in graphs with\npositive and negative edge weights. We experimentally evaluate our methods,\nwhich have access only to the network topology, and we find that they have\nsimilar performance as methods that have access to the network topology and all\nuser opinions. We further present an NP-hardness proof, which was an open\nquestion by Chen and Racz [IEEE Trans. Netw. Sci. Eng., 2021].\n","authors":["Sijing Tu","Stefan Neumann","Aristides Gionis"],"pdf_url":"https://arxiv.org/pdf/2306.10313v2.pdf","comment":"KDD'23"},{"id":"http://arxiv.org/abs/2304.06104v3","updated":"2023-09-12T19:03:48Z","published":"2023-04-12T18:37:52Z","title":"Primal-Dual Contextual Bayesian Optimization for Control System Online\n Optimization with Time-Average Constraints","summary":" This paper studies the problem of online performance optimization of\nconstrained closed-loop control systems, where both the objective and the\nconstraints are unknown black-box functions affected by exogenous time-varying\ncontextual disturbances. A primal-dual contextual Bayesian optimization\nalgorithm is proposed that achieves sublinear cumulative regret with respect to\nthe dynamic optimal solution under certain regularity conditions. Furthermore,\nthe algorithm achieves zero time-average constraint violation, ensuring that\nthe average value of the constraint function satisfies the desired constraint.\nThe method is applied to both sampled instances from Gaussian processes and a\ncontinuous stirred tank reactor parameter tuning problem; simulation results\nshow that the method simultaneously provides close-to-optimal performance and\nmaintains constraint feasibility on average. This contrasts with current\nstate-of-the-art methods, which either suffer from large cumulative regret or\nsevere constraint violations for the case studies presented.\n","authors":["Wenjie Xu","Yuning Jiang","Bratislav Svetozarevic","Colin N. Jones"],"pdf_url":"https://arxiv.org/pdf/2304.06104v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2301.06732v5","updated":"2023-09-12T18:51:05Z","published":"2023-01-17T07:22:54Z","title":"Solar Coronal Hole Analysis and Prediction using Computer Vision and\n LSTM Neural Network","summary":" As humanity has begun to explore space, the significance of space weather has\nbecome apparent. It has been established that coronal holes, a type of space\nweather phenomenon, can impact the operation of aircraft and satellites. The\ncoronal hole is an area on the sun characterized by open magnetic field lines\nand relatively low temperatures, which result in the emission of the solar wind\nat higher than average rates. In this study, to prepare for the impact of\ncoronal holes on the Earth, we use computer vision to detect the coronal hole\nregion and calculate its size based on images from the Solar Dynamics\nObservatory (SDO). We compare the coronal holes for each region of the Sun and\nanalyze the correlation. We then implement deep learning techniques,\nspecifically the Long Short-Term Memory (LSTM) method, to analyze trends in the\ncoronal hole area data and predict its size for different sun regions over 7\ndays. 
By analyzing time series data on the coronal hole area, this study aims\nto identify patterns and trends in coronal hole behavior and understand how\nthey may impact space weather events. This research represents an important\nstep towards improving our ability to predict and prepare for space weather\nevents that can affect Earth and technological systems.\n","authors":["Juyoung Yun"],"pdf_url":"https://arxiv.org/pdf/2301.06732v5.pdf","comment":"This is old technology"},{"id":"http://arxiv.org/abs/2309.06519v1","updated":"2023-09-12T18:50:24Z","published":"2023-09-12T18:50:24Z","title":"A Q-learning Approach for Adherence-Aware Recommendations","summary":" In many real-world scenarios involving high-stakes and safety implications, a\nhuman decision-maker (HDM) may receive recommendations from an artificial\nintelligence while holding the ultimate responsibility of making decisions. In\nthis letter, we develop an \"adherence-aware Q-learning\" algorithm to address\nthis problem. The algorithm learns the \"adherence level\" that captures the\nfrequency with which an HDM follows the recommended actions and derives the\nbest recommendation policy in real time. We prove the convergence of the\nproposed Q-learning algorithm to the optimal value and evaluate its performance\nacross various scenarios.\n","authors":["Ioannis Faros","Aditya Dave","Andreas A. Malikopoulos"],"pdf_url":"https://arxiv.org/pdf/2309.06519v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.06503v1","updated":"2023-09-12T18:18:23Z","published":"2023-09-12T18:18:23Z","title":"Leveraging Large Language Models and Weak Supervision for Social Media\n data annotation: an evaluation using COVID-19 self-reported vaccination\n tweets","summary":" The COVID-19 pandemic has presented significant challenges to the healthcare\nindustry and society as a whole. With the rapid development of COVID-19\nvaccines, social media platforms have become a popular medium for discussions\non vaccine-related topics. Identifying vaccine-related tweets and analyzing\nthem can provide valuable insights for public health researchers and\npolicymakers. However, manual annotation of a large number of tweets is\ntime-consuming and expensive. In this study, we evaluate the usage of Large\nLanguage Models, in this case GPT-4 (March 23 version), and weak supervision,\nto identify COVID-19 vaccine-related tweets, with the purpose of comparing\nperformance against human annotators. We leveraged a manually curated\ngold-standard dataset and used GPT-4 to provide labels without any additional\nfine-tuning or instructing, in a single-shot mode (no additional prompting).\n","authors":["Ramya Tekumalla","Juan M. Banda"],"pdf_url":"https://arxiv.org/pdf/2309.06503v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.06497v1","updated":"2023-09-12T18:11:10Z","published":"2023-09-12T18:11:10Z","title":"A Distributed Data-Parallel PyTorch Implementation of the Distributed\n Shampoo Optimizer for Training Neural Networks At-Scale","summary":" Shampoo is an online and stochastic optimization algorithm belonging to the\nAdaGrad family of methods for training neural networks. It constructs a\nblock-diagonal preconditioner where each block consists of a coarse Kronecker\nproduct approximation to full-matrix AdaGrad for each parameter of the neural\nnetwork. In this work, we provide a complete description of the algorithm as\nwell as the performance optimizations that our implementation leverages to\ntrain deep networks at-scale in PyTorch. 
Our implementation enables fast\nmulti-GPU distributed data-parallel training by distributing the memory and\ncomputation associated with blocks of each parameter via PyTorch's DTensor data\nstructure and performing an AllGather primitive on the computed search\ndirections at each iteration. This major performance enhancement enables us to\nachieve at most a 10% performance reduction in per-step wall-clock time\ncompared against standard diagonal-scaling-based adaptive gradient methods. We\nvalidate our implementation by performing an ablation study on training\nImageNet ResNet50, demonstrating Shampoo's superiority over standard training\nrecipes with minimal hyperparameter tuning.\n","authors":["Hao-Jun Michael Shi","Tsung-Hsien Lee","Shintaro Iwasaki","Jose Gallego-Posada","Zhijing Li","Kaushik Rangadurai","Dheevatsa Mudigere","Michael Rabbat"],"pdf_url":"https://arxiv.org/pdf/2309.06497v1.pdf","comment":"38 pages, 8 figures, 5 tables"},{"id":"http://arxiv.org/abs/2309.06484v1","updated":"2023-09-12T18:00:27Z","published":"2023-09-12T18:00:27Z","title":"Learning topological operations on meshes with application to block\n decomposition of polygons","summary":" We present a learning based framework for mesh quality improvement on\nunstructured triangular and quadrilateral meshes. Our model learns to improve\nmesh quality according to a prescribed objective function purely via self-play\nreinforcement learning with no prior heuristics. The actions performed on the\nmesh are standard local and global element operations. The goal is to minimize\nthe deviation of the node degrees from their ideal values, which in the case of\ninterior vertices leads to a minimization of irregular nodes.\n","authors":["Arjun Narayanan","Yulong Pan","Per-Olof Persson"],"pdf_url":"https://arxiv.org/pdf/2309.06484v1.pdf","comment":"Submitted to Computer-Aided Design Journal. Presented at 17th US\n National Conference on Computational Mechanics, Albuquerque, NM"},{"id":"http://arxiv.org/abs/2309.06472v1","updated":"2023-09-12T18:00:01Z","published":"2023-09-12T18:00:01Z","title":"Flows for Flows: Morphing one Dataset into another with Maximum\n Likelihood Estimation","summary":" Many components of data analysis in high energy physics and beyond require\nmorphing one dataset into another. This is commonly solved via reweighting, but\nthere are many advantages of preserving weights and shifting the data points\ninstead. Normalizing flows are machine learning models with impressive\nprecision on a variety of particle physics tasks. Naively, normalizing flows\ncannot be used for morphing because they require knowledge of the probability\ndensity of the starting dataset. In most cases in particle physics, we can\ngenerate more examples, but we do not know densities explicitly. We propose a\nprotocol called flows for flows for training normalizing flows to morph one\ndataset into another even if the underlying probability density of neither\ndataset is known explicitly. This enables a morphing strategy trained with\nmaximum likelihood estimation, a setup that has been shown to be highly\neffective in related tasks. We study variations on this protocol to explore how\nfar the data points are moved to statistically match the two datasets.\nFurthermore, we show how to condition the learned flows on particular features\nin order to create a morphing function for every value of the conditioning\nfeature. 
For illustration, we demonstrate flows for flows for toy examples as\nwell as a collider physics example involving dijet events.\n","authors":["Tobias Golling","Samuel Klein","Radha Mastandrea","Benjamin Nachman","John Andrew Raine"],"pdf_url":"https://arxiv.org/pdf/2309.06472v1.pdf","comment":"15 pages, 17 figures. This work is a merger of arXiv:2211.02487 and\n arXiv:2212.06155"},{"id":"http://arxiv.org/abs/2309.06424v1","updated":"2023-09-12T17:40:49Z","published":"2023-09-12T17:40:49Z","title":"Unveiling the potential of large language models in generating semantic\n and cross-language clones","summary":" Semantic and Cross-language code clone generation may be useful for code\nreuse, code comprehension, refactoring and benchmarking. OpenAI's GPT model has\npotential in such clone generation as GPT is used for text generation. When\ndevelopers copy/paste codes from Stack Overflow (SO) or within a system, there\nmight be inconsistent changes leading to unexpected behaviours. Similarly, if\nsomeone possesses a code snippet in a particular programming language but seeks\nequivalent functionality in a different language, a semantic cross-language\ncode clone generation approach could provide valuable assistance. In this\nstudy, using SemanticCloneBench as a vehicle, we evaluated how well the GPT-3\nmodel could help generate semantic and cross-language clone variants for a\ngiven fragment. We have compiled a diverse set of code fragments and assessed\nGPT-3's performance in generating code variants. Through extensive\nexperimentation and analysis, where 9 judges spent 158 hours to validate, we\ninvestigate the model's ability to produce accurate and semantically correct\nvariants. Our findings shed light on GPT-3's strengths in code generation,\noffering insights into the potential applications and challenges of using\nadvanced language models in software development. Our quantitative analysis\nyields compelling results. In the realm of semantic clones, GPT-3 attains an\nimpressive accuracy of 62.14% and 0.55 BLEU score, achieved through few-shot\nprompt engineering. Furthermore, the model shines in transcending linguistic\nconfines, boasting an exceptional 91.25% accuracy in generating cross-language\nclones.\n","authors":["Palash R. Roy","Ajmain I. Alam","Farouq Al-omari","Banani Roy","Chanchal K. Roy","Kevin A. Schneider"],"pdf_url":"https://arxiv.org/pdf/2309.06424v1.pdf","comment":"Accepted in IWSC"},{"id":"http://arxiv.org/abs/2110.07292v4","updated":"2023-09-12T17:01:20Z","published":"2021-10-14T11:57:57Z","title":"Sign and Relevance Learning","summary":" Standard models of biologically realistic or biologically inspired\nreinforcement learning employ a global error signal, which implies the use of\nshallow networks. On the other hand, error backpropagation allows the use of\nnetworks with multiple layers. However, precise error backpropagation is\ndifficult to justify in biologically realistic networks because it requires\nprecise weighted error backpropagation from layer to layer. In this study, we\nintroduce a novel network that solves this problem by propagating only the sign\nof the plasticity change (i.e., LTP/LTD) throughout the whole network, while\nneuromodulation controls the learning rate. Neuromodulation can be understood\nas a rectified error or relevance signal, while the top-down sign of the error\nsignal determines whether long-term potentiation or long-term depression will\noccur. 
To demonstrate the effectiveness of this approach, we conducted a real\nrobotic task as proof of concept. Our results show that this paradigm can\nsuccessfully perform complex tasks using a biologically plausible learning\nmechanism.\n","authors":["Sama Daryanavard","Bernd Porr"],"pdf_url":"https://arxiv.org/pdf/2110.07292v4.pdf","comment":"14 pages, 15 figures"}],"Multimedia":[{"id":"http://arxiv.org/abs/2309.06284v1","updated":"2023-09-12T14:43:47Z","published":"2023-09-12T14:43:47Z","title":"Fg-T2M: Fine-Grained Text-Driven Human Motion Generation via Diffusion\n Model","summary":" Text-driven human motion generation in computer vision is both significant\nand challenging. However, current methods are limited to producing either\ndeterministic or imprecise motion sequences, failing to effectively control the\ntemporal and spatial relationships required to conform to a given text\ndescription. In this work, we propose a fine-grained method for generating\nhigh-quality, conditional human motion sequences supporting precise text\ndescription. Our approach consists of two key components: 1) a\nlinguistics-structure assisted module that constructs accurate and complete\nlanguage feature to fully utilize text information; and 2) a context-aware\nprogressive reasoning module that learns neighborhood and overall semantic\nlinguistics features from shallow and deep graph neural networks to achieve a\nmulti-step inference. Experiments show that our approach outperforms\ntext-driven motion generation methods on HumanML3D and KIT test sets and\ngenerates better visually confirmed motion to the text conditions.\n","authors":["Yin Wang","Zhiying Leng","Frederick W. B. Li","Shun-Cheng Wu","Xiaohui Liang"],"pdf_url":"https://arxiv.org/pdf/2309.06284v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.06255v1","updated":"2023-09-12T14:16:34Z","published":"2023-09-12T14:16:34Z","title":"Enhancing Multi-modal Cooperation via Fine-grained Modality Valuation","summary":" One primary topic of multi-modal learning is to jointly incorporate\nheterogeneous information from different modalities. However, most models often\nsuffer from unsatisfactory multi-modal cooperation, which could not jointly\nutilize all modalities well. Some methods are proposed to identify and enhance\nthe worse learnt modality, but are often hard to provide the fine-grained\nobservation of multi-modal cooperation at sample-level with theoretical\nsupport. Hence, it is essential to reasonably observe and improve the\nfine-grained cooperation between modalities, especially when facing realistic\nscenarios where the modality discrepancy could vary across different samples.\nTo this end, we introduce a fine-grained modality valuation metric to evaluate\nthe contribution of each modality at sample-level. Via modality valuation, we\nregretfully observe that the multi-modal model tends to rely on one specific\nmodality, resulting in other modalities being low-contributing. 
We further\nanalyze this issue and improve cooperation between modalities by enhancing the\ndiscriminative ability of low-contributing modalities in a targeted manner.\nOverall, our methods reasonably observe the fine-grained uni-modal contribution\nat sample-level and achieve considerable improvement on different multi-modal\nmodels.\n","authors":["Yake Wei","Ruoxuan Feng","Zihe Wang","Di Hu"],"pdf_url":"https://arxiv.org/pdf/2309.06255v1.pdf","comment":"7 pages"},{"id":"http://arxiv.org/abs/2211.07440v2","updated":"2023-09-12T14:07:13Z","published":"2022-11-14T15:14:50Z","title":"Leveraging Automatic Personalised Nutrition: Food Image Recognition\n Benchmark and Dataset based on Nutrition Taxonomy","summary":" Leading a healthy lifestyle has become one of the most challenging goals in\ntoday's society due to our sedentary lifestyle and poor eating habits. As a\nresult, national and international organisations have made numerous efforts to\npromote healthier food diets and physical activity habits. However, these\nrecommendations are sometimes difficult to follow in our daily life and they\nare also based on a general population. As a consequence, a new area of\nresearch, personalised nutrition, has been conceived focusing on individual\nsolutions through smart devices and Artificial Intelligence (AI) methods.\n This study presents the AI4Food-NutritionDB database, the first nutrition\ndatabase that considers food images and a nutrition taxonomy based on\nrecommendations by national and international organisations. In addition, four\ndifferent categorisation levels are considered following nutrition experts: 6\nnutritional levels, 19 main categories (e.g., \"Meat\"), 73 subcategories (e.g.,\n\"White Meat\"), and 893 final food products (e.g., \"Chicken\"). The\nAI4Food-NutritionDB opens the doors to new food computing approaches in terms\nof food intake frequency, quality, and categorisation. Also, in addition to the\ndatabase, we propose a standard experimental protocol and benchmark including\nthree tasks based on the nutrition taxonomy (i.e., category, subcategory, and\nfinal product) to be used for the research community. Finally, we also release\nour Deep Learning models trained with the AI4Food-NutritionDB, which can be\nused as pre-trained models, achieving accurate recognition results with\nchallenging food image databases.\n","authors":["Sergio Romero-Tapiador","Ruben Tolosana","Aythami Morales","Isabel Espinosa-Salinas","Gala Freixer","Julian Fierrez","Ruben Vera-Rodriguez","Enrique Carrillo de Santa Pau","Ana Ramírez de Molina","Javier Ortega-Garcia"],"pdf_url":"https://arxiv.org/pdf/2211.07440v2.pdf","comment":"10 pages, 3 figures, 4 tables"},{"id":"http://arxiv.org/abs/2309.06176v1","updated":"2023-09-12T12:43:50Z","published":"2023-09-12T12:43:50Z","title":"Dual-Path Temporal Map Optimization for Make-up Temporal Video Grounding","summary":" Make-up temporal video grounding (MTVG) aims to localize the target video\nsegment which is semantically related to a sentence describing a make-up\nactivity, given a long video. Compared with the general video grounding task,\nMTVG focuses on meticulous actions and changes on the face. The make-up\ninstruction step, usually involving detailed differences in products and facial\nareas, is more fine-grained than general activities (e.g., cooking activity and\nfurniture assembly). Thus, existing general approaches cannot locate the target\nactivity effectively. 
More specifically, existing proposal generation modules\nare not yet fully developed in providing semantic cues for the more\nfine-grained make-up semantic comprehension. To tackle this issue, we propose\nan effective proposal-based framework named Dual-Path Temporal Map Optimization\nNetwork (DPTMO) to capture fine-grained multimodal semantic details of make-up\nactivities. DPTMO extracts both query-agnostic and query-guided features to\nconstruct two proposal sets and uses specific evaluation methods for the two\nsets. Different from the commonly used single structure in previous methods,\nour dual-path structure can mine more semantic information in make-up videos\nand distinguish fine-grained actions well. These two candidate sets represent\nthe cross-modal makeup video-text similarity and multi-modal fusion\nrelationship, complementing each other. Each set corresponds to its respective\noptimization perspective, and their joint prediction enhances the accuracy of\nvideo timestamp prediction. Comprehensive experiments on the YouMakeup dataset\ndemonstrate our proposed dual structure excels in fine-grained semantic\ncomprehension.\n","authors":["Jiaxiu Li","Kun Li","Jia Li","Guoliang Chen","Dan Guo","Meng Wang"],"pdf_url":"https://arxiv.org/pdf/2309.06176v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.05950v1","updated":"2023-09-12T04:03:41Z","published":"2023-09-12T04:03:41Z","title":"Language Models as Black-Box Optimizers for Vision-Language Models","summary":" Vision-language models (VLMs) pre-trained on web-scale datasets have\ndemonstrated remarkable capabilities across a variety of vision and multimodal\ntasks. Currently, fine-tuning methods for VLMs mainly operate in a white-box\nsetting, requiring access to model parameters for backpropagation. However,\nmany VLMs rely on proprietary data and are not open-source, which restricts the\nuse of white-box approaches for fine-tuning. Given that popular private large\nlanguage models (LLMs) like ChatGPT still offer a language-based user\ninterface, we aim to develop a novel fine-tuning approach for VLMs through\nnatural language prompts, thereby avoiding the need to access model parameters,\nfeature embeddings, or output logits. In this setup, we propose employing\nchat-based LLMs as black-box optimizers to search for the best text prompt on\nthe illustrative task of few-shot image classification using CLIP.\nSpecifically, we adopt an automatic \"hill-climbing\" procedure that converges on\nan effective prompt by evaluating the accuracy of current prompts and asking\nLLMs to refine them based on textual feedback, all within a conversational\nprocess without human-in-the-loop. In a challenging 1-shot learning setup, our\nsimple approach surpasses the white-box continuous prompting method CoOp by an\naverage of 1.5% across 11 datasets including ImageNet. Our approach also\noutperforms OpenAI's manually crafted prompts and is more efficient than other\nblack-box methods like iterative APE. Additionally, we highlight the advantage\nof conversational feedback incorporating both positive and negative prompts,\nsuggesting that LLMs can utilize the implicit \"gradient\" direction in textual\nfeedback for a more efficient search. 
Lastly, we find that the text prompts\ngenerated through our strategy are not only more interpretable but also\ntransfer well across different CLIP architectures in a black-box manner.\n","authors":["Samuel Yu","Shihong Liu","Zhiqiu Lin","Deepak Pathak","Deva Ramanan"],"pdf_url":"https://arxiv.org/pdf/2309.05950v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.00264v2","updated":"2023-09-12T02:40:08Z","published":"2023-08-01T03:54:27Z","title":"Multi-Modality Multi-Loss Fusion Network","summary":" In this work we investigate the optimal selection and fusion of features\nacross multiple modalities and combine these in a neural network to improve\nemotion detection. We compare different fusion methods and examine the impact\nof multi-loss training within the multi-modality fusion network, identifying\nuseful findings relating to subnet performance. Our best model achieves\nstate-of-the-art performance for three datasets (CMU-MOSI, CMU-MOSEI and\nCH-SIMS), and outperforms the other methods in most metrics. We have found that\ntraining on multimodal features improves single modality testing and designing\nfusion methods based on dataset annotation schema enhances model performance.\nThese results suggest a roadmap towards an optimized feature selection and\nfusion approach for enhancing emotion detection in neural networks.\n","authors":["Zehui Wu","Ziwei Gong","Jaywon Koo","Julia Hirschberg"],"pdf_url":"https://arxiv.org/pdf/2308.00264v2.pdf","comment":"First two authors contributed equally to the paper"},{"id":"http://arxiv.org/abs/2304.01347v4","updated":"2023-09-12T01:20:11Z","published":"2023-03-31T02:54:01Z","title":"Temporal Dynamic Synchronous Functional Brain Network for Schizophrenia\n Diagnosis and Lateralization Analysis","summary":" The available evidence suggests that dynamic functional connectivity (dFC)\ncan capture time-varying abnormalities in brain activity in resting-state\ncerebral functional magnetic resonance imaging (rs-fMRI) data and has a natural\nadvantage in uncovering mechanisms of abnormal brain activity in\nschizophrenia(SZ) patients. Hence, an advanced dynamic brain network analysis\nmodel called the temporal brain category graph convolutional network\n(Temporal-BCGCN) was employed. Firstly, a unique dynamic brain network analysis\nmodule, DSF-BrainNet, was designed to construct dynamic synchronization\nfeatures. Subsequently, a revolutionary graph convolution method, TemporalConv,\nwas proposed, based on the synchronous temporal properties of feature. Finally,\nthe first modular abnormal hemispherical lateralization test tool in deep\nlearning based on rs-fMRI data, named CategoryPool, was proposed. This study\nwas validated on COBRE and UCLA datasets and achieved 83.62% and 89.71% average\naccuracies, respectively, outperforming the baseline model and other\nstate-of-the-art methods. The ablation results also demonstrate the advantages\nof TemporalConv over the traditional edge feature graph convolution approach\nand the improvement of CategoryPool over the classical graph pooling approach.\nInterestingly, this study showed that the lower order perceptual system and\nhigher order network regions in the left hemisphere are more severely\ndysfunctional than in the right hemisphere in SZ and reaffirms the importance\nof the left medial superior frontal gyrus in SZ. 
Our core code is available at:\nhttps://github.com/swfen/Temporal-BCGCN.\n","authors":["Cheng Zhu","Ying Tan","Shuqi Yang","Jiaqing Miao","Jiayi Zhu","Huan Huang","Dezhong Yao","Cheng Luo"],"pdf_url":"https://arxiv.org/pdf/2304.01347v4.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.01516v2","updated":"2023-09-12T20:16:04Z","published":"2023-09-04T10:48:29Z","title":"MultiWay-Adapater: Adapting large-scale multi-modal models for scalable\n image-text retrieval","summary":" As the size of Large Multi-Modal Models (LMMs) increases consistently, the\nadaptation of these pre-trained models to specialized tasks has become a\ncomputationally and memory-intensive challenge. Traditional fine-tuning methods\nrequire isolated, exhaustive retuning for each new task, limiting the models'\nversatility. Moreover, current efficient adaptation techniques often overlook\nmodality alignment, focusing only on the knowledge extraction of new tasks. To\ntackle these issues, we introduce Multiway-Adapter, an innovative framework\nincorporating an 'Alignment Enhancer' to deepen modality alignment, enabling\nhigh transferability without tuning pre-trained parameters. Our method adds\nfewer than 1.25\\% of additional parameters to LMMs, exemplified by the BEiT-3\nmodel in our study. This leads to superior zero-shot image-text retrieval\nperformance compared to fully fine-tuned models, while achieving up to a 57\\%\nreduction in fine-tuning time. Our approach offers a resource-efficient and\neffective adaptation pathway for LMMs, broadening their applicability. The\nsource code is publicly available at:\n\\url{https://github.com/longkukuhi/MultiWay-Adapter}.\n","authors":["Zijun Long","George Killick","Richard McCreadie","Gerardo Aragon Camarasa"],"pdf_url":"https://arxiv.org/pdf/2309.01516v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.06511v1","updated":"2023-09-12T18:37:05Z","published":"2023-09-12T18:37:05Z","title":"DF-TransFusion: Multimodal Deepfake Detection via Lip-Audio\n Cross-Attention and Facial Self-Attention","summary":" With the rise in manipulated media, deepfake detection has become an\nimperative task for preserving the authenticity of digital content. In this\npaper, we present a novel multi-modal audio-video framework designed to\nconcurrently process audio and video inputs for deepfake detection tasks. Our\nmodel capitalizes on lip synchronization with input audio through a\ncross-attention mechanism while extracting visual cues via a fine-tuned VGG-16\nnetwork. Subsequently, a transformer encoder network is employed to perform\nfacial self-attention. We conduct multiple ablation studies highlighting\ndifferent strengths of our approach. Our multi-modal methodology outperforms\nstate-of-the-art multi-modal deepfake detection techniques in terms of F-1 and\nper-video AUC scores.\n","authors":["Aaditya Kharel","Manas Paranjape","Aniket Bera"],"pdf_url":"https://arxiv.org/pdf/2309.06511v1.pdf","comment":null}]},"2023-09-13T00:00:00Z":{"Computation and Language":[{"id":"http://arxiv.org/abs/2309.07124v1","updated":"2023-09-13T17:59:09Z","published":"2023-09-13T17:59:09Z","title":"RAIN: Your Language Models Can Align Themselves without Finetuning","summary":" Large language models (LLMs) often demonstrate inconsistencies with human\npreferences. Previous research gathered human preference data and then aligned\nthe pre-trained models using reinforcement learning or instruction tuning, the\nso-called finetuning step. 
In contrast, aligning frozen LLMs without any extra\ndata is more appealing. This work explores the potential of the latter setting.\nWe discover that by integrating self-evaluation and rewind mechanisms,\nunaligned LLMs can directly produce responses consistent with human preferences\nvia self-boosting. We introduce a novel inference method, Rewindable\nAuto-regressive INference (RAIN), that allows pre-trained LLMs to evaluate\ntheir own generation and use the evaluation results to guide backward rewind\nand forward generation for AI safety. Notably, RAIN operates without the need\nof extra data for model alignment and abstains from any training, gradient\ncomputation, or parameter updates; during the self-evaluation phase, the model\nreceives guidance on which human preference to align with through a\nfixed-template prompt, eliminating the need to modify the initial prompt.\nExperimental results evaluated by GPT-4 and humans demonstrate the\neffectiveness of RAIN: on the HH dataset, RAIN improves the harmlessness rate\nof LLaMA 30B over vanilla inference from 82% to 97%, while maintaining the\nhelpfulness rate. Under the leading adversarial attack llm-attacks on Vicuna\n33B, RAIN establishes a new defense baseline by reducing the attack success\nrate from 94% to 19%.\n","authors":["Yuhui Li","Fangyun Wei","Jinjing Zhao","Chao Zhang","Hongyang Zhang"],"pdf_url":"https://arxiv.org/pdf/2309.07124v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.07120v1","updated":"2023-09-13T17:57:21Z","published":"2023-09-13T17:57:21Z","title":"Sight Beyond Text: Multi-Modal Training Enhances LLMs in Truthfulness\n and Ethics","summary":" Multi-modal large language models (MLLMs) are trained based on large language\nmodels (LLM), with an enhanced capability to comprehend multi-modal inputs and\ngenerate textual responses. While they excel in multi-modal tasks, the pure NLP\nabilities of MLLMs are often underestimated and left untested. In this study,\nwe get out of the box and unveil an intriguing characteristic of MLLMs -- our\npreliminary results suggest that visual instruction tuning, a prevailing\nstrategy for transitioning LLMs into MLLMs, unexpectedly and interestingly\nhelps models attain both improved truthfulness and ethical alignment in the\npure NLP context. For example, a visual-instruction-tuned LLaMA2 7B model\nsurpasses the performance of the LLaMA2-chat 7B model, fine-tuned with over one\nmillion human annotations, on TruthfulQA-mc and Ethics benchmarks. Further\nanalysis reveals that the improved alignment can be attributed to the superior\ninstruction quality inherent to visual-text data. In releasing our code at\ngithub.com/UCSC-VLAA/Sight-Beyond-Text, we aspire to foster further exploration\ninto the intrinsic value of visual-text synergies and, in a broader scope,\nmulti-modal interactions in alignment research.\n","authors":["Haoqin Tu","Bingchen Zhao","Chen Wei","Cihang Xie"],"pdf_url":"https://arxiv.org/pdf/2309.07120v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.07098v1","updated":"2023-09-13T17:15:27Z","published":"2023-09-13T17:15:27Z","title":"Mitigating Hallucinations and Off-target Machine Translation with\n Source-Contrastive and Language-Contrastive Decoding","summary":" Hallucinations and off-target translation remain unsolved problems in machine\ntranslation, especially for low-resource languages and massively multilingual\nmodels. 
In this paper, we introduce methods to mitigate both failure cases with\na modified decoding objective, without requiring retraining or external models.\nIn source-contrastive decoding, we search for a translation that is probable\ngiven the correct input, but improbable given a random input segment,\nhypothesising that hallucinations will be similarly probable given either. In\nlanguage-contrastive decoding, we search for a translation that is probable,\nbut improbable given the wrong language indicator token. In experiments on\nM2M-100 (418M) and SMaLL-100, we find that these methods effectively suppress\nhallucinations and off-target translations, improving chrF2 by 1.7 and 1.4\npoints on average across 57 tested translation directions. In a proof of\nconcept on English--German, we also show that we can suppress off-target\ntranslations with the Llama 2 chat models, demonstrating the applicability of\nthe method to machine translation with LLMs. We release our source code at\nhttps://github.com/ZurichNLP/ContraDecode.\n","authors":["Rico Sennrich","Jannis Vamvas","Alireza Mohammadshahi"],"pdf_url":"https://arxiv.org/pdf/2309.07098v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.05519v2","updated":"2023-09-13T16:49:34Z","published":"2023-09-11T15:02:25Z","title":"NExT-GPT: Any-to-Any Multimodal LLM","summary":" While recently Multimodal Large Language Models (MM-LLMs) have made exciting\nstrides, they mostly fall prey to the limitation of only input-side multimodal\nunderstanding, without the ability to produce content in multiple modalities.\nAs we humans always perceive the world and communicate with people through\nvarious modalities, developing any-to-any MM-LLMs capable of accepting and\ndelivering content in any modality becomes essential to human-level AI. To fill\nthe gap, we present an end-to-end general-purpose any-to-any MM-LLM system,\nNExT-GPT. We connect an LLM with multimodal adaptors and different diffusion\ndecoders, enabling NExT-GPT to perceive inputs and generate outputs in\narbitrary combinations of text, images, videos, and audio. By leveraging the\nexisting well-trained highly-performing encoders and decoders, NExT-GPT is\ntuned with only a small amount of parameter (1%) of certain projection layers,\nwhich not only benefits low-cost training and also facilitates convenient\nexpansion to more potential modalities. Moreover, we introduce a\nmodality-switching instruction tuning (MosIT) and manually curate a\nhigh-quality dataset for MosIT, based on which NExT-GPT is empowered with\ncomplex cross-modal semantic understanding and content generation. Overall, our\nresearch showcases the promising possibility of building an AI agent capable of\nmodeling universal modalities, paving the way for more human-like AI research\nin the community. Project page: https://next-gpt.github.io/\n","authors":["Shengqiong Wu","Hao Fei","Leigang Qu","Wei Ji","Tat-Seng Chua"],"pdf_url":"https://arxiv.org/pdf/2309.05519v2.pdf","comment":"work in progress"},{"id":"http://arxiv.org/abs/2309.07081v1","updated":"2023-09-13T16:46:27Z","published":"2023-09-13T16:46:27Z","title":"Can Whisper perform speech-based in-context learning","summary":" This paper investigates the in-context learning abilities of the Whisper\nautomatic speech recognition (ASR) models released by OpenAI. 
A novel\nspeech-based in-context learning (SICL) approach is proposed for test-time\nadaptation, which can reduce the word error rates (WERs) with only a small\nnumber of labelled speech samples without gradient descent. Language-level\nadaptation experiments using Chinese dialects showed that when applying SICL to\nisolated word ASR, consistent and considerable relative WER reductions can be\nachieved using Whisper models of any size on two dialects, which is on average\n32.3%. A k-nearest-neighbours-based in-context example selection technique can\nbe applied to further improve the efficiency of SICL, which can increase the\naverage relative WER reduction to 36.4%. The findings are verified using\nspeaker adaptation or continuous speech recognition tasks, and both achieved\nconsiderable relative WER reductions. Detailed quantitative analyses are also\nprovided to shed light on SICL's adaptability to phonological variances and\ndialect-specific lexical nuances.\n","authors":["Siyin Wang","Chao-Han Huck Yang","Ji Wu","Chao Zhang"],"pdf_url":"https://arxiv.org/pdf/2309.07081v1.pdf","comment":"Submitted to ICASSP 2024"},{"id":"http://arxiv.org/abs/2304.11075v2","updated":"2023-09-13T16:12:56Z","published":"2023-04-20T14:42:54Z","title":"Spaiche: Extending State-of-the-Art ASR Models to Swiss German Dialects","summary":" Recent breakthroughs in NLP largely increased the presence of ASR systems in\nour daily lives. However, for many low-resource languages, ASR models still\nneed to be improved due in part to the difficulty of acquiring pertinent data.\nThis project aims to help advance research in ASR models for Swiss German\ndialects, by providing insights about the performance of state-of-the-art ASR\nmodels on recently published Swiss German speech datasets. We propose a novel\nloss that takes into account the semantic distance between the predicted and\nthe ground-truth labels. We outperform current state-of-the-art results by\nfine-tuning OpenAI's Whisper model on Swiss-German datasets.\n","authors":["Clement Sicard","Kajetan Pyszkowski","Victor Gillioz"],"pdf_url":"https://arxiv.org/pdf/2304.11075v2.pdf","comment":"8 pages, SwissText conference"},{"id":"http://arxiv.org/abs/2309.07045v1","updated":"2023-09-13T15:56:50Z","published":"2023-09-13T15:56:50Z","title":"SafetyBench: Evaluating the Safety of Large Language Models with\n Multiple Choice Questions","summary":" With the rapid development of Large Language Models (LLMs), increasing\nattention has been paid to their safety concerns. Consequently, evaluating the\nsafety of LLMs has become an essential task for facilitating the broad\napplications of LLMs. Nevertheless, the absence of comprehensive safety\nevaluation benchmarks poses a significant impediment to effectively assess and\nenhance the safety of LLMs. In this work, we present SafetyBench, a\ncomprehensive benchmark for evaluating the safety of LLMs, which comprises\n11,435 diverse multiple choice questions spanning across 7 distinct categories\nof safety concerns. Notably, SafetyBench also incorporates both Chinese and\nEnglish data, facilitating the evaluation in both languages. Our extensive\ntests over 25 popular Chinese and English LLMs in both zero-shot and few-shot\nsettings reveal a substantial performance advantage for GPT-4 over its\ncounterparts, and there is still significant room for improving the safety of\ncurrent LLMs. We believe SafetyBench will enable fast and comprehensive\nevaluation of LLMs' safety, and foster the development of safer LLMs. 
Data and\nevaluation guidelines are available at https://github.com/thu-coai/SafetyBench.\nSubmission entrance and leaderboard are available at\nhttps://llmbench.ai/safety.\n","authors":["Zhexin Zhang","Leqi Lei","Lindong Wu","Rui Sun","Yongkang Huang","Chong Long","Xiao Liu","Xuanyu Lei","Jie Tang","Minlie Huang"],"pdf_url":"https://arxiv.org/pdf/2309.07045v1.pdf","comment":"15 pages"},{"id":"http://arxiv.org/abs/2309.05918v2","updated":"2023-09-13T15:45:18Z","published":"2023-09-12T02:14:05Z","title":"Stochastic LLMs do not Understand Language: Towards Symbolic,\n Explainable and Ontologically Based LLMs","summary":" In our opinion the exuberance surrounding the relative success of data-driven\nlarge language models (LLMs) is slightly misguided and for several reasons (i)\nLLMs cannot be relied upon for factual information since for LLMs all ingested\ntext (factual or non-factual) was created equal; (ii) due to their subsymbolic\nnature, whatever 'knowledge' these models acquire about language will always\nbe buried in billions of microfeatures (weights), none of which is meaningful\non its own; and (iii) LLMs will often fail to make the correct inferences in\nseveral linguistic contexts (e.g., nominal compounds, copredication, quantifier\nscope ambiguities, intensional contexts). Since we believe the relative success\nof data-driven large language models (LLMs) is not a reflection on the symbolic\nvs. subsymbolic debate but a reflection on applying the successful strategy of\na bottom-up reverse engineering of language at scale, we suggest in this paper\napplying the effective bottom-up strategy in a symbolic setting resulting in\nsymbolic, explainable, and ontologically grounded language models.\n","authors":["Walid S. Saba"],"pdf_url":"https://arxiv.org/pdf/2309.05918v2.pdf","comment":"17 pages"},{"id":"http://arxiv.org/abs/2309.07034v1","updated":"2023-09-13T15:42:06Z","published":"2023-09-13T15:42:06Z","title":"How (Not) to Use Sociodemographic Information for Subjective NLP Tasks","summary":" Annotators' sociodemographic backgrounds (i.e., the individual compositions\nof their gender, age, educational background, etc.) have a strong impact on\ntheir decisions when working on subjective NLP tasks, such as hate speech\ndetection. Often, heterogeneous backgrounds result in high disagreements. To\nmodel this variation, recent work has explored sociodemographic prompting, a\ntechnique, which steers the output of prompt-based models towards answers that\nhumans with specific sociodemographic profiles would give. However, the\navailable NLP literature disagrees on the efficacy of this technique -- it\nremains unclear, for which tasks and scenarios it can help and evaluations are\nlimited to specific tasks only. We address this research gap by presenting the\nlargest and most comprehensive study of sociodemographic prompting today.\nConcretely, we evaluate several prompt formulations across seven datasets and\nsix instruction-tuned model families. We find that (1) while sociodemographic\nprompting can be beneficial for improving zero-shot learning in subjective NLP\ntasks, (2) its outcomes largely vary for different model types, sizes, and\ndatasets, (3) are subject to large variance with regards to prompt\nformulations. Thus, sociodemographic prompting is not a reliable proxy for\ntraditional data annotation with a sociodemographically heterogeneous group of\nannotators. 
Instead, we propose (4) to use it for identifying ambiguous\ninstances resulting in more informed annotation efforts.\n","authors":["Tilman Beck","Hendrik Schuff","Anne Lauscher","Iryna Gurevych"],"pdf_url":"https://arxiv.org/pdf/2309.07034v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.07020v1","updated":"2023-09-13T15:23:30Z","published":"2023-09-13T15:23:30Z","title":"Beyond original Research Articles Categorization via NLP","summary":" This work proposes a novel approach to text categorization -- for unknown\ncategories -- in the context of scientific literature, using Natural Language\nProcessing techniques. The study leverages the power of pre-trained language\nmodels, specifically SciBERT, to extract meaningful representations of\nabstracts from the ArXiv dataset. Text categorization is performed using the\nK-Means algorithm, and the optimal number of clusters is determined based on\nthe Silhouette score. The results demonstrate that the proposed approach\ncaptures subject information more effectively than the traditional arXiv\nlabeling system, leading to improved text categorization. The approach offers\npotential for better navigation and recommendation systems in the rapidly\ngrowing landscape of scientific research literature.\n","authors":["Rosanna Turrisi"],"pdf_url":"https://arxiv.org/pdf/2309.07020v1.pdf","comment":"Workshop on Human-in-the-Loop Applied Machine Learning (HITLAML),\n 2023"},{"id":"http://arxiv.org/abs/2308.09435v2","updated":"2023-09-13T15:22:29Z","published":"2023-08-18T10:07:28Z","title":"A Methodology for Generative Spelling Correction via Natural Spelling\n Errors Emulation across Multiple Domains and Languages","summary":" Modern large language models demonstrate impressive capabilities in text\ngeneration and generalization. However, they often struggle with solving text\nediting tasks, particularly when it comes to correcting spelling errors and\nmistypings. In this paper, we present a methodology for generative spelling\ncorrection (SC), which was tested on English and Russian languages and\npotentially can be extended to any language with minor changes. Our research\nmainly focuses on exploring natural spelling errors and mistypings in texts and\nstudying the ways those errors can be emulated in correct sentences to\neffectively enrich generative models' pre-train procedure. We investigate the\nimpact of such emulations and the models' abilities across different text\ndomains. In this work, we investigate two spelling corruption techniques: 1)\nfirst one mimics human behavior when making a mistake through leveraging\nstatistics of errors from particular dataset and 2) second adds the most common\nspelling errors, keyboard miss clicks, and some heuristics within the texts. We\nconducted experiments employing various corruption strategies, models'\narchitectures and sizes on the pre-training and fine-tuning stages and\nevaluated the models using single-domain and multi-domain test sets. As a\npractical outcome of our work, we introduce SAGE(Spell checking via\nAugmentation and Generative distribution Emulation). 
It is a library for\nautomatic generative SC that includes a family of pre-trained generative models\nand built-in augmentation algorithms.\n","authors":["Nikita Martynov","Mark Baushenko","Anastasia Kozlova","Katerina Kolomeytseva","Aleksandr Abramov","Alena Fenogenova"],"pdf_url":"https://arxiv.org/pdf/2308.09435v2.pdf","comment":"to appear in EACL 2024"},{"id":"http://arxiv.org/abs/2309.07015v1","updated":"2023-09-13T15:17:29Z","published":"2023-09-13T15:17:29Z","title":"Résumé Parsing as Hierarchical Sequence Labeling: An Empirical Study","summary":" Extracting information from r\\'esum\\'es is typically formulated as a\ntwo-stage problem, where the document is first segmented into sections and then\neach section is processed individually to extract the target entities. Instead,\nwe cast the whole problem as sequence labeling in two levels -- lines and\ntokens -- and study model architectures for solving both tasks simultaneously.\nWe build high-quality r\\'esum\\'e parsing corpora in English, French, Chinese,\nSpanish, German, Portuguese, and Swedish. Based on these corpora, we present\nexperimental results that demonstrate the effectiveness of the proposed models\nfor the information extraction task, outperforming approaches introduced in\nprevious work. We conduct an ablation study of the proposed architectures. We\nalso analyze both model performance and resource efficiency, and describe the\ntrade-offs for model deployment in the context of a production environment.\n","authors":["Federico Retyk","Hermenegildo Fabregat","Juan Aizpuru","Mariana Taglio","Rabih Zbib"],"pdf_url":"https://arxiv.org/pdf/2309.07015v1.pdf","comment":"RecSys in HR'23: The 3rd Workshop on Recommender Systems for Human\n Resources, in conjunction with the 17th ACM Conference on Recommender\n Systems, September 18--22, 2023, Singapore, Singapore"},{"id":"http://arxiv.org/abs/2212.01378v2","updated":"2023-09-13T15:07:01Z","published":"2022-12-02T18:59:04Z","title":"ColD Fusion: Collaborative Descent for Distributed Multitask Finetuning","summary":" We propose a new paradigm to continually evolve pretrained models, denoted\nColD Fusion. It provides the benefits of multitask learning but leverages\ndistributed computation with limited communication and eliminates the need for\nshared data. Consequentially, ColD Fusion can give rise to a synergistic loop,\nwhere finetuned models can be recycled to continually improve the pretrained\nmodel they are based upon. We show that ColD Fusion yields comparable benefits\nto multitask training by producing a model that (a) attains strong performance\non all of the datasets it was trained on; and (b) is a better starting point\nfor finetuning on unseen datasets. We show that ColD Fusion outperforms RoBERTa\nand even previous multitask models. Specifically, when training and testing on\n35 diverse datasets, ColD Fusion-based model outperforms RoBERTa by 2.33 points\non average without any changes to the architecture.\n","authors":["Shachar Don-Yehiya","Elad Venezian","Colin Raffel","Noam Slonim","Yoav Katz","Leshem Choshen"],"pdf_url":"https://arxiv.org/pdf/2212.01378v2.pdf","comment":"ACL 23"},{"id":"http://arxiv.org/abs/2309.07009v1","updated":"2023-09-13T15:00:56Z","published":"2023-09-13T15:00:56Z","title":"OYXOY: A Modern NLP Test Suite for Modern Greek","summary":" This paper serves as a foundational step towards the development of a\nlinguistically motivated and technically relevant evaluation suite for Greek\nNLP. 
We initiate this endeavor by introducing four expert-verified evaluation\ntasks, specifically targeted at natural language inference, word sense\ndisambiguation (through example comparison or sense selection) and metaphor\ndetection. More than language-adapted replicas of existing tasks, we contribute\ntwo innovations which will resonate with the broader resource and evaluation\ncommunity. Firstly, our inference dataset is the first of its kind, marking not\njust \\textit{one}, but rather \\textit{all} possible inference labels,\naccounting for possible shifts due to e.g. ambiguity or polysemy. Secondly, we\ndemonstrate a cost-efficient method to obtain datasets for under-resourced\nlanguages. Using ChatGPT as a language-neutral parser, we transform the\nDictionary of Standard Modern Greek into a structured format, from which we\nderive the other three tasks through simple projections. Alongside each task,\nwe conduct experiments using currently available state of the art machinery.\nOur experimental baselines affirm the challenging nature of our tasks and\nhighlight the need for expedited progress in order for the Greek NLP ecosystem\nto keep pace with contemporary mainstream research.\n","authors":["Konstantinos Kogkalidis","Stergios Chatzikyriakidis","Eirini Chrysovalantou Giannikouri","Vassiliki Katsouli","Christina Klironomou","Christina Koula","Dimitris Papadakis","Thelka Pasparaki","Erofili Psaltaki","Efthymia Sakellariou","Hara Soupiona"],"pdf_url":"https://arxiv.org/pdf/2309.07009v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2211.02982v3","updated":"2023-09-13T14:49:23Z","published":"2022-11-05T22:06:50Z","title":"Event and Entity Extraction from Generated Video Captions","summary":" Annotation of multimedia data by humans is time-consuming and costly, while\nreliable automatic generation of semantic metadata is a major challenge. We\npropose a framework to extract semantic metadata from automatically generated\nvideo captions. As metadata, we consider entities, the entities' properties,\nrelations between entities, and the video category. We employ two\nstate-of-the-art dense video captioning models with masked transformer (MT) and\nparallel decoding (PVDC) to generate captions for videos of the ActivityNet\nCaptions dataset. Our experiments show that it is possible to extract entities,\ntheir properties, relations between entities, and the video category from the\ngenerated captions. We observe that the quality of the extracted information is\nmainly influenced by the quality of the event localization in the video as well\nas the performance of the event caption generation.\n","authors":["Johannes Scherer","Ansgar Scherp","Deepayan Bhowmik"],"pdf_url":"https://arxiv.org/pdf/2211.02982v3.pdf","comment":"Paper accepted at CD-MAKE 2023"},{"id":"http://arxiv.org/abs/2308.00802v2","updated":"2023-09-13T14:43:45Z","published":"2023-08-01T19:34:18Z","title":"GRDD: A Dataset for Greek Dialectal NLP","summary":" In this paper, we present a dataset for the computational study of a number\nof Modern Greek dialects. It consists of raw text data from four dialects of\nModern Greek, Cretan, Pontic, Northern Greek and Cypriot Greek. The dataset is\nof considerable size, albeit imbalanced, and presents the first attempt to\ncreate large scale dialectal resources of this type for Modern Greek dialects.\nWe then use the dataset to perform dialect identification. We experiment with\ntraditional ML algorithms, as well as simple DL architectures. 
The results show\nvery good performance on the task, potentially revealing that the dialects in\nquestion have distinct enough characteristics allowing even simple ML models to\nperform well on the task. Error analysis is performed for the top performing\nalgorithms showing that in a number of cases the errors are due to insufficient\ndataset cleaning.\n","authors":["Stergios Chatzikyriakidis","Chatrine Qwaider","Ilias Kolokousis","Christina Koula","Dimitris Papadakis","Efthymia Sakellariou"],"pdf_url":"https://arxiv.org/pdf/2308.00802v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.06991v1","updated":"2023-09-13T14:36:26Z","published":"2023-09-13T14:36:26Z","title":"Unsupervised Contrast-Consistent Ranking with Language Models","summary":" Language models contain ranking-based knowledge and are powerful solvers of\nin-context ranking tasks. For instance, they may have parametric knowledge\nabout the ordering of countries by size or may be able to rank reviews by\nsentiment. Recent work focuses on pairwise, pointwise, and listwise prompting\ntechniques to elicit a language model's ranking knowledge. However, we find\nthat even with careful calibration and constrained decoding, prompting-based\ntechniques may not always be self-consistent in the rankings they produce. This\nmotivates us to explore an alternative approach that is inspired by an\nunsupervised probing method called Contrast-Consistent Search (CCS). The idea\nis to train a probing model guided by a logical constraint: a model's\nrepresentation of a statement and its negation must be mapped to contrastive\ntrue-false poles consistently across multiple statements. We hypothesize that\nsimilar constraints apply to ranking tasks where all items are related via\nconsistent pairwise or listwise comparisons. To this end, we extend the binary\nCCS method to Contrast-Consistent Ranking (CCR) by adapting existing ranking\nmethods such as the Max-Margin Loss, Triplet Loss, and Ordinal Regression\nobjective. Our results confirm that, for the same language model, CCR probing\noutperforms prompting and even performs on a par with prompting much larger\nlanguage models.\n","authors":["Niklas Stoehr","Pengxiang Cheng","Jing Wang","Daniel Preotiuc-Pietro","Rajarshi Bhowmik"],"pdf_url":"https://arxiv.org/pdf/2309.06991v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.06989v1","updated":"2023-09-13T14:30:30Z","published":"2023-09-13T14:30:30Z","title":"Remote Inference of Cognitive Scores in ALS Patients Using a Picture\n Description","summary":" Amyotrophic lateral sclerosis is a fatal disease that not only affects\nmovement, speech, and breath but also cognition. Recent studies have focused on\nthe use of language analysis techniques to detect ALS and infer scales for\nmonitoring functional progression. In this paper, we focused on another\nimportant aspect, cognitive impairment, which affects 35-50% of the ALS\npopulation. In an effort to reach the ALS population, which frequently exhibits\nmobility limitations, we implemented the digital version of the Edinburgh\nCognitive and Behavioral ALS Screen (ECAS) test for the first time. This test\nwhich is designed to measure cognitive impairment was remotely performed by 56\nparticipants from the EverythingALS Speech Study. 
As part of the study,\nparticipants (ALS and non-ALS) were asked to describe one picture per week from a\npool of many pictures with complex scenes displayed on their computer at home.\nWe analyze the descriptions performed within +/- 60 days from the day the ECAS\ntest was administered and extract different types of linguistic and acoustic\nfeatures. We input those features into linear regression models to infer 5 ECAS\nsub-scores and the total score. Speech samples from the picture description are\nreliable enough to predict the ECAS sub-scores, achieving statistically\nsignificant Spearman correlation values between 0.32 and 0.51 for the model's\nperformance using 10-fold cross-validation.\n","authors":["Carla Agurto","Guillermo Cecchi","Bo Wen","Ernest Fraenkel","James Berry","Indu Navar","Raquel Norel"],"pdf_url":"https://arxiv.org/pdf/2309.06989v1.pdf","comment":"conference paper"},{"id":"http://arxiv.org/abs/2309.06979v1","updated":"2023-09-13T14:15:03Z","published":"2023-09-13T14:15:03Z","title":"Auto-Regressive Next-Token Predictors are Universal Learners","summary":" Large language models display remarkable capabilities in logical and\nmathematical reasoning, allowing them to solve complex tasks. Interestingly,\nthese abilities emerge in networks trained on the simple task of next-token\nprediction. In this work, we present a theoretical framework for studying\nauto-regressive next-token predictors. We demonstrate that even simple models\nsuch as linear next-token predictors, trained on Chain-of-Thought (CoT) data,\ncan approximate any function efficiently computed by a Turing machine. We\nintroduce a new complexity measure -- length complexity -- which measures the\nnumber of intermediate tokens in a CoT sequence required to approximate some\ntarget function, and analyze the interplay between length complexity and other\nnotions of complexity. Finally, we show experimentally that simple next-token\npredictors, such as linear networks and shallow Multi-Layer Perceptrons (MLPs),\ndisplay non-trivial performance on text generation and arithmetic tasks. Our\nresults demonstrate that the power of language models can be attributed, to a\ngreat extent, to the auto-regressive next-token training scheme, and not\nnecessarily to a particular choice of architecture.\n","authors":["Eran Malach"],"pdf_url":"https://arxiv.org/pdf/2309.06979v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2305.18248v2","updated":"2023-09-13T13:58:36Z","published":"2023-05-29T17:12:03Z","title":"Do Language Models Know When They're Hallucinating References?","summary":" State-of-the-art language models (LMs) are famous for \"hallucinating\"\nreferences. These fabricated article and book titles lead to harms, obstacles\nto their use, and public backlash. While other types of LM hallucinations are\nalso important, we propose hallucinated references as the \"drosophila\" of\nresearch on hallucination in large language models (LLMs), as they are\nparticularly easy to study. We show that simple search engine queries reliably\nidentify such hallucinations, which facilitates evaluation. To begin to dissect\nthe nature of hallucinated LM references, we attempt to classify them using\nblack-box queries to the same LM, without consulting any external resources.\nConsistency checks done with \"direct\" queries about whether the generated\nreference title is real (inspired by Kadavath et al. 2022, Lin et al. 2022,\nManakul et al. 
2023) are compared to consistency checks with \"indirect\" queries\nwhich ask for ancillary details such as the authors of the work. These\nconsistency checks are found to be partially reliable indicators of whether or\nnot the reference is a hallucination. In particular, we find that LMs often\nhallucinate differing authors of hallucinated references when queried in\nindependent sessions, while consistently identifying the authors of real references.\nThis suggests that the hallucination may be more a generation issue than\ninherent to current training techniques or representation.\n","authors":["Ayush Agrawal","Mirac Suzgun","Lester Mackey","Adam Tauman Kalai"],"pdf_url":"https://arxiv.org/pdf/2305.18248v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.01538v2","updated":"2023-09-13T13:00:23Z","published":"2023-09-04T11:38:02Z","title":"ChatRule: Mining Logical Rules with Large Language Models for Knowledge\n Graph Reasoning","summary":" Logical rules are essential for uncovering the logical connections between\nrelations, which could improve the reasoning performance and provide\ninterpretable results on knowledge graphs (KGs). Although there have been many\nefforts to mine meaningful logical rules over KGs, existing methods suffer from\nthe computationally intensive searches over the rule space and a lack of\nscalability for large-scale KGs. Besides, they often ignore the semantics of\nrelations which is crucial for uncovering logical connections. Recently, large\nlanguage models (LLMs) have shown impressive performance in the field of\nnatural language processing and various applications, owing to their emergent\nability and generalizability. In this paper, we propose a novel framework,\nChatRule, unleashing the power of large language models for mining logical\nrules over knowledge graphs. Specifically, the framework is initiated with an\nLLM-based rule generator, leveraging both the semantic and structural\ninformation of KGs to prompt LLMs to generate logical rules. To refine the\ngenerated rules, a rule ranking module estimates the rule quality by\nincorporating facts from existing KGs. Last, a rule validator harnesses the\nreasoning ability of LLMs to validate the logical correctness of ranked rules\nthrough chain-of-thought reasoning. ChatRule is evaluated on four large-scale\nKGs, w.r.t. different rule quality metrics and downstream tasks, showing the\neffectiveness and scalability of our method.\n","authors":["Linhao Luo","Jiaxin Ju","Bo Xiong","Yuan-Fang Li","Gholamreza Haffari","Shirui Pan"],"pdf_url":"https://arxiv.org/pdf/2309.01538v2.pdf","comment":"11 pages, 4 figures"},{"id":"http://arxiv.org/abs/2309.06928v1","updated":"2023-09-13T12:58:09Z","published":"2023-09-13T12:58:09Z","title":"Dynamic Causal Disentanglement Model for Dialogue Emotion Detection","summary":" Emotion detection is a critical technology extensively employed in diverse\nfields. While the incorporation of commonsense knowledge has proven beneficial\nfor existing emotion detection methods, dialogue-based emotion detection\nencounters numerous difficulties and challenges due to human agency and the\nvariability of dialogue content. In dialogues, human emotions tend to accumulate\nin bursts. However, they are often implicitly expressed. This implies that many\ngenuine emotions remain concealed within a plethora of unrelated words and\ndialogues. In this paper, we propose a Dynamic Causal Disentanglement Model\nbased on the separation of hidden variables. 
This model effectively decomposes the content of dialogues\nand investigates the temporal accumulation of emotions, thereby enabling more\nprecise emotion recognition. First, we introduce a novel Causal Directed\nAcyclic Graph (DAG) to establish the correlation between hidden emotional\ninformation and other observed elements. Subsequently, our approach utilizes\npre-extracted personal attributes and utterance topics as guiding factors for\nthe distribution of hidden variables, aiming to separate irrelevant ones.\nSpecifically, we propose a dynamic temporal disentanglement model to infer the\npropagation of utterances and hidden variables, enabling the accumulation of\nemotion-related information throughout the conversation. To guide this\ndisentanglement process, we leverage the ChatGPT-4.0 and LSTM networks to\nextract utterance topics and personal attributes as observed\ninformation. Finally, we test our approach on two popular datasets in dialogue\nemotion detection, and the experimental results verify the model's\nsuperiority.\n","authors":["Yuting Su","Yichen Wei","Weizhi Nie","Sicheng Zhao","Anan Liu"],"pdf_url":"https://arxiv.org/pdf/2309.06928v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.06923v1","updated":"2023-09-13T12:47:40Z","published":"2023-09-13T12:47:40Z","title":"Native Language Identification with Big Bird Embeddings","summary":" Native Language Identification (NLI) intends to classify an author's native\nlanguage based on their writing in another language. Historically, the task has\nheavily relied on time-consuming linguistic feature engineering, and\ntransformer-based NLI models have thus far failed to offer effective, practical\nalternatives. The current work investigates if input size is a limiting factor,\nand shows that classifiers trained using Big Bird embeddings outperform\nlinguistic feature engineering models by a large margin on the Reddit-L2\ndataset. Additionally, we provide further insight into input length\ndependencies, show consistent out-of-sample performance, and qualitatively\nanalyze the embedding space. Given the effectiveness and computational\nefficiency of this method, we believe it offers a promising avenue for future\nNLI work.\n","authors":["Sergey Kramp","Giovanni Cassani","Chris Emmery"],"pdf_url":"https://arxiv.org/pdf/2309.06923v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.06917v1","updated":"2023-09-13T12:30:03Z","published":"2023-09-13T12:30:03Z","title":"Continual Learning with Dirichlet Generative-based Rehearsal","summary":" Recent advancements in data-driven task-oriented dialogue systems (ToDs)\nstruggle with incremental learning due to computational constraints and\ntime-consuming issues. Continual Learning (CL) attempts to solve this by\navoiding intensive pre-training, but it faces the problem of catastrophic\nforgetting (CF). While generative-based rehearsal CL methods have made\nsignificant strides, generating pseudo samples that accurately reflect the\nunderlying task-specific distribution is still a challenge. In this paper, we\npresent Dirichlet Continual Learning (DCL), a novel generative-based rehearsal\nstrategy for CL. Unlike the traditionally used Gaussian latent variable in the\nConditional Variational Autoencoder (CVAE), DCL leverages the flexibility and\nversatility of the Dirichlet distribution to model the latent prior variable.\nThis enables it to efficiently capture sentence-level features of previous\ntasks and effectively guide the generation of pseudo samples. 
In addition, we\nintroduce Jensen-Shannon Knowledge Distillation (JSKD), a robust logit-based\nknowledge distillation method that enhances knowledge transfer during pseudo\nsample generation. Our experiments confirm the efficacy of our approach in both\nintent detection and slot-filling tasks, outperforming state-of-the-art\nmethods.\n","authors":["Min Zeng","Wei Xue","Qifeng Liu","Yike Guo"],"pdf_url":"https://arxiv.org/pdf/2309.06917v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.06435v3","updated":"2023-09-13T12:13:45Z","published":"2023-07-12T20:01:52Z","title":"A Comprehensive Overview of Large Language Models","summary":" Large Language Models (LLMs) have recently demonstrated remarkable\ncapabilities in natural language processing tasks and beyond. This success of\nLLMs has led to a large influx of research contributions in this direction.\nThese works encompass diverse topics such as architectural innovations of the\nunderlying neural networks, context length improvements, model alignment,\ntraining datasets, benchmarking, efficiency and more. With the rapid\ndevelopment of techniques and regular breakthroughs in LLM research, it has\nbecome considerably challenging to perceive the bigger picture of the advances\nin this direction. Considering the rapidly emerging plethora of literature on\nLLMs, it is imperative that the research community is able to benefit from a\nconcise yet comprehensive overview of the recent developments in this field.\nThis article provides that overview to the research community. It not only\nfocuses on a systematic treatment of the existing literature on a broad range\nof LLM related concept, but also pays special attention to providing\ncomprehensive summaries with extensive details about the individual existing\nmodels, datasets and major insights. We also pay heed to aligning our overview\nwith the emerging outlook of this research direction by accounting for the\nother recently materializing reviews of the broader research direction of LLMs.\nOur self-contained comprehensive overview of LLMs discusses relevant background\nconcepts along with covering the advanced topics at the frontier of this\nresearch direction. This review article is intended to not only provide a\nsystematic survey, but also a quick comprehensive reference for the researchers\nand practitioners to draw insights from extensive informative summaries of the\nexisting works to advance the LLM research direction.\n","authors":["Humza Naveed","Asad Ullah Khan","Shi Qiu","Muhammad Saqib","Saeed Anwar","Muhammad Usman","Naveed Akhtar","Nick Barnes","Ajmal Mian"],"pdf_url":"https://arxiv.org/pdf/2307.06435v3.pdf","comment":"Work in-progress"},{"id":"http://arxiv.org/abs/2309.06908v1","updated":"2023-09-13T12:10:54Z","published":"2023-09-13T12:10:54Z","title":"Towards the TopMost: A Topic Modeling System Toolkit","summary":" Topic models have been proposed for decades with various applications and\nrecently refreshed by the neural variational inference. However, these topic\nmodels adopt totally distinct dataset, implementation, and evaluation settings,\nwhich hinders their quick utilization and fair comparisons. This greatly\nhinders the research progress of topic models. To address these issues, in this\npaper we propose a Topic Modeling System Toolkit (TopMost). Compared to\nexisting toolkits, TopMost stands out by covering a wider range of topic\nmodeling scenarios including complete lifecycles with dataset pre-processing,\nmodel training, testing, and evaluations. 
The highly cohesive and decoupled\nmodular design of TopMost enables quick utilization, fair comparisons, and\nflexible extensions of different topic models. This can facilitate the research\nand applications of topic models. Our code, tutorials, and documentation are\navailable at https://github.com/bobxwu/topmost.\n","authors":["Xiaobao Wu","Fengjun Pan","Anh Tuan Luu"],"pdf_url":"https://arxiv.org/pdf/2309.06908v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2305.14020v2","updated":"2023-09-13T11:22:19Z","published":"2023-05-23T12:55:21Z","title":"Does ChatGPT have Theory of Mind?","summary":" Theory of Mind (ToM) is the ability to understand human thinking and\ndecision-making, an ability that plays a crucial role in social interaction\nbetween people, including linguistic communication. This paper investigates to\nwhat extent recent Large Language Models in the ChatGPT tradition possess ToM.\nWe posed six well-known problems that address biases in human reasoning and\ndecision making to two versions of ChatGPT and we compared the results under a\nrange of prompting strategies. While the results concerning ChatGPT-3 were\nsomewhat inconclusive, ChatGPT-4 was shown to arrive at the correct answers\nmore often than would be expected based on chance, although correct answers\nwere often arrived at on the basis of false assumptions or invalid reasoning.\n","authors":["Bart Holterman","Kees van Deemter"],"pdf_url":"https://arxiv.org/pdf/2305.14020v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2305.06908v2","updated":"2023-09-13T10:50:02Z","published":"2023-05-11T15:51:46Z","title":"CoMoSpeech: One-Step Speech and Singing Voice Synthesis via Consistency\n Model","summary":" Denoising diffusion probabilistic models (DDPMs) have shown promising\nperformance for speech synthesis. However, a large number of iterative steps\nare required to achieve high sample quality, which restricts the inference\nspeed. Maintaining sample quality while increasing sampling speed has become a\nchallenging task. In this paper, we propose a \"Co\"nsistency \"Mo\"del-based\n\"Speech\" synthesis method, CoMoSpeech, which achieve speech synthesis through a\nsingle diffusion sampling step while achieving high audio quality. The\nconsistency constraint is applied to distill a consistency model from a\nwell-designed diffusion-based teacher model, which ultimately yields superior\nperformances in the distilled CoMoSpeech. Our experiments show that by\ngenerating audio recordings by a single sampling step, the CoMoSpeech achieves\nan inference speed more than 150 times faster than real-time on a single NVIDIA\nA100 GPU, which is comparable to FastSpeech2, making diffusion-sampling based\nspeech synthesis truly practical. Meanwhile, objective and subjective\nevaluations on text-to-speech and singing voice synthesis show that the\nproposed teacher models yield the best audio quality, and the one-step sampling\nbased CoMoSpeech achieves the best inference speed with better or comparable\naudio quality to other conventional multi-step diffusion model baselines. Audio\nsamples are available at https://comospeech.github.io/.\n","authors":["Zhen Ye","Wei Xue","Xu Tan","Jie Chen","Qifeng Liu","Yike Guo"],"pdf_url":"https://arxiv.org/pdf/2305.06908v2.pdf","comment":"Accepted to ACM MM 2023"},{"id":"http://arxiv.org/abs/2309.06844v1","updated":"2023-09-13T09:49:20Z","published":"2023-09-13T09:49:20Z","title":"Gpachov at CheckThat! 
2023: A Diverse Multi-Approach Ensemble for\n Subjectivity Detection in News Articles","summary":" The wide-spread use of social networks has given rise to subjective,\nmisleading, and even false information on the Internet. Thus, subjectivity\ndetection can play an important role in ensuring the objectiveness and the\nquality of a piece of information. This paper presents the solution built by\nthe Gpachov team for the CLEF-2023 CheckThat! lab Task~2 on subjectivity\ndetection. Three different research directions are explored. The first one is\nbased on fine-tuning a sentence embeddings encoder model and dimensionality\nreduction. The second one explores a sample-efficient few-shot learning model.\nThe third one evaluates fine-tuning a multilingual transformer on an altered\ndataset, using data from multiple languages. Finally, the three approaches are\ncombined in a simple majority voting ensemble, resulting in 0.77 macro F1 on\nthe test set and achieving 2nd place on the English subtask.\n","authors":["Georgi Pachov","Dimitar Dimitrov","Ivan Koychev","Preslav Nakov"],"pdf_url":"https://arxiv.org/pdf/2309.06844v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2306.09704v2","updated":"2023-09-13T09:43:28Z","published":"2023-06-16T09:15:39Z","title":"Cross-corpus Readability Compatibility Assessment for English Texts","summary":" Text readability assessment has gained significant attention from researchers\nin various domains. However, the lack of exploration into corpus compatibility\nposes a challenge as different research groups utilize different corpora. In\nthis study, we propose a novel evaluation framework, Cross-corpus text\nReadability Compatibility Assessment (CRCA), to address this issue. The\nframework encompasses three key components: (1) Corpus: CEFR, CLEC, CLOTH, NES,\nOSP, and RACE. Linguistic features, GloVe word vector representations, and\ntheir fusion features were extracted. (2) Classification models: Machine\nlearning methods (XGBoost, SVM) and deep learning methods (BiLSTM,\nAttention-BiLSTM) were employed. (3) Compatibility metrics: RJSD, RRNSS, and\nNDCG metrics. Our findings revealed: (1) Validated corpus compatibility, with\nOSP standing out as significantly different from other datasets. (2) An\nadaptation effect among corpora, feature representations, and classification\nmethods. (3) Consistent outcomes across the three metrics, validating the\nrobustness of the compatibility assessment framework. The outcomes of this\nstudy offer valuable insights into corpus selection, feature representation,\nand classification methods, and it can also serve as a beginning effort for\ncross-corpus transfer learning.\n","authors":["Zhenzhen Li","Han Ding","Shaohong Zhang"],"pdf_url":"https://arxiv.org/pdf/2306.09704v2.pdf","comment":"14 pages,17 figures"},{"id":"http://arxiv.org/abs/2309.06814v1","updated":"2023-09-13T09:05:09Z","published":"2023-09-13T09:05:09Z","title":"Comparative Analysis of Contextual Relation Extraction based on Deep\n Learning Models","summary":" Contextual Relation Extraction (CRE) is mainly used for constructing a\nknowledge graph with a help of ontology. It performs various tasks such as\nsemantic search, query answering, and textual entailment. Relation extraction\nidentifies the entities from raw texts and the relations among them. An\nefficient and accurate CRE system is essential for creating domain knowledge in\nthe biomedical industry. 
Existing Machine Learning and Natural Language\nProcessing (NLP) techniques are not suitable for efficiently predicting complex relations from\nsentences that consist of more than two relations and unspecified entities.\nIn this work, deep learning techniques have been used to identify\nthe appropriate semantic relation based on the context from multiple sentences.\nEven though various machine learning models have been used for relation\nextraction, they provide better results only for binary relations, i.e.,\nrelations that occur exactly between two entities in a sentence. Machine\nlearning models are not suited for complex sentences that contain words\nwith various meanings. To address these issues, hybrid deep learning\nmodels have been used to extract the relations from complex sentences\neffectively. This paper presents an analysis of various deep learning models\nthat are used for relation extraction.\n","authors":["R. Priyadharshini","G. Jeyakodi","P. Shanthi Bala"],"pdf_url":"https://arxiv.org/pdf/2309.06814v1.pdf","comment":"This paper was presented at the International Conference on FOSS\n Approaches towards Computational Intelligence and Language Technology in\n February 2023, Thiruvananthapuram"},{"id":"http://arxiv.org/abs/2309.06794v1","updated":"2023-09-13T08:33:09Z","published":"2023-09-13T08:33:09Z","title":"Cognitive Mirage: A Review of Hallucinations in Large Language Models","summary":" As large language models continue to develop in the field of AI, text\ngeneration systems are susceptible to a worrisome phenomenon known as\nhallucination. In this study, we summarize recent compelling insights into\nhallucinations in LLMs. We present a novel taxonomy of hallucinations from\nvarious text generation tasks, thus providing theoretical insights, detection\nmethods, and improvement approaches. Based on this, future research directions\nare proposed. Our contributions are threefold: (1) We provide a detailed and\ncomplete taxonomy for hallucinations appearing in text generation tasks; (2) We\nprovide theoretical analyses of hallucinations in LLMs and review existing\ndetection and improvement methods; (3) We propose several research directions\nthat can be developed in the future. As hallucinations garner significant\nattention from the community, we will maintain updates on relevant research\nprogress.\n","authors":["Hongbin Ye","Tong Liu","Aijia Zhang","Wei Hua","Weiqiang Jia"],"pdf_url":"https://arxiv.org/pdf/2309.06794v1.pdf","comment":"work in progress; 21 pages"},{"id":"http://arxiv.org/abs/2302.07267v6","updated":"2023-09-13T07:44:42Z","published":"2023-02-13T17:57:50Z","title":"Diminished Diversity-of-Thought in a Standard Large Language Model","summary":" We test whether Large Language Models (LLMs) can be used to simulate human\nparticipants in social-science studies. To do this, we run replications of 14\nstudies from the Many Labs 2 replication project with OpenAI's text-davinci-003\nmodel, colloquially known as GPT3.5. Based on our pre-registered analyses, we\nfind that among the eight studies we could analyse, our GPT sample replicated\n37.5% of the original results and 37.5% of the Many Labs 2 results. However, we\nwere unable to analyse the remaining six studies due to an unexpected\nphenomenon we call the \"correct answer\" effect. 
Different runs of GPT3.5\nanswered nuanced questions probing political orientation, economic preference,\njudgement, and moral philosophy with zero or near-zero variation in responses:\nwith the supposedly \"correct answer.\" In one exploratory follow-up study, we\nfound that a \"correct answer\" was robust to changing the demographic details\nthat precede the prompt. In another, we found that most but not all \"correct\nanswers\" were robust to changing the order of answer choices. One of our most\nstriking findings occurred in our replication of the Moral Foundations Theory\nsurvey results, where we found GPT3.5 identifying as a political conservative\nin 99.6% of the cases, and as a liberal in 99.3% of the cases in the\nreverse-order condition. However, both self-reported 'GPT conservatives' and\n'GPT liberals' showed right-leaning moral foundations. Our results cast doubts\non the validity of using LLMs as a general replacement for human participants\nin the social sciences. Our results also raise concerns that a hypothetical\nAI-led future may be subject to a diminished diversity-of-thought.\n","authors":["Peter S. Park","Philipp Schoenegger","Chongyang Zhu"],"pdf_url":"https://arxiv.org/pdf/2302.07267v6.pdf","comment":"67 pages (42-page main text, 25-page SI); 12 visualizations (four\n tables and three figures in the main text, five figures in the SI);\n additional exploratory follow-up study varied the demographic details\n preceding the prompt; preregistered OSF database is available at\n https://osf.io/dzp8t/"},{"id":"http://arxiv.org/abs/2309.06759v1","updated":"2023-09-13T07:12:31Z","published":"2023-09-13T07:12:31Z","title":"Scaled Prompt-Tuning for Few-Shot Natural Language Generation","summary":" The increasingly Large Language Models (LLMs) demonstrate stronger language\nunderstanding and generation capabilities, while the memory demand and\ncomputation cost of fine-tuning LLMs on downstream tasks are non-negligible.\nBesides, fine-tuning generally requires a certain amount of data from\nindividual tasks whilst data collection cost is another issue to consider in\nreal-world applications. In this work, we focus on Parameter-Efficient\nFine-Tuning (PEFT) methods for few-shot Natural Language Generation (NLG),\nwhich freeze most parameters in LLMs and tune a small subset of parameters in\nfew-shot cases so that memory footprint, training cost, and labeling cost are\nreduced while maintaining or even improving the performance. We propose a\nScaled Prompt-Tuning (SPT) method which surpasses conventional PT with better\nperformance and generalization ability but without an obvious increase in\ntraining cost. Further study on intermediate SPT suggests the superior\ntransferability of SPT in few-shot scenarios, providing a recipe for\ndata-deficient and computation-limited circumstances. Moreover, a comprehensive\ncomparison of existing PEFT methods reveals that certain approaches exhibiting\ndecent performance with modest training cost such as Prefix-Tuning in prior\nstudy could struggle in few-shot NLG tasks, especially on challenging datasets.\n","authors":["Ting Hu","Christoph Meinel","Haojin Yang"],"pdf_url":"https://arxiv.org/pdf/2309.06759v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.06748v1","updated":"2023-09-13T06:40:24Z","published":"2023-09-13T06:40:24Z","title":"CONVERSER: Few-Shot Conversational Dense Retrieval with Synthetic Data\n Generation","summary":" Conversational search provides a natural interface for information retrieval\n(IR). 
Recent approaches have demonstrated promising results in applying dense\nretrieval to conversational IR. However, training dense retrievers requires\nlarge amounts of in-domain paired data. This hinders the development of\nconversational dense retrievers, as abundant in-domain conversations are\nexpensive to collect. In this paper, we propose CONVERSER, a framework for\ntraining conversational dense retrievers with at most 6 examples of in-domain\ndialogues. Specifically, we utilize the in-context learning capability of large\nlanguage models to generate conversational queries given a passage in the\nretrieval corpus. Experimental results on conversational retrieval benchmarks\nOR-QuAC and TREC CAsT 19 show that the proposed CONVERSER achieves comparable\nperformance to fully-supervised models, demonstrating the effectiveness of our\nproposed framework in few-shot conversational dense retrieval. All source code\nand generated datasets are available at https://github.com/MiuLab/CONVERSER\n","authors":["Chao-Wei Huang","Chen-Yu Hsu","Tsu-Yuan Hsu","Chen-An Li","Yun-Nung Chen"],"pdf_url":"https://arxiv.org/pdf/2309.06748v1.pdf","comment":"Accepted to SIGDIAL 2023"},{"id":"http://arxiv.org/abs/2309.06726v1","updated":"2023-09-13T05:02:11Z","published":"2023-09-13T05:02:11Z","title":"Enhancing Keyphrase Generation by BART Finetuning with Splitting and\n Shuffling","summary":" Keyphrase generation is the task of identifying a set of phrases that best\nrepresent the main topics or themes of a given text. Keyphrases are divided\ninto present and absent keyphrases. Recent approaches utilizing\nsequence-to-sequence models show effectiveness on absent keyphrase generation.\nHowever, the performance is still limited due to the difficulty of finding\nabsent keyphrases. In this paper, we propose Keyphrase-Focused BART, which\nexploits the differences between present and absent keyphrase generation, and\nperforms fine-tuning of two separate BART models for present and absent\nkeyphrases. We further show effective approaches of shuffling keyphrases and\ncandidate keyphrase ranking. For absent keyphrases, our Keyphrase-Focused BART\nachieved a new state-of-the-art score on F1@5 in two out of five keyphrase\ngeneration benchmark datasets.\n","authors":["Bin Chen","Mizuho Iwaihara"],"pdf_url":"https://arxiv.org/pdf/2309.06726v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2306.07848v8","updated":"2023-09-13T04:48:23Z","published":"2023-06-13T15:28:10Z","title":"GEmo-CLAP: Gender-Attribute-Enhanced Contrastive Language-Audio\n Pretraining for Accurate Speech Emotion Recognition","summary":" Contrastive cross-modality pretraining has recently exhibited impressive\nsuccess in diverse fields, whereas there is limited research on its merits in\nspeech emotion recognition (SER). In this paper, we propose GEmo-CLAP, a kind\nof gender-attribute-enhanced contrastive language-audio pretraining (CLAP)\nmethod for SER. Specifically, we first construct an effective emotion CLAP\n(Emo-CLAP) for SER, using pre-trained text and audio encoders. Second, given\nthe significance of gender information in SER, two novel multi-task learning\nbased GEmo-CLAP (ML-GEmo-CLAP) and soft label based GEmo-CLAP (SL-GEmo-CLAP)\nmodels are further proposed to incorporate gender information of speech\nsignals, forming more reasonable objectives. Experiments on IEMOCAP indicate\nthat our proposed two GEmo-CLAPs consistently outperform Emo-CLAP with\ndifferent pre-trained models. 
Remarkably, the proposed WavLM-based SL-GEmo-CLAP\nobtains the best UAR of 81.43% and WAR of 83.16%, which performs better than\nstate-of-the-art SER methods by at least 3%. Our system is open-sourced on\nGithub.\n","authors":["Yu Pan","Yanni Hu","Yuguang Yang","Wen Fei","Jixun Yao","Heng Lu","Lei Ma","Jianjun Zhao"],"pdf_url":"https://arxiv.org/pdf/2306.07848v8.pdf","comment":"5 pages"},{"id":"http://arxiv.org/abs/2309.06706v1","updated":"2023-09-13T04:06:47Z","published":"2023-09-13T04:06:47Z","title":"Simultaneous Machine Translation with Large Language Models","summary":" Large language models (LLM) have demonstrated their abilities to solve\nvarious natural language processing tasks through dialogue-based interactions.\nFor instance, research indicates that LLMs can achieve competitive performance\nin offline machine translation tasks for high-resource languages. However,\napplying LLMs to simultaneous machine translation (SimulMT) poses many\nchallenges, including issues related to the training-inference mismatch arising\nfrom different decoding patterns. In this paper, we explore the feasibility of\nutilizing LLMs for SimulMT. Building upon conventional approaches, we introduce\na simple yet effective mixture policy that enables LLMs to engage in SimulMT\nwithout requiring additional training. Furthermore, after Supervised\nFine-Tuning (SFT) on a mixture of full and prefix sentences, the model exhibits\nsignificant performance improvements. Our experiments, conducted with\nLlama2-7B-chat on nine language pairs from the MUST-C dataset, demonstrate that\nLLM can achieve translation quality and latency comparable to dedicated SimulMT\nmodels.\n","authors":["Minghan Wang","Jinming Zhao","Thuy-Trang Vu","Fatemeh Shiri","Ehsan Shareghi","Gholamreza Haffari"],"pdf_url":"https://arxiv.org/pdf/2309.06706v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.06703v1","updated":"2023-09-13T04:02:38Z","published":"2023-09-13T04:02:38Z","title":"VLSlice: Interactive Vision-and-Language Slice Discovery","summary":" Recent work in vision-and-language demonstrates that large-scale pretraining\ncan learn generalizable models that are efficiently transferable to downstream\ntasks. While this may improve dataset-scale aggregate metrics, analyzing\nperformance around hand-crafted subgroups targeting specific bias dimensions\nreveals systemic undesirable behaviors. However, this subgroup analysis is\nfrequently stalled by annotation efforts, which require extensive time and\nresources to collect the necessary data. Prior art attempts to automatically\ndiscover subgroups to circumvent these constraints but typically leverages\nmodel behavior on existing task-specific annotations and rapidly degrades on\nmore complex inputs beyond \"tabular\" data, none of which study\nvision-and-language models. This paper presents VLSlice, an interactive system\nenabling user-guided discovery of coherent representation-level subgroups with\nconsistent visiolinguistic behavior, denoted as vision-and-language slices,\nfrom unlabeled image sets. We show that VLSlice enables users to quickly\ngenerate diverse high-coherency slices in a user study (n=22) and release the\ntool publicly.\n","authors":["Eric Slyman","Minsuk Kahng","Stefan Lee"],"pdf_url":"https://arxiv.org/pdf/2309.06703v1.pdf","comment":"Conference paper at ICCV 2023. 
17 pages, 11 figures.\n https://ericslyman.com/vlslice/"},{"id":"http://arxiv.org/abs/2308.01825v2","updated":"2023-09-13T03:57:29Z","published":"2023-08-03T15:34:01Z","title":"Scaling Relationship on Learning Mathematical Reasoning with Large\n Language Models","summary":" Mathematical reasoning is a challenging task for large language models\n(LLMs), while the scaling relationship of it with respect to LLM capacity is\nunder-explored. In this paper, we investigate how the pre-training loss,\nsupervised data amount, and augmented data amount influence the reasoning\nperformances of a supervised LLM. We find that pre-training loss is a better\nindicator of the model's performance than the model's parameter count. We apply\nsupervised fine-tuning (SFT) with different amounts of supervised data and\nempirically find a log-linear relation between data amount and model\nperformance, and we find better models improve less with enlarged supervised\ndatasets. To augment more data samples for improving model performances without\nany human effort, we propose to apply Rejection sampling Fine-Tuning (RFT). RFT\nuses supervised models to generate and collect correct reasoning paths as\naugmented fine-tuning datasets. We find with augmented samples containing more\ndistinct reasoning paths, RFT improves mathematical reasoning performance more\nfor LLMs. We also find RFT brings more improvement for less performant LLMs.\nFurthermore, we combine rejection samples from multiple models which push\nLLaMA-7B to an accuracy of 49.3\\% on GSM8K which outperforms the supervised\nfine-tuning (SFT) accuracy of 35.9\\% significantly.\n","authors":["Zheng Yuan","Hongyi Yuan","Chengpeng Li","Guanting Dong","Keming Lu","Chuanqi Tan","Chang Zhou","Jingren Zhou"],"pdf_url":"https://arxiv.org/pdf/2308.01825v2.pdf","comment":"Working in Progress"},{"id":"http://arxiv.org/abs/2309.06175v2","updated":"2023-09-13T03:53:43Z","published":"2023-09-12T12:37:37Z","title":"AKEM: Aligning Knowledge Base to Queries with Ensemble Model for Entity\n Recognition and Linking","summary":" This paper presents a novel approach to address the Entity Recognition and\nLinking Challenge at NLPCC 2015. The task involves extracting named entity\nmentions from short search queries and linking them to entities within a\nreference Chinese knowledge base. To tackle this problem, we first expand the\nexisting knowledge base and utilize external knowledge to identify candidate\nentities, thereby improving the recall rate. Next, we extract features from the\ncandidate entities and utilize Support Vector Regression and Multiple Additive\nRegression Tree as scoring functions to filter the results. Additionally, we\napply rules to further refine the results and enhance precision. Our method is\ncomputationally efficient and achieves an F1 score of 0.535.\n","authors":["Di Lu","Zhongping Liang","Caixia Yuan","Xiaojie Wang"],"pdf_url":"https://arxiv.org/pdf/2309.06175v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.06698v1","updated":"2023-09-13T03:42:28Z","published":"2023-09-13T03:42:28Z","title":"Benchmarking Procedural Language Understanding for Low-Resource\n Languages: A Case Study on Turkish","summary":" Understanding procedural natural language (e.g., step-by-step instructions)\nis a crucial step to execution and planning. However, while there are ample\ncorpora and downstream tasks available in English, the field lacks such\nresources for most languages. To address this gap, we conduct a case study on\nTurkish procedural texts. 
We first expand the number of tutorials in Turkish\nwikiHow from 2,000 to 52,000 using automated translation tools, where the\ntranslation quality and loyalty to the original meaning are validated by a team\nof experts on a random set. Then, we generate several downstream tasks on the\ncorpus, such as linking actions, goal inference, and summarization. To tackle\nthese tasks, we implement strong baseline models via fine-tuning large\nlanguage-specific models such as TR-BART and BERTurk, as well as multilingual\nmodels such as mBART, mT5, and XLM. We find that language-specific models\nconsistently outperform their multilingual counterparts by a significant margin\nacross most procedural language understanding (PLU) tasks. We release our\ncorpus, downstream tasks, and baseline models at https://github.com/\nGGLAB-KU/turkish-plu.\n","authors":["Arda Uzunoğlu","Gözde Gül Şahin"],"pdf_url":"https://arxiv.org/pdf/2309.06698v1.pdf","comment":"9 pages"},{"id":"http://arxiv.org/abs/2308.03266v3","updated":"2023-09-13T03:14:47Z","published":"2023-08-07T03:12:27Z","title":"SeACo-Paraformer: A Non-Autoregressive ASR System with Flexible and\n Effective Hotword Customization Ability","summary":" Hotword customization is one of the remaining concerns in the ASR field -\nit is valuable to enable users of ASR systems to customize names of entities,\npersons, and other phrases to obtain a better experience. The past few years have\nseen effective modeling strategies for ASR contextualization developed, but\nthey still leave room for improvement in training stability and the\ninvisible activation process. In this paper we propose the Semantic-Augmented\nContextual-Paraformer (SeACo-Paraformer), a novel NAR-based ASR system with\nflexible and effective hotword customization ability. It possesses the\nadvantages of AED-based models' accuracy, NAR models' efficiency, and an explicit\ncustomization capacity with superior performance. Through extensive experiments\nwith 50,000 hours of industrial big data, our proposed model outperforms strong\nbaselines in customization. Besides, we explore an efficient way to filter\nlarge-scale incoming hotwords for further improvement. The compared industrial\nmodels, source code, and two hotword test sets are all open source.\n","authors":["Xian Shi","Yexin Yang","Zerui Li","Yanni Chen","Zhifu Gao","Shiliang Zhang"],"pdf_url":"https://arxiv.org/pdf/2308.03266v3.pdf","comment":"submitted to ICASSP2024"},{"id":"http://arxiv.org/abs/2309.05210v2","updated":"2023-09-13T01:59:12Z","published":"2023-09-11T02:58:32Z","title":"Understanding the Impact of Post-Training Quantization on Large Language\n Models","summary":" Large language models (LLMs) are rapidly increasing in size, with the number\nof parameters becoming a key factor in the success of many commercial models,\nsuch as ChatGPT, Claude, and Bard. Even the recently released publicly\naccessible models for commercial usage, such as Falcon and Llama2, come\nequipped with billions of parameters. This significant increase in the number\nof parameters makes deployment and operation very costly. The remarkable\nprogress in the field of quantization for large neural networks in general, and\nLLMs in particular, has made these models more accessible by enabling them to\nbe deployed on consumer-grade GPUs. 
Quantized models generally demonstrate\ncomparable performance levels to their unquantized base counterparts.\nNonetheless, there exists a notable gap in our comprehensive understanding of\nhow these quantized models respond to hyperparameters, such as temperature, max\nnew tokens, and topk, particularly for next word prediction. The present\nanalysis reveals that nf4 and fp4 are equally proficient 4-bit quantization\ntechniques, characterized by similar attributes such as inference speed, memory\nconsumption, and the quality of generated content. Nevertheless, these\nquantization methods exhibit distinct behaviors at varying temperature\nsettings, both in the context of smaller and larger models. It is noteworthy\nthat, in general, 4-bit quantized models of varying sizes exhibit heightened\nsensitivity to lower temperature settings, unlike their unquantized\ncounterparts. Additionally, int8 quantization is associated with significantly\nslower inference speeds, whereas unquantized fp16 models consistently yield the\nfastest inference speeds across models of all sizes.\n","authors":["Somnath Roy"],"pdf_url":"https://arxiv.org/pdf/2309.05210v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.06553v1","updated":"2023-09-13T01:12:52Z","published":"2023-09-13T01:12:52Z","title":"Offline Prompt Evaluation and Optimization with Inverse Reinforcement\n Learning","summary":" The recent advances in the development of Large Language Models (LLMs) like\nChatGPT have achieved remarkable performance by leveraging human expertise.\nYet, fully eliciting LLMs' potential for complex tasks requires navigating the\nvast search space of natural language prompts. While prompt engineering has\nshown promise, the requisite human-crafted prompts in trial-and-error attempts\nand the associated costs pose significant challenges. Crucially, the efficiency\nof prompt optimization hinges on the costly procedure of prompt evaluation.\nThis work introduces Prompt-OIRL, an approach rooted in offline inverse\nreinforcement learning that seeks to bridge the gap between effective prompt\nevaluation and affordability. Our method draws on offline datasets from expert\nevaluations, employing Inverse-RL to derive a reward model for offline,\nquery-dependent prompt evaluations. The advantages of Prompt-OIRL are manifold:\nit predicts prompt performance, is cost-efficient, produces human-readable\nresults, and efficiently navigates the prompt space. We validate our method\nacross four LLMs and three arithmetic datasets, highlighting its potential as a\nrobust and effective tool for offline prompt evaluation and optimization. Our\ncode as well as the offline datasets are released, and we highlight the\nPrompt-OIRL can be reproduced within a few hours using a single laptop using\nCPU\n","authors":["Hao Sun"],"pdf_url":"https://arxiv.org/pdf/2309.06553v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.06657v1","updated":"2023-09-13T01:07:25Z","published":"2023-09-13T01:07:25Z","title":"Statistical Rejection Sampling Improves Preference Optimization","summary":" Improving the alignment of language models with human preferences remains an\nactive research challenge. Previous approaches have primarily utilized\nReinforcement Learning from Human Feedback (RLHF) via online RL methods such as\nProximal Policy Optimization (PPO). 
Recently, offline methods such as Sequence\nLikelihood Calibration (SLiC) and Direct Preference Optimization (DPO) have\nemerged as attractive alternatives, offering improvements in stability and\nscalability while maintaining competitive performance. SLiC refines its loss\nfunction using sequence pairs sampled from a supervised fine-tuned (SFT)\npolicy, while DPO directly optimizes language models based on preference data,\nforegoing the need for a separate reward model. However, the maximum likelihood\nestimator (MLE) of the target optimal policy requires labeled preference pairs\nsampled from that policy. DPO's lack of a reward model constrains its ability\nto sample preference pairs from the optimal policy, and SLiC is restricted to\nsampling preference pairs only from the SFT policy. To address these\nlimitations, we introduce a novel approach called Statistical Rejection\nSampling Optimization (RSO) that aims to source preference data from the target\noptimal policy using rejection sampling, enabling a more accurate estimation of\nthe optimal policy. We also propose a unified framework that enhances the loss\nfunctions used in both SLiC and DPO from a preference modeling standpoint.\nThrough extensive experiments across three diverse tasks, we demonstrate that\nRSO consistently outperforms both SLiC and DPO on evaluations from both Large\nLanguage Model (LLM) and human raters.\n","authors":["Tianqi Liu","Yao Zhao","Rishabh Joshi","Misha Khalman","Mohammad Saleh","Peter J. Liu","Jialu Liu"],"pdf_url":"https://arxiv.org/pdf/2309.06657v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.00184v2","updated":"2023-09-13T22:37:29Z","published":"2023-07-01T00:58:51Z","title":"Personality Traits in Large Language Models","summary":" The advent of large language models (LLMs) has revolutionized natural\nlanguage processing, enabling the generation of coherent and contextually\nrelevant human-like text. As LLMs increasingly power conversational agents used\nby the general public world-wide, the synthetic personality embedded in these\nmodels, by virtue of training on large amounts of human data, is becoming\nincreasingly important. Since personality is a key factor determining the\neffectiveness of communication, we present a comprehensive method for\nadministering and validating personality tests on widely-used LLMs, as well as\nfor shaping personality in the generated text of such LLMs. Applying this\nmethod, we found: 1) personality measurements in the outputs of some LLMs under\nspecific prompting configurations are reliable and valid; 2) evidence of\nreliability and validity of synthetic LLM personality is stronger for larger\nand instruction fine-tuned models; and 3) personality in LLM outputs can be\nshaped along desired dimensions to mimic specific human personality profiles.\nWe discuss application and ethical implications of the measurement and shaping\nmethod, in particular regarding responsible AI.\n","authors":["Greg Serapio-García","Mustafa Safdari","Clément Crepy","Luning Sun","Stephen Fitz","Peter Romero","Marwa Abdulhai","Aleksandra Faust","Maja Matarić"],"pdf_url":"https://arxiv.org/pdf/2307.00184v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.07334v1","updated":"2023-09-13T22:08:12Z","published":"2023-09-13T22:08:12Z","title":"Learning from Auxiliary Sources in Argumentative Revision Classification","summary":" We develop models to classify desirable reasoning revisions in argumentative\nwriting. 
We explore two approaches -- multi-task learning and transfer learning\n-- to take advantage of auxiliary sources of revision data for similar tasks.\nResults of intrinsic and extrinsic evaluations show that both approaches can\nindeed improve classifier performance over baselines. While multi-task learning\nshows that training on different sources of data at the same time may improve\nperformance, transfer-learning better represents the relationship between the\ndata.\n","authors":["Tazin Afrin","Diane Litman"],"pdf_url":"https://arxiv.org/pdf/2309.07334v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.07315v1","updated":"2023-09-13T21:01:03Z","published":"2023-09-13T21:01:03Z","title":"Traveling Words: A Geometric Interpretation of Transformers","summary":" Transformers have significantly advanced the field of natural language\nprocessing, but comprehending their internal mechanisms remains a challenge. In\nthis paper, we introduce a novel geometric perspective that elucidates the\ninner mechanisms of transformer operations. Our primary contribution is\nillustrating how layer normalization confines the latent features to a\nhyper-sphere, subsequently enabling attention to mold the semantic\nrepresentation of words on this surface. This geometric viewpoint seamlessly\nconnects established properties such as iterative refinement and contextual\nembeddings. We validate our insights by probing a pre-trained 124M parameter\nGPT-2 model. Our findings reveal clear query-key attention patterns in early\nlayers and build upon prior observations regarding the subject-specific nature\nof attention heads at deeper layers. Harnessing these geometric insights, we\npresent an intuitive understanding of transformers, depicting them as processes\nthat model the trajectory of word particles along the hyper-sphere.\n","authors":["Raul Molina"],"pdf_url":"https://arxiv.org/pdf/2309.07315v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.07311v1","updated":"2023-09-13T20:57:11Z","published":"2023-09-13T20:57:11Z","title":"Sudden Drops in the Loss: Syntax Acquisition, Phase Transitions, and\n Simplicity Bias in MLMs","summary":" Most interpretability research in NLP focuses on understanding the behavior\nand features of a fully trained model. However, certain insights into model\nbehavior may only be accessible by observing the trajectory of the training\nprocess. In this paper, we present a case study of syntax acquisition in masked\nlanguage models (MLMs). Our findings demonstrate how analyzing the evolution of\ninterpretable artifacts throughout training deepens our understanding of\nemergent behavior. In particular, we study Syntactic Attention Structure (SAS),\na naturally emerging property of MLMs wherein specific Transformer heads tend\nto focus on specific syntactic relations. We identify a brief window in\ntraining when models abruptly acquire SAS and find that this window is\nconcurrent with a steep drop in loss. Moreover, SAS precipitates the subsequent\nacquisition of linguistic capabilities. We then examine the causal role of SAS\nby introducing a regularizer to manipulate SAS during training, and demonstrate\nthat SAS is necessary for the development of grammatical capabilities. 
We\nfurther find that SAS competes with other beneficial traits and capabilities\nduring training, and that briefly suppressing SAS can improve model quality.\nThese findings reveal a real-world example of the relationship between\ndisadvantageous simplicity bias and interpretable breakthrough training\ndynamics.\n","authors":["Angelica Chen","Ravid Schwartz-Ziv","Kyunghyun Cho","Matthew L. Leavitt","Naomi Saphra"],"pdf_url":"https://arxiv.org/pdf/2309.07311v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2305.10403v3","updated":"2023-09-13T20:35:45Z","published":"2023-05-17T17:46:53Z","title":"PaLM 2 Technical Report","summary":" We introduce PaLM 2, a new state-of-the-art language model that has better\nmultilingual and reasoning capabilities and is more compute-efficient than its\npredecessor PaLM. PaLM 2 is a Transformer-based model trained using a mixture\nof objectives. Through extensive evaluations on English and multilingual\nlanguage, and reasoning tasks, we demonstrate that PaLM 2 has significantly\nimproved quality on downstream tasks across different model sizes, while\nsimultaneously exhibiting faster and more efficient inference compared to PaLM.\nThis improved efficiency enables broader deployment while also allowing the\nmodel to respond faster, for a more natural pace of interaction. PaLM 2\ndemonstrates robust reasoning capabilities exemplified by large improvements\nover PaLM on BIG-Bench and other reasoning tasks. PaLM 2 exhibits stable\nperformance on a suite of responsible AI evaluations, and enables\ninference-time control over toxicity without additional overhead or impact on\nother capabilities. Overall, PaLM 2 achieves state-of-the-art performance\nacross a diverse set of tasks and capabilities.\n When discussing the PaLM 2 family, it is important to distinguish between\npre-trained models (of various sizes), fine-tuned variants of these models, and\nthe user-facing products that use these models. In particular, user-facing\nproducts typically include additional pre- and post-processing steps.\nAdditionally, the underlying models may evolve over time. Therefore, one should\nnot expect the performance of user-facing products to exactly match the results\nreported in this report.\n","authors":["Rohan Anil","Andrew M. Dai","Orhan Firat","Melvin Johnson","Dmitry Lepikhin","Alexandre Passos","Siamak Shakeri","Emanuel Taropa","Paige Bailey","Zhifeng Chen","Eric Chu","Jonathan H. Clark","Laurent El Shafey","Yanping Huang","Kathy Meier-Hellstern","Gaurav Mishra","Erica Moreira","Mark Omernick","Kevin Robinson","Sebastian Ruder","Yi Tay","Kefan Xiao","Yuanzhong Xu","Yujing Zhang","Gustavo Hernandez Abrego","Junwhan Ahn","Jacob Austin","Paul Barham","Jan Botha","James Bradbury","Siddhartha Brahma","Kevin Brooks","Michele Catasta","Yong Cheng","Colin Cherry","Christopher A. 
Choquette-Choo","Aakanksha Chowdhery","Clément Crepy","Shachi Dave","Mostafa Dehghani","Sunipa Dev","Jacob Devlin","Mark Díaz","Nan Du","Ethan Dyer","Vlad Feinberg","Fangxiaoyu Feng","Vlad Fienber","Markus Freitag","Xavier Garcia","Sebastian Gehrmann","Lucas Gonzalez","Guy Gur-Ari","Steven Hand","Hadi Hashemi","Le Hou","Joshua Howland","Andrea Hu","Jeffrey Hui","Jeremy Hurwitz","Michael Isard","Abe Ittycheriah","Matthew Jagielski","Wenhao Jia","Kathleen Kenealy","Maxim Krikun","Sneha Kudugunta","Chang Lan","Katherine Lee","Benjamin Lee","Eric Li","Music Li","Wei Li","YaGuang Li","Jian Li","Hyeontaek Lim","Hanzhao Lin","Zhongtao Liu","Frederick Liu","Marcello Maggioni","Aroma Mahendru","Joshua Maynez","Vedant Misra","Maysam Moussalem","Zachary Nado","John Nham","Eric Ni","Andrew Nystrom","Alicia Parrish","Marie Pellat","Martin Polacek","Alex Polozov","Reiner Pope","Siyuan Qiao","Emily Reif","Bryan Richter","Parker Riley","Alex Castro Ros","Aurko Roy","Brennan Saeta","Rajkumar Samuel","Renee Shelby","Ambrose Slone","Daniel Smilkov","David R. So","Daniel Sohn","Simon Tokumine","Dasha Valter","Vijay Vasudevan","Kiran Vodrahalli","Xuezhi Wang","Pidong Wang","Zirui Wang","Tao Wang","John Wieting","Yuhuai Wu","Kelvin Xu","Yunhan Xu","Linting Xue","Pengcheng Yin","Jiahui Yu","Qiao Zhang","Steven Zheng","Ce Zheng","Weikang Zhou","Denny Zhou","Slav Petrov","Yonghui Wu"],"pdf_url":"https://arxiv.org/pdf/2305.10403v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.05475v2","updated":"2023-09-13T20:30:22Z","published":"2023-09-11T14:16:27Z","title":"Zero-shot Learning with Minimum Instruction to Extract Social\n Determinants and Family History from Clinical Notes using GPT Model","summary":" Demographics, Social determinants of health, and family history documented in\nthe unstructured text within the electronic health records are increasingly\nbeing studied to understand how this information can be utilized with the\nstructured data to improve healthcare outcomes. After the GPT models were\nreleased, many studies have applied GPT models to extract this information from\nthe narrative clinical notes. Different from the existing work, our research\nfocuses on investigating the zero-shot learning on extracting this information\ntogether by providing minimum information to the GPT model. We utilize\nde-identified real-world clinical notes annotated for demographics, various\nsocial determinants, and family history information. Given that the GPT model\nmight provide text different from the text in the original data, we explore two\nsets of evaluation metrics, including the traditional NER evaluation metrics\nand semantic similarity evaluation metrics, to completely understand the\nperformance. Our results show that the GPT-3.5 method achieved an average of\n0.975 F1 on demographics extraction, 0.615 F1 on social determinants\nextraction, and 0.722 F1 on family history extraction. 
We believe these results\ncan be further improved through model fine-tuning or few-shots learning.\nThrough the case studies, we also identified the limitations of the GPT models,\nwhich need to be addressed in future research.\n","authors":["Neel Bhate","Ansh Mittal","Zhe He","Xiao Luo"],"pdf_url":"https://arxiv.org/pdf/2309.05475v2.pdf","comment":"5 pages, 4 figures"},{"id":"http://arxiv.org/abs/2306.05064v2","updated":"2023-09-13T19:33:18Z","published":"2023-06-08T09:29:05Z","title":"K2: A Foundation Language Model for Geoscience Knowledge Understanding\n and Utilization","summary":" Large language models (LLMs) have achieved great success in general domains\nof natural language processing. In this paper, we bring LLMs to the realm of\ngeoscience with the objective of advancing research and applications in this\nfield. To this end, we present the first-ever LLM in geoscience, K2, alongside\na suite of resources developed to further promote LLM research within\ngeoscience. For instance, we have curated the first geoscience instruction\ntuning dataset, GeoSignal, which aims to align LLM responses to\ngeoscience-related user queries. Additionally, we have established the first\ngeoscience benchmark, GeoBench, to evaluate LLMs in the context of geoscience.\nIn this work, we experiment with a complete recipe to adapt a pre-trained\ngeneral-domain LLM to the geoscience domain. Specifically, we further train the\nLLaMA-7B model on 5.5B tokens of geoscience text corpus, including over 1\nmillion pieces of geoscience literature, and utilize GeoSignal's supervised\ndata to fine-tune the model. Moreover, we share a protocol that can efficiently\ngather domain-specific data and construct domain-supervised data, even in\nsituations where manpower is scarce. Meanwhile, we equip K2 with the abilities\nof using tools to be a naive geoscience aide. Experiments conducted on the\nGeoBench demonstrate the effectiveness of our approach and datasets on\ngeoscience knowledge understanding and utilization.We open-source all the\ntraining data and K2 model checkpoints at https://github.com/davendw49/k2.\n","authors":["Cheng Deng","Tianhang Zhang","Zhongmou He","Yi Xu","Qiyuan Chen","Yuanyuan Shi","Luoyi Fu","Weinan Zhang","Xinbing Wang","Chenghu Zhou","Zhouhan Lin","Junxian He"],"pdf_url":"https://arxiv.org/pdf/2306.05064v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2304.09960v3","updated":"2023-09-13T18:52:02Z","published":"2023-04-19T20:45:01Z","title":"A Latent Space Theory for Emergent Abilities in Large Language Models","summary":" Languages are not created randomly but rather to communicate information.\nThere is a strong association between languages and their underlying meanings,\nresulting in a sparse joint distribution that is heavily peaked according to\ntheir correlations. Moreover, these peak values happen to match with the\nmarginal distribution of languages due to the sparsity. With the advent of LLMs\ntrained on big data and large models, we can now precisely assess the marginal\ndistribution of languages, providing a convenient means of exploring the sparse\nstructures in the joint distribution for effective inferences. 
In this paper,\nwe categorize languages as either unambiguous or {\\epsilon}-ambiguous and\npresent quantitative results to demonstrate that the emergent abilities of\nLLMs, such as language understanding, in-context learning, chain-of-thought\nprompting, and effective instruction fine-tuning, can all be attributed to\nBayesian inference on the sparse joint distribution of languages.\n","authors":["Hui Jiang"],"pdf_url":"https://arxiv.org/pdf/2304.09960v3.pdf","comment":"17 pages, 3 figures"},{"id":"http://arxiv.org/abs/2309.07251v1","updated":"2023-09-13T18:39:08Z","published":"2023-09-13T18:39:08Z","title":"In-Contextual Bias Suppression for Large Language Models","summary":" Despite their impressive performance in a wide range of NLP tasks, Large\nLanguage Models (LLMs) have been reported to encode worrying-levels of gender\nbias. Prior work has proposed debiasing methods that require human labelled\nexamples, data augmentation and fine-tuning of the LLMs, which are\ncomputationally costly. Moreover, one might not even have access to the\ninternal parameters for performing debiasing such as in the case of\ncommercially available LLMs such as GPT-4. To address this challenge we propose\nbias suppression, a novel alternative to debiasing that does not require access\nto model parameters. We show that text-based preambles, generated from manually\ndesigned templates covering counterfactual statements, can accurately suppress\ngender biases in LLMs. Moreover, we find that descriptive sentences for\noccupations can further suppress gender biases. Interestingly, we find that\nbias suppression has a minimal adverse effect on downstream task performance,\nwhile effectively mitigating the gender biases.\n","authors":["Daisuke Oba","Masahiro Kaneko","Danushka Bollegala"],"pdf_url":"https://arxiv.org/pdf/2309.07251v1.pdf","comment":"13 pages"},{"id":"http://arxiv.org/abs/2308.11764v4","updated":"2023-09-13T18:01:36Z","published":"2023-08-22T20:12:49Z","title":"Halo: Estimation and Reduction of Hallucinations in Open-Source Weak\n Large Language Models","summary":" Large Language Models (LLMs) have revolutionized Natural Language Processing\n(NLP). Although convenient for research and practical applications, open-source\nLLMs with fewer parameters often suffer from severe hallucinations compared to\ntheir larger counterparts. This paper focuses on measuring and reducing\nhallucinations in BLOOM 7B, a representative of such weaker open-source LLMs\nthat are publicly available for research and commercial applications. We\nintroduce HaloCheck, a lightweight BlackBox knowledge-free framework designed\nto quantify the severity of hallucinations in LLMs. Additionally, we explore\ntechniques like knowledge injection and teacher-student approaches to alleviate\nhallucinations in low-parameter LLMs. Our experiments effectively demonstrate\nthe reduction of hallucinations in challenging domains for these LLMs.\n","authors":["Mohamed Elaraby","Mengyin Lu","Jacob Dunn","Xueying Zhang","Yu Wang","Shizhu Liu","Pingchuan Tian","Yuping Wang","Yuxuan Wang"],"pdf_url":"https://arxiv.org/pdf/2308.11764v4.pdf","comment":null}],"Computer Vision and Pattern Recognition":[{"id":"http://arxiv.org/abs/2309.07125v1","updated":"2023-09-13T17:59:56Z","published":"2023-09-13T17:59:56Z","title":"Text-Guided Generation and Editing of Compositional 3D Avatars","summary":" Our goal is to create a realistic 3D facial avatar with hair and accessories\nusing only a text description. 
While this challenge has attracted significant\nrecent interest, existing methods either lack realism, produce unrealistic\nshapes, or do not support editing, such as modifications to the hairstyle. We\nargue that existing methods are limited because they employ a monolithic\nmodeling approach, using a single representation for the head, face, hair, and\naccessories. Our observation is that the hair and face, for example, have very\ndifferent structural qualities that benefit from different representations.\nBuilding on this insight, we generate avatars with a compositional model, in\nwhich the head, face, and upper body are represented with traditional 3D\nmeshes, and the hair, clothing, and accessories with neural radiance fields\n(NeRF). The model-based mesh representation provides a strong geometric prior\nfor the face region, improving realism while enabling editing of the person's\nappearance. By using NeRFs to represent the remaining components, our method is\nable to model and synthesize parts with complex geometry and appearance, such\nas curly hair and fluffy scarves. Our novel system synthesizes these\nhigh-quality compositional avatars from text descriptions. The experimental\nresults demonstrate that our method, Text-guided generation and Editing of\nCompositional Avatars (TECA), produces avatars that are more realistic than\nthose of recent methods while being editable because of their compositional\nnature. For example, our TECA enables the seamless transfer of compositional\nfeatures like hairstyles, scarves, and other accessories between avatars. This\ncapability supports applications such as virtual try-on.\n","authors":["Hao Zhang","Yao Feng","Peter Kulits","Yandong Wen","Justus Thies","Michael J. Black"],"pdf_url":"https://arxiv.org/pdf/2309.07125v1.pdf","comment":"Home page: https://yfeng95.github.io/teca"},{"id":"http://arxiv.org/abs/2309.07122v1","updated":"2023-09-13T17:57:55Z","published":"2023-09-13T17:57:55Z","title":"Tree-Structured Shading Decomposition","summary":" We study inferring a tree-structured representation from a single image for\nobject shading. Prior work typically uses the parametric or measured\nrepresentation to model shading, which is neither interpretable nor easily\neditable. We propose using the shade tree representation, which combines basic\nshading nodes and compositing methods to factorize object surface shading. The\nshade tree representation enables novice users who are unfamiliar with the\nphysical shading process to edit object shading in an efficient and intuitive\nmanner. A main challenge in inferring the shade tree is that the inference\nproblem involves both the discrete tree structure and the continuous parameters\nof the tree nodes. We propose a hybrid approach to address this issue. We\nintroduce an auto-regressive inference model to generate a rough estimation of\nthe tree structure and node parameters, and then we fine-tune the inferred\nshade tree through an optimization algorithm. We show experiments on synthetic\nimages, captured reflectance, real images, and non-realistic vector drawings,\nallowing downstream applications such as material editing, vectorized shading,\nand relighting. Project website: https://chen-geng.com/inv-shade-trees\n","authors":["Chen Geng","Hong-Xing Yu","Sharon Zhang","Maneesh Agrawala","Jiajun Wu"],"pdf_url":"https://arxiv.org/pdf/2309.07122v1.pdf","comment":"Accepted at ICCV 2023. 
Project website:\n https://chen-geng.com/inv-shade-trees"},{"id":"http://arxiv.org/abs/2309.07120v1","updated":"2023-09-13T17:57:21Z","published":"2023-09-13T17:57:21Z","title":"Sight Beyond Text: Multi-Modal Training Enhances LLMs in Truthfulness\n and Ethics","summary":" Multi-modal large language models (MLLMs) are trained based on large language\nmodels (LLM), with an enhanced capability to comprehend multi-modal inputs and\ngenerate textual responses. While they excel in multi-modal tasks, the pure NLP\nabilities of MLLMs are often underestimated and left untested. In this study,\nwe get out of the box and unveil an intriguing characteristic of MLLMs -- our\npreliminary results suggest that visual instruction tuning, a prevailing\nstrategy for transitioning LLMs into MLLMs, unexpectedly and interestingly\nhelps models attain both improved truthfulness and ethical alignment in the\npure NLP context. For example, a visual-instruction-tuned LLaMA2 7B model\nsurpasses the performance of the LLaMA2-chat 7B model, fine-tuned with over one\nmillion human annotations, on TruthfulQA-mc and Ethics benchmarks. Further\nanalysis reveals that the improved alignment can be attributed to the superior\ninstruction quality inherent to visual-text data. In releasing our code at\ngithub.com/UCSC-VLAA/Sight-Beyond-Text, we aspire to foster further exploration\ninto the intrinsic value of visual-text synergies and, in a broader scope,\nmulti-modal interactions in alignment research.\n","authors":["Haoqin Tu","Bingchen Zhao","Chen Wei","Cihang Xie"],"pdf_url":"https://arxiv.org/pdf/2309.07120v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.07117v1","updated":"2023-09-13T17:55:11Z","published":"2023-09-13T17:55:11Z","title":"PILOT: A Pre-Trained Model-Based Continual Learning Toolbox","summary":" While traditional machine learning can effectively tackle a wide range of\nproblems, it primarily operates within a closed-world setting, which presents\nlimitations when dealing with streaming data. As a solution, incremental\nlearning emerges to address real-world scenarios involving new data's arrival.\nRecently, pre-training has made significant advancements and garnered the\nattention of numerous researchers. The strong performance of these pre-trained\nmodels (PTMs) presents a promising avenue for developing continual learning\nalgorithms that can effectively adapt to real-world scenarios. Consequently,\nexploring the utilization of PTMs in incremental learning has become essential.\nThis paper introduces a pre-trained model-based continual learning toolbox\nknown as PILOT. On the one hand, PILOT implements some state-of-the-art\nclass-incremental learning algorithms based on pre-trained models, such as L2P,\nDualPrompt, and CODA-Prompt. On the other hand, PILOT also fits typical\nclass-incremental learning algorithms (e.g., DER, FOSTER, and MEMO) within the\ncontext of pre-trained models to evaluate their effectiveness.\n","authors":["Hai-Long Sun","Da-Wei Zhou","Han-Jia Ye","De-Chuan Zhan"],"pdf_url":"https://arxiv.org/pdf/2309.07117v1.pdf","comment":"Code is available at https://github.com/sun-hailong/LAMDA-PILOT"},{"id":"http://arxiv.org/abs/2309.07115v1","updated":"2023-09-13T17:45:41Z","published":"2023-09-13T17:45:41Z","title":"Weakly-Supervised Multi-Task Learning for Audio-Visual Speaker\n Verification","summary":" In this paper, we present a methodology for achieving robust multimodal\nperson representations optimized for open-set audio-visual speaker\nverification. 
Distance Metric Learning (DML) approaches have typically\ndominated this problem space, owing to strong performance on new and unseen\nclasses. In our work, we explored multitask learning techniques to further\nboost performance of the DML approach and show that an auxiliary task with weak\nlabels can increase the compactness of the learned speaker representation. We\nalso extend the Generalized end-to-end loss (GE2E) to multimodal inputs and\ndemonstrate that it can achieve competitive performance in an audio-visual\nspace. Finally, we introduce a non-synchronous audio-visual sampling random\nstrategy during training time that has shown to improve generalization. Our\nnetwork achieves state of the art performance for speaker verification,\nreporting 0.244%, 0.252%, 0.441% Equal Error Rate (EER) on the three official\ntrial lists of VoxCeleb1-O/E/H, which is to our knowledge, the best published\nresults on VoxCeleb1-E and VoxCeleb1-H.\n","authors":["Anith Selvakumar","Homa Fashandi"],"pdf_url":"https://arxiv.org/pdf/2309.07115v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.07113v1","updated":"2023-09-13T17:37:19Z","published":"2023-09-13T17:37:19Z","title":"Contrastive Deep Encoding Enables Uncertainty-aware\n Machine-learning-assisted Histopathology","summary":" Deep neural network models can learn clinically relevant features from\nmillions of histopathology images. However generating high-quality annotations\nto train such models for each hospital, each cancer type, and each diagnostic\ntask is prohibitively laborious. On the other hand, terabytes of training data\n-- while lacking reliable annotations -- are readily available in the public\ndomain in some cases. In this work, we explore how these large datasets can be\nconsciously utilized to pre-train deep networks to encode informative\nrepresentations. We then fine-tune our pre-trained models on a fraction of\nannotated training data to perform specific downstream tasks. We show that our\napproach can reach the state-of-the-art (SOTA) for patch-level classification\nwith only 1-10% randomly selected annotations compared to other SOTA\napproaches. Moreover, we propose an uncertainty-aware loss function, to\nquantify the model confidence during inference. Quantified uncertainty helps\nexperts select the best instances to label for further training. Our\nuncertainty-aware labeling reaches the SOTA with significantly fewer\nannotations compared to random labeling. Last, we demonstrate how our\npre-trained encoders can surpass current SOTA for whole-slide image\nclassification with weak supervision. Our work lays the foundation for data and\ntask-agnostic pre-trained deep networks with quantified uncertainty.\n","authors":["Nirhoshan Sivaroopan","Chamuditha Jayanga","Chalani Ekanayake","Hasindri Watawana","Jathurshan Pradeepkumar","Mithunjha Anandakumar","Ranga Rodrigo","Chamira U. S. Edussooriya","Dushan N. Wadduwage"],"pdf_url":"https://arxiv.org/pdf/2309.07113v1.pdf","comment":"18 pages, 8 figures"},{"id":"http://arxiv.org/abs/2309.07106v1","updated":"2023-09-13T17:25:52Z","published":"2023-09-13T17:25:52Z","title":"Hardening RGB-D Object Recognition Systems against Adversarial Patch\n Attacks","summary":" RGB-D object recognition systems improve their predictive performances by\nfusing color and depth information, outperforming neural network architectures\nthat rely solely on colors. 
While RGB-D systems are expected to be more robust\nto adversarial examples than RGB-only systems, they have also been proven to be\nhighly vulnerable. Their robustness is similar even when the adversarial\nexamples are generated by altering only the original images' colors. Different\nworks highlighted the vulnerability of RGB-D systems; however, there is a\nlacking of technical explanations for this weakness. Hence, in our work, we\nbridge this gap by investigating the learned deep representation of RGB-D\nsystems, discovering that color features make the function learned by the\nnetwork more complex and, thus, more sensitive to small perturbations. To\nmitigate this problem, we propose a defense based on a detection mechanism that\nmakes RGB-D systems more robust against adversarial examples. We empirically\nshow that this defense improves the performances of RGB-D systems against\nadversarial examples even when they are computed ad-hoc to circumvent this\ndetection mechanism, and that is also more effective than adversarial training.\n","authors":["Yang Zheng","Luca Demetrio","Antonio Emanuele Cinà","Xiaoyi Feng","Zhaoqiang Xia","Xiaoyue Jiang","Ambra Demontis","Battista Biggio","Fabio Roli"],"pdf_url":"https://arxiv.org/pdf/2309.07106v1.pdf","comment":"Accepted for publication in the Information Sciences journal"},{"id":"http://arxiv.org/abs/2309.07104v1","updated":"2023-09-13T17:25:06Z","published":"2023-09-13T17:25:06Z","title":"Polygon Intersection-over-Union Loss for Viewpoint-Agnostic Monocular 3D\n Vehicle Detection","summary":" Monocular 3D object detection is a challenging task because depth information\nis difficult to obtain from 2D images. A subset of viewpoint-agnostic monocular\n3D detection methods also do not explicitly leverage scene homography or\ngeometry during training, meaning that a model trained thusly can detect\nobjects in images from arbitrary viewpoints. Such works predict the projections\nof the 3D bounding boxes on the image plane to estimate the location of the 3D\nboxes, but these projections are not rectangular so the calculation of IoU\nbetween these projected polygons is not straightforward. This work proposes an\nefficient, fully differentiable algorithm for the calculation of IoU between\ntwo convex polygons, which can be utilized to compute the IoU between two 3D\nbounding box footprints viewed from an arbitrary angle. We test the performance\nof the proposed polygon IoU loss (PIoU loss) on three state-of-the-art\nviewpoint-agnostic 3D detection models. Experiments demonstrate that the\nproposed PIoU loss converges faster than L1 loss and that in 3D detection\nmodels, a combination of PIoU loss and L1 loss gives better results than L1\nloss alone (+1.64% AP70 for MonoCon on cars, +0.18% AP70 for RTM3D on cars, and\n+0.83%/+2.46% AP50/AP25 for MonoRCNN on cyclists).\n","authors":["Derek Gloudemans","Xinxuan Lu","Shepard Xia","Daniel B. Work"],"pdf_url":"https://arxiv.org/pdf/2309.07104v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.07094v1","updated":"2023-09-13T17:10:23Z","published":"2023-09-13T17:10:23Z","title":"RadarLCD: Learnable Radar-based Loop Closure Detection Pipeline","summary":" Loop Closure Detection (LCD) is an essential task in robotics and computer\nvision, serving as a fundamental component for various applications across\ndiverse domains. These applications encompass object recognition, image\nretrieval, and video analysis. 
LCD consists in identifying whether a robot has\nreturned to a previously visited location, referred to as a loop, and then\nestimating the related roto-translation with respect to the analyzed location.\nDespite the numerous advantages of radar sensors, such as their ability to\noperate under diverse weather conditions and provide a wider range of view\ncompared to other commonly used sensors (e.g., cameras or LiDARs), integrating\nradar data remains an arduous task due to intrinsic noise and distortion. To\naddress this challenge, this research introduces RadarLCD, a novel supervised\ndeep learning pipeline specifically designed for Loop Closure Detection using\nthe FMCW Radar (Frequency Modulated Continuous Wave) sensor. RadarLCD, a\nlearning-based LCD methodology explicitly designed for radar systems, makes a\nsignificant contribution by leveraging the pre-trained HERO (Hybrid Estimation\nRadar Odometry) model. Being originally developed for radar odometry, HERO's\nfeatures are used to select key points crucial for LCD tasks. The methodology\nundergoes evaluation across a variety of FMCW Radar dataset scenes, and it is\ncompared to state-of-the-art systems such as Scan Context for Place Recognition\nand ICP for Loop Closure. The results demonstrate that RadarLCD surpasses the\nalternatives in multiple aspects of Loop Closure Detection.\n","authors":["Mirko Usuelli","Matteo Frosi","Paolo Cudrano","Simone Mentasti","Matteo Matteucci"],"pdf_url":"https://arxiv.org/pdf/2309.07094v1.pdf","comment":"7 pages, 2 figures"},{"id":"http://arxiv.org/abs/2309.07087v1","updated":"2023-09-13T16:59:50Z","published":"2023-09-13T16:59:50Z","title":"Developing a Novel Image Marker to Predict the Responses of Neoadjuvant\n Chemotherapy (NACT) for Ovarian Cancer Patients","summary":" Objective: Neoadjuvant chemotherapy (NACT) is one kind of treatment for\nadvanced stage ovarian cancer patients. However, due to the nature of tumor\nheterogeneity, the patients' responses to NACT varies significantly among\ndifferent subgroups. To address this clinical challenge, the purpose of this\nstudy is to develop a novel image marker to achieve high accuracy response\nprediction of the NACT at an early stage. Methods: For this purpose, we first\ncomputed a total of 1373 radiomics features to quantify the tumor\ncharacteristics, which can be grouped into three categories: geometric,\nintensity, and texture features. Second, all these features were optimized by\nprincipal component analysis algorithm to generate a compact and informative\nfeature cluster. Using this cluster as the input, an SVM based classifier was\ndeveloped and optimized to create a final marker, indicating the likelihood of\nthe patient being responsive to the NACT treatment. To validate this scheme, a\ntotal of 42 ovarian cancer patients were retrospectively collected. A nested\nleave-one-out cross-validation was adopted for model performance assessment.\nResults: The results demonstrate that the new method yielded an AUC (area under\nthe ROC [receiver characteristic operation] curve) of 0.745. Meanwhile, the\nmodel achieved overall accuracy of 76.2%, positive predictive value of 70%, and\nnegative predictive value of 78.1%. Conclusion: This study provides meaningful\ninformation for the development of radiomics based image markers in NACT\nresponse prediction.\n","authors":["Ke Zhang","Neman Abdoli","Patrik Gilley","Youkabed Sadri","Xuxin Chen","Theresa C. Thai","Lauren Dockery","Kathleen Moore","Robert S. 
Mannel","Yuchen Qiu"],"pdf_url":"https://arxiv.org/pdf/2309.07087v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.07085v1","updated":"2023-09-13T16:53:48Z","published":"2023-09-13T16:53:48Z","title":"Mitigating Group Bias in Federated Learning for Heterogeneous Devices","summary":" Federated Learning is emerging as a privacy-preserving model training\napproach in distributed edge applications. As such, most edge deployments are\nheterogeneous in nature i.e., their sensing capabilities and environments vary\nacross deployments. This edge heterogeneity violates the independence and\nidentical distribution (IID) property of local data across clients and produces\nbiased global models i.e. models that contribute to unfair decision-making and\ndiscrimination against a particular community or a group. Existing bias\nmitigation techniques only focus on bias generated from label heterogeneity in\nnon-IID data without accounting for domain variations due to feature\nheterogeneity and do not address global group-fairness property.\n Our work proposes a group-fair FL framework that minimizes group-bias while\npreserving privacy and without resource utilization overhead. Our main idea is\nto leverage average conditional probabilities to compute a cross-domain group\n\\textit{importance weights} derived from heterogeneous training data to\noptimize the performance of the worst-performing group using a modified\nmultiplicative weights update method. Additionally, we propose regularization\ntechniques to minimize the difference between the worst and best-performing\ngroups while making sure through our thresholding mechanism to strike a balance\nbetween bias reduction and group performance degradation. Our evaluation of\nhuman emotion recognition and image classification benchmarks assesses the fair\ndecision-making of our framework in real-world heterogeneous settings.\n","authors":["Khotso Selialia","Yasra Chandio","Fatima M. Anwar"],"pdf_url":"https://arxiv.org/pdf/2309.07085v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2305.18691v2","updated":"2023-09-13T16:52:55Z","published":"2023-05-30T02:24:03Z","title":"Edge-MoE: Memory-Efficient Multi-Task Vision Transformer Architecture\n with Task-level Sparsity via Mixture-of-Experts","summary":" Computer vision researchers are embracing two promising paradigms: Vision\nTransformers (ViTs) and Multi-task Learning (MTL), which both show great\nperformance but are computation-intensive, given the quadratic complexity of\nself-attention in ViT and the need to activate an entire large MTL model for\none task. M$^3$ViT is the latest multi-task ViT model that introduces\nmixture-of-experts (MoE), where only a small portion of subnetworks (\"experts\")\nare sparsely and dynamically activated based on the current task. 
M$^3$ViT\nachieves better accuracy and over 80% computation reduction but leaves\nchallenges for efficient deployment on FPGA.\n Our work, dubbed Edge-MoE, solves the challenges to introduce the first\nend-to-end FPGA accelerator for multi-task ViT with a collection of\narchitectural innovations, including (1) a novel reordering mechanism for\nself-attention, which requires only constant bandwidth regardless of the target\nparallelism; (2) a fast single-pass softmax approximation; (3) an accurate and\nlow-cost GELU approximation; (4) a unified and flexible computing unit that is\nshared by almost all computational layers to maximally reduce resource usage;\nand (5) uniquely for M$^3$ViT, a novel patch reordering method to eliminate\nmemory access overhead. Edge-MoE achieves 2.24x and 4.90x better energy\nefficiency comparing with GPU and CPU, respectively. A real-time video\ndemonstration is available online, along with our open-source code written\nusing High-Level Synthesis.\n","authors":["Rishov Sarkar","Hanxue Liang","Zhiwen Fan","Zhangyang Wang","Cong Hao"],"pdf_url":"https://arxiv.org/pdf/2305.18691v2.pdf","comment":"11 pages, 12 figures. Accepted at ICCAD 2023"},{"id":"http://arxiv.org/abs/2309.07084v1","updated":"2023-09-13T16:52:23Z","published":"2023-09-13T16:52:23Z","title":"SupFusion: Supervised LiDAR-Camera Fusion for 3D Object Detection","summary":" In this paper, we propose a novel training strategy called SupFusion, which\nprovides an auxiliary feature level supervision for effective LiDAR-Camera\nfusion and significantly boosts detection performance. Our strategy involves a\ndata enhancement method named Polar Sampling, which densifies sparse objects\nand trains an assistant model to generate high-quality features as the\nsupervision. These features are then used to train the LiDAR-Camera fusion\nmodel, where the fusion feature is optimized to simulate the generated\nhigh-quality features. Furthermore, we propose a simple yet effective deep\nfusion module, which contiguously gains superior performance compared with\nprevious fusion methods with SupFusion strategy. In such a manner, our proposal\nshares the following advantages. Firstly, SupFusion introduces auxiliary\nfeature-level supervision which could boost LiDAR-Camera detection performance\nwithout introducing extra inference costs. Secondly, the proposed deep fusion\ncould continuously improve the detector's abilities. Our proposed SupFusion and\ndeep fusion module is plug-and-play, we make extensive experiments to\ndemonstrate its effectiveness. Specifically, we gain around 2% 3D mAP\nimprovements on KITTI benchmark based on multiple LiDAR-Camera 3D detectors.\n","authors":["Yiran Qin","Chaoqun Wang","Zijian Kang","Ningning Ma","Zhen Li","Ruimao Zhang"],"pdf_url":"https://arxiv.org/pdf/2309.07084v1.pdf","comment":"Accepted to ICCV2023"},{"id":"http://arxiv.org/abs/2309.07068v1","updated":"2023-09-13T16:28:43Z","published":"2023-09-13T16:28:43Z","title":"FAIR: Frequency-aware Image Restoration for Industrial Visual Anomaly\n Detection","summary":" Image reconstruction-based anomaly detection models are widely explored in\nindustrial visual inspection. However, existing models usually suffer from the\ntrade-off between normal reconstruction fidelity and abnormal reconstruction\ndistinguishability, which damages the performance. In this paper, we find that\nthe above trade-off can be better mitigated by leveraging the distinct\nfrequency biases between normal and abnormal reconstruction errors. 
To this\nend, we propose Frequency-aware Image Restoration (FAIR), a novel\nself-supervised image restoration task that restores images from their\nhigh-frequency components. It enables precise reconstruction of normal patterns\nwhile mitigating unfavorable generalization to anomalies. Using only a simple\nvanilla UNet, FAIR achieves state-of-the-art performance with higher efficiency\non various defect detection datasets. Code: https://github.com/liutongkun/FAIR.\n","authors":["Tongkun Liu","Bing Li","Xiao Du","Bingke Jiang","Leqi Geng","Feiyang Wang","Zhuo Zhao"],"pdf_url":"https://arxiv.org/pdf/2309.07068v1.pdf","comment":"12 pages, 10 figures"},{"id":"http://arxiv.org/abs/2306.13933v2","updated":"2023-09-13T16:26:08Z","published":"2023-06-24T10:44:02Z","title":"Boost Video Frame Interpolation via Motion Adaptation","summary":" Video frame interpolation (VFI) is a challenging task that aims to generate\nintermediate frames between two consecutive frames in a video. Existing\nlearning-based VFI methods have achieved great success, but they still suffer\nfrom limited generalization ability due to the limited motion distribution of\ntraining datasets. In this paper, we propose a novel optimization-based VFI\nmethod that can adapt to unseen motions at test time. Our method is based on a\ncycle-consistency adaptation strategy that leverages the motion characteristics\namong video frames. We also introduce a lightweight adapter that can be\ninserted into the motion estimation module of existing pre-trained VFI models\nto improve the efficiency of adaptation. Extensive experiments on various\nbenchmarks demonstrate that our method can boost the performance of two-frame\nVFI models, outperforming the existing state-of-the-art methods, even those\nthat use extra input.\n","authors":["Haoning Wu","Xiaoyun Zhang","Weidi Xie","Ya Zhang","Yanfeng Wang"],"pdf_url":"https://arxiv.org/pdf/2306.13933v2.pdf","comment":"Accepted by BMVC 2023 (Oral Presentation)"},{"id":"http://arxiv.org/abs/2309.07054v1","updated":"2023-09-13T16:12:11Z","published":"2023-09-13T16:12:11Z","title":"Aggregating Long-term Sharp Features via Hybrid Transformers for Video\n Deblurring","summary":" Video deblurring methods, aiming at recovering consecutive sharp frames from\na given blurry video, usually assume that the input video suffers from\nconsecutively blurry frames. However, in real-world blurry videos taken by\nmodern imaging devices, sharp frames usually appear in the given video, thus\nmaking temporal long-term sharp features available for facilitating the\nrestoration of a blurry frame. In this work, we propose a video deblurring\nmethod that leverages both neighboring frames and present sharp frames using\nhybrid Transformers for feature aggregation. Specifically, we first train a\nblur-aware detector to distinguish between sharp and blurry frames. Then, a\nwindow-based local Transformer is employed for exploiting features from\nneighboring frames, where cross attention is beneficial for aggregating\nfeatures from neighboring frames without explicit spatial alignment. To\naggregate long-term sharp features from detected sharp frames, we utilize a\nglobal Transformer with multi-scale matching capability. Moreover, our method\ncan easily be extended to event-driven video deblurring by incorporating an\nevent fusion module into the global Transformer. 
Extensive experiments on\nbenchmark datasets demonstrate that our proposed method outperforms\nstate-of-the-art video deblurring methods as well as event-driven video\ndeblurring methods in terms of quantitative metrics and visual quality. The\nsource code and trained models are available at\nhttps://github.com/shangwei5/STGTN.\n","authors":["Dongwei Ren","Wei Shang","Yi Yang","Wangmeng Zuo"],"pdf_url":"https://arxiv.org/pdf/2309.07054v1.pdf","comment":"13 pages, 11 figures, and the code is available at\n https://github.com/shangwei5/STGTN"},{"id":"http://arxiv.org/abs/2309.07021v1","updated":"2023-09-13T15:23:43Z","published":"2023-09-13T15:23:43Z","title":"Exploiting Multiple Priors for Neural 3D Indoor Reconstruction","summary":" Neural implicit modeling permits to achieve impressive 3D reconstruction\nresults on small objects, while it exhibits significant limitations in large\nindoor scenes. In this work, we propose a novel neural implicit modeling method\nthat leverages multiple regularization strategies to achieve better\nreconstructions of large indoor environments, while relying only on images. A\nsparse but accurate depth prior is used to anchor the scene to the initial\nmodel. A dense but less accurate depth prior is also introduced, flexible\nenough to still let the model diverge from it to improve the estimated\ngeometry. Then, a novel self-supervised strategy to regularize the estimated\nsurface normals is presented. Finally, a learnable exposure compensation scheme\npermits to cope with challenging lighting conditions. Experimental results show\nthat our approach produces state-of-the-art 3D reconstructions in challenging\nindoor scenarios.\n","authors":["Federico Lincetto","Gianluca Agresti","Mattia Rossi","Pietro Zanuttigh"],"pdf_url":"https://arxiv.org/pdf/2309.07021v1.pdf","comment":"Accepted at the British Machine Vision Conference (BMVC) 2023"},{"id":"http://arxiv.org/abs/2308.15366v3","updated":"2023-09-13T14:58:14Z","published":"2023-08-29T15:02:53Z","title":"AnomalyGPT: Detecting Industrial Anomalies using Large Vision-Language\n Models","summary":" Large Vision-Language Models (LVLMs) such as MiniGPT-4 and LLaVA have\ndemonstrated the capability of understanding images and achieved remarkable\nperformance in various visual tasks. Despite their strong abilities in\nrecognizing common objects due to extensive training datasets, they lack\nspecific domain knowledge and have a weaker understanding of localized details\nwithin objects, which hinders their effectiveness in the Industrial Anomaly\nDetection (IAD) task. On the other hand, most existing IAD methods only provide\nanomaly scores and necessitate the manual setting of thresholds to distinguish\nbetween normal and abnormal samples, which restricts their practical\nimplementation. In this paper, we explore the utilization of LVLM to address\nthe IAD problem and propose AnomalyGPT, a novel IAD approach based on LVLM. We\ngenerate training data by simulating anomalous images and producing\ncorresponding textual descriptions for each image. We also employ an image\ndecoder to provide fine-grained semantic and design a prompt learner to\nfine-tune the LVLM using prompt embeddings. Our AnomalyGPT eliminates the need\nfor manual threshold adjustments, thus directly assesses the presence and\nlocations of anomalies. Additionally, AnomalyGPT supports multi-turn dialogues\nand exhibits impressive few-shot in-context learning capabilities. 
With only\none normal shot, AnomalyGPT achieves the state-of-the-art performance with an\naccuracy of 86.1%, an image-level AUC of 94.1%, and a pixel-level AUC of 95.3%\non the MVTec-AD dataset. Code is available at\nhttps://github.com/CASIA-IVA-Lab/AnomalyGPT.\n","authors":["Zhaopeng Gu","Bingke Zhu","Guibo Zhu","Yingying Chen","Ming Tang","Jinqiao Wang"],"pdf_url":"https://arxiv.org/pdf/2308.15366v3.pdf","comment":"Project page: https://anomalygpt.github.io"},{"id":"http://arxiv.org/abs/2301.10670v2","updated":"2023-09-13T14:57:44Z","published":"2023-01-25T16:20:01Z","title":"TMSA: Towards Arbitrary Text-driven Image Manipulation via Space\n Alignment","summary":" The recent GAN inversion methods have been able to successfully invert the\nreal image input to the corresponding editable latent code in StyleGAN. By\ncombining with the language-vision model (CLIP), some text-driven image\nmanipulation methods are proposed. However, these methods require extra costs\nto perform optimization for a certain image or a new attribute editing mode. To\nachieve a more efficient editing method, we propose a new Text-driven image\nManipulation framework via Space Alignment (TMSA). The Space Alignment module\naims to align the same semantic regions in CLIP and StyleGAN spaces. Then, the\ntext input can be directly accessed into the StyleGAN space and be used to find\nthe semantic shift according to the text description. The framework can support\narbitrary image editing mode without additional cost. Our work provides the\nuser with an interface to control the attributes of a given image according to\ntext input and get the result in real time. Extensive experiments demonstrate\nour superior performance over prior works.\n","authors":["Yunpeng Bai","Zihan Zhong","Chao Dong","Weichen Zhang","Guowei Xu","Chun Yuan"],"pdf_url":"https://arxiv.org/pdf/2301.10670v2.pdf","comment":"8 pages, 12 figures"},{"id":"http://arxiv.org/abs/2212.01173v3","updated":"2023-09-13T14:52:30Z","published":"2022-12-02T13:55:41Z","title":"DWRSeg: Rethinking Efficient Acquisition of Multi-scale Contextual\n Information for Real-time Semantic Segmentation","summary":" Many current works directly adopt multi-rate depth-wise dilated convolutions\nto capture multi-scale contextual information simultaneously from one input\nfeature map, thus improving the feature extraction efficiency for real-time\nsemantic segmentation. However, this design may lead to difficult access to\nmulti-scale contextual information because of the unreasonable structure and\nhyperparameters. To lower the difficulty of drawing multi-scale contextual\ninformation, we propose a highly efficient multi-scale feature extraction\nmethod, which decomposes the original single-step method into two steps, Region\nResidualization-Semantic Residualization. In this method, the multi-rate\ndepth-wise dilated convolutions take a simpler role in feature extraction:\nperforming simple semantic-based morphological filtering with one desired\nreceptive field in the second step based on each concise feature map of region\nform provided by the first step, to improve their efficiency. Moreover, the\ndilation rates and the capacity of dilated convolutions for each network stage\nare elaborated to fully utilize all the feature maps of region form that can be\nachieved. Accordingly, we design a novel Dilation-wise Residual (DWR) module and\na Simple Inverted Residual (SIR) module for the high and low level network,\nrespectively, and form a powerful DWR Segmentation (DWRSeg) network. 
Extensive\nexperiments on the Cityscapes and CamVid datasets demonstrate the effectiveness\nof our method by achieving a state-of-the-art trade-off between accuracy and\ninference speed, in addition to being lighter weight. Without pretraining or\nresorting to any training trick, we achieve an mIoU of 72.7% on the Cityscapes\ntest set at a speed of 319.5 FPS on one NVIDIA GeForce GTX 1080 Ti card, which\nexceeds the latest methods of a speed of 69.5 FPS and 0.8% mIoU. The code and\ntrained models are publicly available.\n","authors":["Haoran Wei","Xu Liu","Shouchun Xu","Zhongjian Dai","Yaping Dai","Xiangyang Xu"],"pdf_url":"https://arxiv.org/pdf/2212.01173v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2211.02982v3","updated":"2023-09-13T14:49:23Z","published":"2022-11-05T22:06:50Z","title":"Event and Entity Extraction from Generated Video Captions","summary":" Annotation of multimedia data by humans is time-consuming and costly, while\nreliable automatic generation of semantic metadata is a major challenge. We\npropose a framework to extract semantic metadata from automatically generated\nvideo captions. As metadata, we consider entities, the entities' properties,\nrelations between entities, and the video category. We employ two\nstate-of-the-art dense video captioning models with masked transformer (MT) and\nparallel decoding (PVDC) to generate captions for videos of the ActivityNet\nCaptions dataset. Our experiments show that it is possible to extract entities,\ntheir properties, relations between entities, and the video category from the\ngenerated captions. We observe that the quality of the extracted information is\nmainly influenced by the quality of the event localization in the video as well\nas the performance of the event caption generation.\n","authors":["Johannes Scherer","Ansgar Scherp","Deepayan Bhowmik"],"pdf_url":"https://arxiv.org/pdf/2211.02982v3.pdf","comment":"Paper accepted at CD-MAKE 2023"},{"id":"http://arxiv.org/abs/2305.06695v3","updated":"2023-09-13T14:39:32Z","published":"2023-05-11T10:04:27Z","title":"Deep Visual-Genetic Biometrics for Taxonomic Classification of Rare\n Species","summary":" Visual as well as genetic biometrics are routinely employed to identify\nspecies and individuals in biological applications. However, no attempts have\nbeen made in this domain to computationally enhance visual classification of\nrare classes with little image data via genetics. In this paper, we thus\npropose aligned visual-genetic inference spaces with the aim to implicitly\nencode cross-domain associations for improved performance. We demonstrate for\nthe first time that such alignment can be achieved via deep embedding models\nand that the approach is directly applicable to boosting long-tailed\nrecognition (LTR) particularly for rare species. We experimentally demonstrate\nthe efficacy of the concept via application to microscopic imagery of 30k+\nplanktic foraminifer shells across 32 species when used together with\nindependent genetic data samples. Most importantly for practitioners, we show\nthat visual-genetic alignment can significantly benefit visual-only recognition\nof the rarest species. Technically, we pre-train a visual ResNet50 deep\nlearning model using triplet loss formulations to create an initial embedding\nspace. We re-structure this space based on genetic anchors embedded via a\nSequence Graph Transform (SGT) and linked to visual data by cross-domain cosine\nalignment. 
We show that an LTR approach improves the state-of-the-art across\nall benchmarks and that adding our visual-genetic alignment improves per-class\nand particularly rare tail class benchmarks significantly further. We conclude\nthat visual-genetic alignment can be a highly effective tool for complementing\nvisual biological data containing rare classes. The concept proposed may serve\nas an important future tool for integrating genetics and imageomics towards a\nmore complete scientific representation of taxonomic spaces and life itself.\nCode, weights, and data splits are published for full reproducibility.\n","authors":["Tayfun Karaderi","Tilo Burghardt","Raphael Morard","Daniela Schmidt"],"pdf_url":"https://arxiv.org/pdf/2305.06695v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.09285v2","updated":"2023-09-13T14:27:42Z","published":"2023-08-18T04:05:18Z","title":"RFDforFin: Robust Deep Forgery Detection for GAN-generated Fingerprint\n Images","summary":" With the rapid development of the image generation technologies, the\nmalicious abuses of the GAN-generated fingerprint images poses a significant\nthreat to the public safety in certain circumstances. Although the existing\nuniversal deep forgery detection approach can be applied to detect the fake\nfingerprint images, they are easily attacked and have poor robustness.\nMeanwhile, there is no specifically designed deep forgery detection method for\nfingerprint images. In this paper, we propose the first deep forgery detection\napproach for fingerprint images, which combines unique ridge features of\nfingerprint and generation artifacts of the GAN-generated images, to the best\nof our knowledge. Specifically, we firstly construct a ridge stream, which\nexploits the grayscale variations along the ridges to extract unique\nfingerprint-specific features. Then, we construct a generation artifact stream,\nin which the FFT-based spectrums of the input fingerprint images are exploited,\nto extract more robust generation artifact features. At last, the unique ridge\nfeatures and generation artifact features are fused for binary classification\n(i.e., real or fake). Comprehensive experiments demonstrate that our proposed\napproach is effective and robust with low complexities.\n","authors":["Hui Miao","Yuanfang Guo","Yunhong Wang"],"pdf_url":"https://arxiv.org/pdf/2308.09285v2.pdf","comment":"10 pages, 8 figures"},{"id":"http://arxiv.org/abs/2309.06987v1","updated":"2023-09-13T14:26:03Z","published":"2023-09-13T14:26:03Z","title":"Instance Adaptive Prototypical Contrastive Embedding for Generalized\n Zero Shot Learning","summary":" Generalized zero-shot learning(GZSL) aims to classify samples from seen and\nunseen labels, assuming unseen labels are not accessible during training.\nRecent advancements in GZSL have been expedited by incorporating\ncontrastive-learning-based (instance-based) embedding in generative networks\nand leveraging the semantic relationship between data points. However, existing\nembedding architectures suffer from two limitations: (1) limited\ndiscriminability of synthetic features' embedding without considering\nfine-grained cluster structures; (2) inflexible optimization due to restricted\nscaling mechanisms on existing contrastive embedding networks, leading to\noverlapped representations in the embedding space. 
To enhance the quality of\nrepresentations in the embedding space, as mentioned in (1), we propose a\nmargin-based prototypical contrastive learning embedding network that reaps the\nbenefits of prototype-data (cluster quality enhancement) and implicit data-data\n(fine-grained representations) interaction while providing substantial cluster\nsupervision to the embedding network and the generator. To tackle (2), we\npropose an instance adaptive contrastive loss that leads to generalized\nrepresentations for unseen labels with increased inter-class margin. Through\ncomprehensive experimental evaluation, we show that our method can outperform\nthe current state-of-the-art on three benchmark datasets. Our approach also\nconsistently achieves the best unseen performance in the GZSL setting.\n","authors":["Riti Paul","Sahil Vora","Baoxin Li"],"pdf_url":"https://arxiv.org/pdf/2309.06987v1.pdf","comment":"7 pages, 4 figures. Accepted in IJCAI 2023 Workshop on Generalizing\n from Limited Resources in the Open World"},{"id":"http://arxiv.org/abs/2309.06978v1","updated":"2023-09-13T14:13:08Z","published":"2023-09-13T14:13:08Z","title":"Differentiable JPEG: The Devil is in the Details","summary":" JPEG remains one of the most widespread lossy image coding methods. However,\nthe non-differentiable nature of JPEG restricts the application in deep\nlearning pipelines. Several differentiable approximations of JPEG have recently\nbeen proposed to address this issue. This paper conducts a comprehensive review\nof existing diff. JPEG approaches and identifies critical details that have\nbeen missed by previous methods. To this end, we propose a novel diff. JPEG\napproach, overcoming previous limitations. Our approach is differentiable\nw.r.t. the input image, the JPEG quality, the quantization tables, and the\ncolor conversion parameters. We evaluate the forward and backward performance\nof our diff. JPEG approach against existing methods. Additionally, extensive\nablations are performed to evaluate crucial design choices. Our proposed diff.\nJPEG resembles the (non-diff.) reference implementation best, significantly\nsurpassing the recent-best diff. approach by $3.47$dB (PSNR) on average. For\nstrong compression rates, we can even improve PSNR by $9.51$dB. Strong\nadversarial attack results are yielded by our diff. JPEG, demonstrating the\neffective gradient approximation. Our code is available at\nhttps://github.com/necla-ml/Diff-JPEG.\n","authors":["Christoph Reich","Biplob Debnath","Deep Patel","Srimat Chakradhar"],"pdf_url":"https://arxiv.org/pdf/2309.06978v1.pdf","comment":"Accepted at WACV 2024. Project page:\n https://christophreich1996.github.io/differentiable_jpeg/"},{"id":"http://arxiv.org/abs/2309.06961v1","updated":"2023-09-13T13:54:32Z","published":"2023-09-13T13:54:32Z","title":"Towards Reliable Dermatology Evaluation Benchmarks","summary":" Benchmark datasets for digital dermatology unwittingly contain inaccuracies\nthat reduce trust in model performance estimates. We propose a\nresource-efficient data cleaning protocol to identify issues that escaped\nprevious curation. The protocol leverages an existing algorithmic cleaning\nstrategy and is followed by a confirmation process terminated by an intuitive\nstopping criterion. Based on confirmation by multiple dermatologists, we remove\nirrelevant samples and near duplicates and estimate the percentage of label\nerrors in six dermatology image datasets for model evaluation promoted by the\nInternational Skin Imaging Collaboration. 
Along with this paper, we publish\nrevised file lists for each dataset which should be used for model evaluation.\nOur work paves the way for more trustworthy performance assessment in digital\ndermatology.\n","authors":["Fabian Gröger","Simone Lionetti","Philippe Gottfrois","Alvaro Gonzalez-Jimenez","Matthew Groh","Roxana Daneshjou","Labelling Consortium","Alexander A. Navarini","Marc Pouly"],"pdf_url":"https://arxiv.org/pdf/2309.06961v1.pdf","comment":"Link to the revised file lists:\n https://github.com/Digital-Dermatology/SelfClean-Revised-Benchmarks"},{"id":"http://arxiv.org/abs/2309.06958v1","updated":"2023-09-13T13:47:52Z","published":"2023-09-13T13:47:52Z","title":"Neural network-based coronary dominance classification of RCA angiograms","summary":" Background. Cardiac dominance classification is essential for SYNTAX score\nestimation, which is a tool used to determine the complexity of coronary artery\ndisease and guide patient selection toward optimal revascularization strategy.\nObjectives. Cardiac dominance classification algorithm based on the analysis of\nright coronary artery (RCA) angiograms using neural network Method. We employed\nconvolutional neural network ConvNext and Swin transformer for 2D image\n(frames) classification, along with a majority vote for cardio angiographic\nview classification. An auxiliary network was also used to detect irrelevant\nimages which were then excluded from the data set. Our data set consisted of\n828 angiographic studies, 192 of them being patients with left dominance.\nResults. 5-fold cross validation gave the following dominance classification\nmetrics (p=95%): macro recall=93.1%, accuracy=93.5%, macro F1=89.2%. The most\ncommon case in which the model regularly failed was RCA occlusion, as it\nrequires utilization of LCA information. Another cause for false prediction is\na small diameter combined with poor quality cardio angiographic view. In such\ncases, cardiac dominance classification can be complex and may require\ndiscussion among specialists to reach an accurate conclusion. Conclusion. The\nuse of machine learning approaches to classify cardiac dominance based on RCA\nalone has been shown to be successful with satisfactory accuracy. However, for\nhigher accuracy, it is necessary to utilize LCA information in the case of an\noccluded RCA and detect cases where there is high uncertainty.\n","authors":["Ivan Kruzhilov","Egor Ikryannikov","Artem Shadrin","Ruslan Utegenov","Galina Zubkova","Ivan Bessonov"],"pdf_url":"https://arxiv.org/pdf/2309.06958v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.06951v1","updated":"2023-09-13T13:34:22Z","published":"2023-09-13T13:34:22Z","title":"TransNet: A Transfer Learning-Based Network for Human Action Recognition","summary":" Human action recognition (HAR) is a high-level and significant research area\nin computer vision due to its ubiquitous applications. The main limitations of\nthe current HAR models are their complex structures and lengthy training time.\nIn this paper, we propose a simple yet versatile and effective end-to-end deep\nlearning architecture, coined as TransNet, for HAR. TransNet decomposes the\ncomplex 3D-CNNs into 2D- and 1D-CNNs, where the 2D- and 1D-CNN components\nextract spatial features and temporal patterns in videos, respectively.\nBenefiting from its concise architecture, TransNet is ideally compatible with\nany pretrained state-of-the-art 2D-CNN models in other fields, being\ntransferred to serve the HAR task. 
In other words, it naturally leverages the\npower and success of transfer learning for HAR, bringing huge advantages in\nterms of efficiency and effectiveness. Extensive experimental results and the\ncomparison with the state-of-the-art models demonstrate the superior\nperformance of the proposed TransNet in HAR in terms of flexibility, model\ncomplexity, training speed and classification accuracy.\n","authors":["K. Alomar","X. Cai"],"pdf_url":"https://arxiv.org/pdf/2309.06951v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.11909v4","updated":"2023-09-13T13:32:48Z","published":"2023-08-23T04:29:40Z","title":"Edge-aware Hard Clustering Graph Pooling for Brain Imaging Data","summary":" Graph Convolutional Networks (GCNs) can capture non-Euclidean spatial\ndependence between different brain regions, and the graph pooling operator in\nGCNs is key to enhancing the representation learning capability and acquiring\nabnormal brain maps. However, the majority of existing research designs graph\npooling operators only from the perspective of nodes while disregarding the\noriginal edge features, in a way that not only confines graph pooling\napplication scenarios, but also diminishes its ability to capture critical\nsubstructures. In this study, a clustering graph pooling method that first\nsupports multidimensional edge features, called Edge-aware hard clustering\ngraph pooling (EHCPool), is developed. EHCPool proposes the first\n'Edge-to-node' score evaluation criterion based on edge features to assess node\nfeature significance. To more effectively capture the critical subgraphs, a\nnovel Iteration n-top strategy is further designed to adaptively learn sparse\nhard clustering assignments for graphs. Subsequently, an innovative N-E\nAggregation strategy is presented to aggregate node and edge feature\ninformation in each independent subgraph. The proposed model was evaluated on\nmulti-site brain imaging public datasets and yielded state-of-the-art\nperformance. We believe this method is the first deep learning tool with the\npotential to probe different types of abnormal functional brain networks from\ndata-driven perspective. Core code is at: https://github.com/swfen/EHCPool.\n","authors":["Cheng Zhu","Jiayi Zhu","Lijuan Zhang","Xi Wu","Shuqi Yang","Ping Liang","Honghan Chen","Ying Tan"],"pdf_url":"https://arxiv.org/pdf/2308.11909v4.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.06948v1","updated":"2023-09-13T13:31:00Z","published":"2023-09-13T13:31:00Z","title":"Limited-Angle Tomography Reconstruction via Deep End-To-End Learning on\n Synthetic Data","summary":" Computed tomography (CT) has become an essential part of modern science and\nmedicine. A CT scanner consists of an X-ray source that is spun around an\nobject of interest. On the opposite end of the X-ray source, a detector\ncaptures X-rays that are not absorbed by the object. The reconstruction of an\nimage is a linear inverse problem, which is usually solved by filtered back\nprojection. However, when the number of measurements is small, the\nreconstruction problem is ill-posed. This is for example the case when the\nX-ray source is not spun completely around the object, but rather irradiates\nthe object only from a limited angle. To tackle this problem, we present a deep\nneural network that is trained on a large amount of carefully-crafted synthetic\ndata and can perform limited-angle tomography reconstruction even for only\n30{\\deg} or 40{\\deg} sinograms. 
With our approach we won the first place in the\nHelsinki Tomography Challenge 2022.\n","authors":["Thomas Germer","Jan Robine","Sebastian Konietzny","Stefan Harmeling","Tobias Uelwer"],"pdf_url":"https://arxiv.org/pdf/2309.06948v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.06941v1","updated":"2023-09-13T13:24:27Z","published":"2023-09-13T13:24:27Z","title":"DEFormer: DCT-driven Enhancement Transformer for Low-light Image and\n Dark Vision","summary":" The goal of low-light image enhancement is to restore the color and details\nof the image and is of great significance for high-level visual tasks in\nautonomous driving. However, it is difficult to restore the lost details in the\ndark area by relying only on the RGB domain. In this paper we introduce\nfrequency as a new clue into the network and propose a novel DCT-driven\nenhancement transformer (DEFormer). First, we propose a learnable frequency\nbranch (LFB) for frequency enhancement contains DCT processing and\ncurvature-based frequency enhancement (CFE). CFE calculates the curvature of\neach channel to represent the detail richness of different frequency bands,\nthen we divides the frequency features, which focuses on frequency bands with\nricher textures. In addition, we propose a cross domain fusion (CDF) for\nreducing the differences between the RGB domain and the frequency domain. We\nalso adopt DEFormer as a preprocessing in dark detection, DEFormer effectively\nimproves the performance of the detector, bringing 2.1% and 3.4% improvement in\nExDark and DARK FACE datasets on mAP respectively.\n","authors":["Xiangchen Yin","Zhenda Yu","Xin Gao","Ran Ju","Xiao Sun","Xinyu Zhang"],"pdf_url":"https://arxiv.org/pdf/2309.06941v1.pdf","comment":"submit to ICRA2024"},{"id":"http://arxiv.org/abs/2309.06933v1","updated":"2023-09-13T13:13:29Z","published":"2023-09-13T13:13:29Z","title":"DreamStyler: Paint by Style Inversion with Text-to-Image Diffusion\n Models","summary":" Recent progresses in large-scale text-to-image models have yielded remarkable\naccomplishments, finding various applications in art domain. However,\nexpressing unique characteristics of an artwork (e.g. brushwork, colortone, or\ncomposition) with text prompts alone may encounter limitations due to the\ninherent constraints of verbal description. To this end, we introduce\nDreamStyler, a novel framework designed for artistic image synthesis,\nproficient in both text-to-image synthesis and style transfer. DreamStyler\noptimizes a multi-stage textual embedding with a context-aware text prompt,\nresulting in prominent image quality. In addition, with content and style\nguidance, DreamStyler exhibits flexibility to accommodate a range of style\nreferences. Experimental results demonstrate its superior performance across\nmultiple scenarios, suggesting its promising potential in artistic product\ncreation.\n","authors":["Namhyuk Ahn","Junsoo Lee","Chunggi Lee","Kunhee Kim","Daesik Kim","Seung-Hun Nam","Kibeom Hong"],"pdf_url":"https://arxiv.org/pdf/2309.06933v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.06928v1","updated":"2023-09-13T12:58:09Z","published":"2023-09-13T12:58:09Z","title":"Dynamic Causal Disentanglement Model for Dialogue Emotion Detection","summary":" Emotion detection is a critical technology extensively employed in diverse\nfields. 
While the incorporation of commonsense knowledge has proven beneficial\nfor existing emotion detection methods, dialogue-based emotion detection\nencounters numerous difficulties and challenges due to human agency and the\nvariability of dialogue content.In dialogues, human emotions tend to accumulate\nin bursts. However, they are often implicitly expressed. This implies that many\ngenuine emotions remain concealed within a plethora of unrelated words and\ndialogues.In this paper, we propose a Dynamic Causal Disentanglement Model\nbased on hidden variable separation, which is founded on the separation of\nhidden variables. This model effectively decomposes the content of dialogues\nand investigates the temporal accumulation of emotions, thereby enabling more\nprecise emotion recognition. First, we introduce a novel Causal Directed\nAcyclic Graph (DAG) to establish the correlation between hidden emotional\ninformation and other observed elements. Subsequently, our approach utilizes\npre-extracted personal attributes and utterance topics as guiding factors for\nthe distribution of hidden variables, aiming to separate irrelevant ones.\nSpecifically, we propose a dynamic temporal disentanglement model to infer the\npropagation of utterances and hidden variables, enabling the accumulation of\nemotion-related information throughout the conversation. To guide this\ndisentanglement process, we leverage the ChatGPT-4.0 and LSTM networks to\nextract utterance topics and personal attributes as observed\ninformation.Finally, we test our approach on two popular datasets in dialogue\nemotion detection and relevant experimental results verified the model's\nsuperiority.\n","authors":["Yuting Su","Yichen Wei","Weizhi Nie","Sicheng Zhao","Anan Liu"],"pdf_url":"https://arxiv.org/pdf/2309.06928v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.06924v1","updated":"2023-09-13T12:50:21Z","published":"2023-09-13T12:50:21Z","title":"Contrast-Phys+: Unsupervised and Weakly-supervised Video-based Remote\n Physiological Measurement via Spatiotemporal Contrast","summary":" Video-based remote physiological measurement utilizes facial videos to\nmeasure the blood volume change signal, which is also called remote\nphotoplethysmography (rPPG). Supervised methods for rPPG measurements have been\nshown to achieve good performance. However, the drawback of these methods is\nthat they require facial videos with ground truth (GT) physiological signals,\nwhich are often costly and difficult to obtain. In this paper, we propose\nContrast-Phys+, a method that can be trained in both unsupervised and\nweakly-supervised settings. We employ a 3DCNN model to generate multiple\nspatiotemporal rPPG signals and incorporate prior knowledge of rPPG into a\ncontrastive loss function. We further incorporate the GT signals into\ncontrastive learning to adapt to partial or misaligned labels. The contrastive\nloss encourages rPPG/GT signals from the same video to be grouped together,\nwhile pushing those from different videos apart. We evaluate our methods on\nfive publicly available datasets that include both RGB and Near-infrared\nvideos. Contrast-Phys+ outperforms the state-of-the-art supervised methods,\neven when using partially available or misaligned GT signals, or no labels at\nall. 
Additionally, we highlight the advantages of our methods in terms of\ncomputational efficiency, noise robustness, and generalization.\n","authors":["Zhaodong Sun","Xiaobai Li"],"pdf_url":"https://arxiv.org/pdf/2309.06924v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.06922v1","updated":"2023-09-13T12:46:06Z","published":"2023-09-13T12:46:06Z","title":"Hydra: Multi-head Low-rank Adaptation for Parameter Efficient\n Fine-tuning","summary":" The recent surge in large-scale foundation models has spurred the development\nof efficient methods for adapting these models to various downstream tasks.\nLow-rank adaptation methods, such as LoRA, have gained significant attention\ndue to their outstanding parameter efficiency and no additional inference\nlatency. This paper investigates a more general form of adapter module based on\nthe analysis that parallel and sequential adaptation branches learn novel and\ngeneral features during fine-tuning, respectively. The proposed method, named\nHydra, due to its multi-head computational branches, combines parallel and\nsequential branch to integrate capabilities, which is more expressive than\nexisting single branch methods and enables the exploration of a broader range\nof optimal points in the fine-tuning process. In addition, the proposed\nadaptation method explicitly leverages the pre-trained weights by performing a\nlinear combination of the pre-trained features. It allows the learned features\nto have better generalization performance across diverse downstream tasks.\nFurthermore, we perform a comprehensive analysis of the characteristics of each\nadaptation branch with empirical evidence. Through an extensive range of\nexperiments, encompassing comparisons and ablation studies, we substantiate the\nefficiency and demonstrate the superior performance of Hydra. This\ncomprehensive evaluation underscores the potential impact and effectiveness of\nHydra in a variety of applications. Our code is available on\n\\url{https://github.com/extremebird/Hydra}\n","authors":["Sanghyeon Kim","Hyunmo Yang","Younghyun Kim","Youngjoon Hong","Eunbyung Park"],"pdf_url":"https://arxiv.org/pdf/2309.06922v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2211.10865v3","updated":"2023-09-13T12:37:51Z","published":"2022-11-20T04:21:42Z","title":"IC3D: Image-Conditioned 3D Diffusion for Shape Generation","summary":" In recent years, Denoising Diffusion Probabilistic Models (DDPMs) have\ndemonstrated exceptional performance in various 2D generative tasks. Following\nthis success, DDPMs have been extended to 3D shape generation, surpassing\nprevious methodologies in this domain. While many of these models are\nunconditional, some have explored the potential of using guidance from\ndifferent modalities. In particular, image guidance for 3D generation has been\nexplored through the utilization of CLIP embeddings. However, these embeddings\nare designed to align images and text, and do not necessarily capture the\nspecific details needed for shape generation. To address this limitation and\nenhance image-guided 3D DDPMs with augmented 3D understanding, we introduce\nCISP (Contrastive Image-Shape Pre-training), obtaining a well-structured\nimage-shape joint embedding space. Building upon CISP, we then introduce IC3D,\na DDPM that harnesses CISP's guidance for 3D shape generation from single-view\nimages. This generative diffusion model outperforms existing benchmarks in both\nquality and diversity of generated 3D shapes. 
Moreover, despite IC3D's\ngenerative nature, its generated shapes are preferred by human evaluators over\na competitive single-view 3D reconstruction model. These properties contribute\nto a coherent embedding space, enabling latent interpolation and conditioned\ngeneration also from out-of-distribution images. We find IC3D able to generate\ncoherent and diverse completions also when presented with occluded views,\nrendering it applicable in controlled real-world scenarios.\n","authors":["Cristian Sbrolli","Paolo Cudrano","Matteo Frosi","Matteo Matteucci"],"pdf_url":"https://arxiv.org/pdf/2211.10865v3.pdf","comment":"9 pages, 10 figures; appendix 6 pages, 4 figures"},{"id":"http://arxiv.org/abs/2309.06902v1","updated":"2023-09-13T12:00:33Z","published":"2023-09-13T12:00:33Z","title":"CCSPNet-Joint: Efficient Joint Training Method for Traffic Sihn\n Detection Under Extreme Conditions","summary":" Traffic sign detection is an important research direction in intelligent\ndriving. Unfortunately, existing methods often overlook extreme conditions such\nas fog, rain, and motion blur. Moreover, the end-to-end training strategy for\nimage denoising and object detection models fails to utilize inter-model\ninformation effectively. To address these issues, we propose CCSPNet, an\nefficient feature extraction module based on Transformers and CNNs, which\neffectively leverages contextual information, achieves faster inference speed\nand provides stronger feature enhancement capabilities. Furthermore, we\nestablish the correlation between object detection and image denoising tasks\nand propose a joint training model, CCSPNet-Joint, to improve data efficiency\nand generalization. Finally, to validate our approach, we create the CCTSDB-AUG\ndataset for traffic sign detection in extreme scenarios. Extensive experiments\nhave shown that CCSPNet achieves state-of-the-art performance in traffic sign\ndetection under extreme conditions. Compared to end-to-end methods,\nCCSPNet-Joint achieves a 5.32% improvement in precision and an 18.09%\nimprovement in mAP@.5.\n","authors":["Haoqin Hong","Yue Zhou","Xiangyu Shu","Xiangfang Hu"],"pdf_url":"https://arxiv.org/pdf/2309.06902v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2305.15701v2","updated":"2023-09-13T11:52:30Z","published":"2023-05-25T04:19:14Z","title":"Action Sensitivity Learning for Temporal Action Localization","summary":" Temporal action localization (TAL), which involves recognizing and locating\naction instances, is a challenging task in video understanding. Most existing\napproaches directly predict action classes and regress offsets to boundaries,\nwhile overlooking the discrepant importance of each frame. In this paper, we\npropose an Action Sensitivity Learning framework (ASL) to tackle this task,\nwhich aims to assess the value of each frame and then leverage the generated\naction sensitivity to recalibrate the training procedure. We first introduce a\nlightweight Action Sensitivity Evaluator to learn the action sensitivity at the\nclass level and instance level, respectively. The outputs of the two branches\nare combined to reweight the gradient of the two sub-tasks. Moreover, based on\nthe action sensitivity of each frame, we design an Action Sensitive Contrastive\nLoss to enhance features, where the action-aware frames are sampled as positive\npairs to push away the action-irrelevant frames. 
The extensive studies on\nvarious action localization benchmarks (i.e., MultiThumos, Charades,\nEgo4D-Moment Queries v1.0, Epic-Kitchens 100, Thumos14 and ActivityNet1.3) show\nthat ASL surpasses the state-of-the-art in terms of average-mAP under multiple\ntypes of scenarios, e.g., single-labeled, densely-labeled and egocentric.\n","authors":["Jiayi Shao","Xiaohan Wang","Ruijie Quan","Junjun Zheng","Jiang Yang","Yi Yang"],"pdf_url":"https://arxiv.org/pdf/2305.15701v2.pdf","comment":"Accepted to ICCV 2023"},{"id":"http://arxiv.org/abs/2309.06895v1","updated":"2023-09-13T11:37:04Z","published":"2023-09-13T11:37:04Z","title":"MagiCapture: High-Resolution Multi-Concept Portrait Customization","summary":" Large-scale text-to-image models including Stable Diffusion are capable of\ngenerating high-fidelity photorealistic portrait images. There is an active\nresearch area dedicated to personalizing these models, aiming to synthesize\nspecific subjects or styles using provided sets of reference images. However,\ndespite the plausible results from these personalization methods, they tend to\nproduce images that often fall short of realism and are not yet on a\ncommercially viable level. This is particularly noticeable in portrait image\ngeneration, where any unnatural artifact in human faces is easily discernible\ndue to our inherent human bias. To address this, we introduce MagiCapture, a\npersonalization method for integrating subject and style concepts to generate\nhigh-resolution portrait images using just a few subject and style references.\nFor instance, given a handful of random selfies, our fine-tuned model can\ngenerate high-quality portrait images in specific styles, such as passport or\nprofile photos. The main challenge with this task is the absence of ground\ntruth for the composed concepts, leading to a reduction in the quality of the\nfinal output and an identity shift of the source subject. To address these\nissues, we present a novel Attention Refocusing loss coupled with auxiliary\npriors, both of which facilitate robust learning within this weakly supervised\nlearning setting. Our pipeline also includes additional post-processing steps\nto ensure the creation of highly realistic outputs. MagiCapture outperforms\nother baselines in both quantitative and qualitative evaluations and can also\nbe generalized to other non-human objects.\n","authors":["Junha Hyung","Jaeyo Shin","Jaegul Choo"],"pdf_url":"https://arxiv.org/pdf/2309.06895v1.pdf","comment":"8 pages, 7 figures"},{"id":"http://arxiv.org/abs/2309.06891v1","updated":"2023-09-13T11:28:27Z","published":"2023-09-13T11:28:27Z","title":"Keep It SimPool: Who Said Supervised Transformers Suffer from Attention\n Deficit?","summary":" Convolutional networks and vision transformers have different forms of\npairwise interactions, pooling across layers and pooling at the end of the\nnetwork. Does the latter really need to be different? As a by-product of\npooling, vision transformers provide spatial attention for free, but this is\nmost often of low quality unless self-supervised, which is not well studied. Is\nsupervision really the problem?\n In this work, we develop a generic pooling framework and then we formulate a\nnumber of existing methods as instantiations. By discussing the properties of\neach group of methods, we derive SimPool, a simple attention-based pooling\nmechanism as a replacement of the default one for both convolutional and\ntransformer encoders. 
We find that, whether supervised or self-supervised, this\nimproves performance on pre-training and downstream tasks and provides\nattention maps delineating object boundaries in all cases. One could thus call\nSimPool universal. To our knowledge, we are the first to obtain attention maps\nin supervised transformers of at least as good quality as self-supervised,\nwithout explicit losses or modifying the architecture. Code at:\nhttps://github.com/billpsomas/simpool.\n","authors":["Bill Psomas","Ioannis Kakogeorgiou","Konstantinos Karantzalos","Yannis Avrithis"],"pdf_url":"https://arxiv.org/pdf/2309.06891v1.pdf","comment":"ICCV 2023. Code and models: https://github.com/billpsomas/simpool"},{"id":"http://arxiv.org/abs/2309.06884v1","updated":"2023-09-13T11:18:15Z","published":"2023-09-13T11:18:15Z","title":"Manufacturing Quality Control with Autoencoder-Based Defect Localization\n and Unsupervised Class Selection","summary":" Manufacturing industries require efficient and voluminous production of\nhigh-quality finished goods. In the context of Industry 4.0, visual anomaly\ndetection poses an optimistic solution for automatically controlling product\nquality with high precision. Automation based on computer vision poses a\npromising solution to prevent bottlenecks at the product quality checkpoint. We\nconsidered recent advancements in machine learning to improve visual defect\nlocalization, but challenges persist in obtaining a balanced feature set and\ndatabase of the wide variety of defects occurring in the production line. This\npaper proposes a defect localizing autoencoder with unsupervised class\nselection by clustering with k-means the features extracted from a pre-trained\nVGG-16 network. The selected classes of defects are augmented with natural wild\ntextures to simulate artificial defects. The study demonstrates the\neffectiveness of the defect localizing autoencoder with unsupervised class\nselection for improving defect detection in manufacturing industries. The\nproposed methodology shows promising results with precise and accurate\nlocalization of quality defects on melamine-faced boards for the furniture\nindustry. Incorporating artificial defects into the training data shows\nsignificant potential for practical implementation in real-world quality\ncontrol scenarios.\n","authors":["Devang Mehta","Noah Klarmann"],"pdf_url":"https://arxiv.org/pdf/2309.06884v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.06882v1","updated":"2023-09-13T11:16:52Z","published":"2023-09-13T11:16:52Z","title":"ProMap: Datasets for Product Mapping in E-commerce","summary":" The goal of product mapping is to decide, whether two listings from two\ndifferent e-shops describe the same products. Existing datasets of matching and\nnon-matching pairs of products, however, often suffer from incomplete product\ninformation or contain only very distant non-matching products. Therefore,\nwhile predictive models trained on these datasets achieve good results on them,\nin practice, they are unusable as they cannot distinguish very similar but\nnon-matching pairs of products. This paper introduces two new datasets for\nproduct mapping: ProMapCz consisting of 1,495 Czech product pairs and ProMapEn\nconsisting of 1,555 English product pairs of matching and non-matching products\nmanually scraped from two pairs of e-shops. 
The datasets contain both images\nand textual descriptions of the products, including their specifications,\nmaking them one of the most complete datasets for product mapping.\nAdditionally, the non-matching products were selected in two phases, creating\ntwo types of non-matches -- close non-matches and medium non-matches. Even the\nmedium non-matches are pairs of products that are much more similar than\nnon-matches in other datasets -- for example, they still need to have the same\nbrand and similar name and price. After simple data preprocessing, several\nmachine learning algorithms were trained on these and two the other datasets to\ndemonstrate the complexity and completeness of ProMap datasets. ProMap datasets\nare presented as a golden standard for further research of product mapping\nfilling the gaps in existing ones.\n","authors":["Kateřina Macková","Martin Pilát"],"pdf_url":"https://arxiv.org/pdf/2309.06882v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2306.16846v2","updated":"2023-09-13T11:11:34Z","published":"2023-06-29T10:37:29Z","title":"Degree-Controllable Lightweight Fast Style Transfer with Detail\n Attention-enhanced","summary":" Style transfer methods usually use pre-trained VGG or more complex models as\nencoders to achieve better effects. This leads to extremely slow processing of\nhigh-resolution images. To solve the problem, we propose an degree-controllable\ndetail attention-enhanced lightweight fast style transfer (DcDaeLFST), which\nadopts a small, shallow, and compact architecture for efficient forward\ninference. Additionally, our exploit a global semantic invariance loss to\npreserve the semantic and structural information of content images, and a local\ndetail attention-enhanced module to preserve the detail information of them,\ntogether with a style discriminator. Despite limited parameters, it can achieve\noverall better style matching performance. Most importantly, it is the first\nmethod that can control the degree of detail retention and style transfer based\non subjective evaluation. In comparative experiments, our model is 17-250 times\nsmaller and 0.26-6.5 times faster than other state-of-the-art models, with the\nfastest processing speed of 0.38s on 4K high-resolution images.\n","authors":["Jiang Shi Qi"],"pdf_url":"https://arxiv.org/pdf/2306.16846v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.06877v1","updated":"2023-09-13T10:53:12Z","published":"2023-09-13T10:53:12Z","title":"Video Infringement Detection via Feature Disentanglement and Mutual\n Information Maximization","summary":" The self-media era provides us tremendous high quality videos. Unfortunately,\nfrequent video copyright infringements are now seriously damaging the interests\nand enthusiasm of video creators. Identifying infringing videos is therefore a\ncompelling task. Current state-of-the-art methods tend to simply feed\nhigh-dimensional mixed video features into deep neural networks and count on\nthe networks to extract useful representations. Despite its simplicity, this\nparadigm heavily relies on the original entangled features and lacks\nconstraints guaranteeing that useful task-relevant semantics are extracted from\nthe features.\n In this paper, we seek to tackle the above challenges from two aspects: (1)\nWe propose to disentangle an original high-dimensional feature into multiple\nsub-features, explicitly disentangling the feature into exclusive\nlower-dimensional components. 
We expect the sub-features to encode\nnon-overlapping semantics of the original feature and remove redundant\ninformation.\n (2) On top of the disentangled sub-features, we further learn an auxiliary\nfeature to enhance the sub-features. We theoretically analyzed the mutual\ninformation between the label and the disentangled features, arriving at a loss\nthat maximizes the extraction of task-relevant information from the original\nfeature.\n Extensive experiments on two large-scale benchmark datasets (i.e., SVD and\nVCSL) demonstrate that our method achieves 90.1% TOP-100 mAP on the large-scale\nSVD dataset and also sets the new state-of-the-art on the VCSL benchmark\ndataset. Our code and model have been released at\nhttps://github.com/yyyooooo/DMI/, hoping to contribute to the community.\n","authors":["Zhenguang Liu","Xinyang Yu","Ruili Wang","Shuai Ye","Zhe Ma","Jianfeng Dong","Sifeng He","Feng Qian","Xiaobo Zhang","Roger Zimmermann","Lei Yang"],"pdf_url":"https://arxiv.org/pdf/2309.06877v1.pdf","comment":"This paper is accepted by ACM MM 2023"},{"id":"http://arxiv.org/abs/2309.05282v2","updated":"2023-09-13T10:38:28Z","published":"2023-09-11T07:37:10Z","title":"Can you text what is happening? Integrating pre-trained language\n encoders into trajectory prediction models for autonomous driving","summary":" In autonomous driving tasks, scene understanding is the first step towards\npredicting the future behavior of the surrounding traffic participants. Yet,\nhow to represent a given scene and extract its features are still open research\nquestions. In this study, we propose a novel text-based representation of\ntraffic scenes and process it with a pre-trained language encoder.\n First, we show that text-based representations, combined with classical\nrasterized image representations, lead to descriptive scene embeddings. Second,\nwe benchmark our predictions on the nuScenes dataset and show significant\nimprovements compared to baselines. Third, we show in an ablation study that a\njoint encoder of text and rasterized images outperforms the individual encoders\nconfirming that both representations have their complementary strengths.\n","authors":["Ali Keysan","Andreas Look","Eitan Kosman","Gonca Gürsun","Jörg Wagner","Yu Yao","Barbara Rakitsch"],"pdf_url":"https://arxiv.org/pdf/2309.05282v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.06828v1","updated":"2023-09-13T09:22:49Z","published":"2023-09-13T09:22:49Z","title":"UniBrain: Universal Brain MRI Diagnosis with Hierarchical\n Knowledge-enhanced Pre-training","summary":" Magnetic resonance imaging~(MRI) have played a crucial role in brain disease\ndiagnosis, with which a range of computer-aided artificial intelligence methods\nhave been proposed. However, the early explorations usually focus on the\nlimited types of brain diseases in one study and train the model on the data in\na small scale, yielding the bottleneck of generalization. Towards a more\neffective and scalable paradigm, we propose a hierarchical knowledge-enhanced\npre-training framework for the universal brain MRI diagnosis, termed as\nUniBrain. Specifically, UniBrain leverages a large-scale dataset of 24,770\nimaging-report pairs from routine diagnostics. 
Different from previous\npre-training techniques for the unitary vision or textual feature, or with the\nbrute-force alignment between vision and language information, we leverage the\nunique characteristic of report information in different granularity to build a\nhierarchical alignment mechanism, which strengthens the efficiency in feature\nlearning. Our UniBrain is validated on three real world datasets with severe\nclass imbalance and the public BraTS2019 dataset. It not only consistently\noutperforms all state-of-the-art diagnostic methods by a large margin and\nprovides a superior grounding performance but also shows comparable performance\ncompared to expert radiologists on certain disease types.\n","authors":["Jiayu Lei","Lisong Dai","Haoyun Jiang","Chaoyi Wu","Xiaoman Zhang","Yao Zhang","Jiangchao Yao","Weidi Xie","Yanyong Zhang","Yuehua Li","Ya Zhang","Yanfeng Wang"],"pdf_url":"https://arxiv.org/pdf/2309.06828v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.06825v1","updated":"2023-09-13T09:16:19Z","published":"2023-09-13T09:16:19Z","title":"Topology-inspired Cross-domain Network for Developmental Cervical\n Stenosis Quantification","summary":" Developmental Canal Stenosis (DCS) quantification is crucial in cervical\nspondylosis screening. Compared with quantifying DCS manually, a more efficient\nand time-saving manner is provided by deep keypoint localization networks,\nwhich can be implemented in either the coordinate or the image domain. However,\nthe vertebral visualization features often lead to abnormal topological\nstructures during keypoint localization, including keypoint distortion with\nedges and weakly connected structures, which cannot be fully suppressed in\neither the coordinate or image domain alone. To overcome this limitation, a\nkeypoint-edge and a reparameterization modules are utilized to restrict these\nabnormal structures in a cross-domain manner. The keypoint-edge constraint\nmodule restricts the keypoints on the edges of vertebrae, which ensures that\nthe distribution pattern of keypoint coordinates is consistent with those for\nDCS quantification. And the reparameterization module constrains the weakly\nconnected structures in image-domain heatmaps with coordinates combined.\nMoreover, the cross-domain network improves spatial generalization by utilizing\nheatmaps and incorporating coordinates for accurate localization, which avoids\nthe trade-off between these two properties in an individual domain.\nComprehensive results of distinct quantification tasks show the superiority and\ngenerability of the proposed Topology-inspired Cross-domain Network (TCN)\ncompared with other competing localization methods.\n","authors":["Zhenxi Zhang","Yanyang Wang","Yao Wu","Weifei Wu"],"pdf_url":"https://arxiv.org/pdf/2309.06825v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.06824v1","updated":"2023-09-13T09:15:20Z","published":"2023-09-13T09:15:20Z","title":"SAMUS: Adapting Segment Anything Model for Clinically-Friendly and\n Generalizable Ultrasound Image Segmentation","summary":" Segment anything model (SAM), an eminent universal image segmentation model,\nhas recently gathered considerable attention within the domain of medical image\nsegmentation. Despite the remarkable performance of SAM on natural images, it\ngrapples with significant performance degradation and limited generalization\nwhen confronted with medical images, particularly with those involving objects\nof low contrast, faint boundaries, intricate shapes, and diminutive sizes. 
In\nthis paper, we propose SAMUS, a universal model tailored for ultrasound image\nsegmentation. In contrast to previous SAM-based universal models, SAMUS pursues\nnot only better generalization but also lower deployment cost, rendering it\nmore suitable for clinical applications. Specifically, based on SAM, a parallel\nCNN branch is introduced to inject local features into the ViT encoder through\ncross-branch attention for better medical image segmentation. Then, a position\nadapter and a feature adapter are developed to adapt SAM from natural to\nmedical domains and from requiring large-size inputs (1024x1024) to small-size\ninputs (256x256) for more clinical-friendly deployment. A comprehensive\nultrasound dataset, comprising about 30k images and 69k masks and covering six\nobject categories, is collected for verification. Extensive comparison\nexperiments demonstrate SAMUS's superiority against the state-of-the-art\ntask-specific models and universal foundation models under both task-specific\nevaluation and generalization evaluation. Moreover, SAMUS is deployable on\nentry-level GPUs, as it has been liberated from the constraints of long\nsequence encoding. The code, data, and models will be released at\nhttps://github.com/xianlin7/SAMUS.\n","authors":["Xian Lin","Yangyang Xiang","Li Zhang","Xin Yang","Zengqiang Yan","Li Yu"],"pdf_url":"https://arxiv.org/pdf/2309.06824v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.16612v2","updated":"2023-09-13T09:11:26Z","published":"2023-08-31T10:19:23Z","title":"Neural Gradient Regularizer","summary":" Owing to its significant success, the prior imposed on gradient maps has\nconsistently been a subject of great interest in the field of image processing.\nTotal variation (TV), one of the most representative regularizers, is known for\nits ability to capture the intrinsic sparsity prior underlying gradient maps.\nNonetheless, TV and its variants often underestimate the gradient maps, leading\nto the weakening of edges and details whose gradients should not be zero in the\noriginal image (i.e., image structures is not describable by sparse priors of\ngradient maps). Recently, total deep variation (TDV) has been introduced,\nassuming the sparsity of feature maps, which provides a flexible regularization\nlearned from large-scale datasets for a specific task. However, TDV requires to\nretrain the network with image/task variations, limiting its versatility. To\nalleviate this issue, in this paper, we propose a neural gradient regularizer\n(NGR) that expresses the gradient map as the output of a neural network. Unlike\nexisting methods, NGR does not rely on any subjective sparsity or other prior\nassumptions on image gradient maps, thereby avoiding the underestimation of\ngradient maps. NGR is applicable to various image types and different image\nprocessing tasks, functioning in a zero-shot learning fashion, making it a\nversatile and plug-and-play regularizer. 
Extensive experimental results\ndemonstrate the superior performance of NGR over state-of-the-art counterparts\nfor a range of different tasks, further validating its effectiveness and\nversatility.\n","authors":["Shuang Xu","Yifan Wang","Zixiang Zhao","Jiangjun Peng","Xiangyong Cao","Deyu Meng","Yulun Zhang","Radu Timofte","Luc Van Gool"],"pdf_url":"https://arxiv.org/pdf/2308.16612v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.06819v1","updated":"2023-09-13T09:07:42Z","published":"2023-09-13T09:07:42Z","title":"Tracking Particles Ejected From Active Asteroid Bennu With Event-Based\n Vision","summary":" Early detection and tracking of ejecta in the vicinity of small solar system\nbodies is crucial to guarantee spacecraft safety and support scientific\nobservation. During the visit of active asteroid Bennu, the OSIRIS-REx\nspacecraft relied on the analysis of images captured by onboard navigation\ncameras to detect particle ejection events, which ultimately became one of the\nmission's scientific highlights. To increase the scientific return of similar\ntime-constrained missions, this work proposes an event-based solution that is\ndedicated to the detection and tracking of centimetre-sized particles. Unlike a\nstandard frame-based camera, the pixels of an event-based camera independently\ntrigger events indicating whether the scene brightness has increased or\ndecreased at that time and location in the sensor plane. As a result of the\nsparse and asynchronous spatiotemporal output, event cameras combine very high\ndynamic range and temporal resolution with low-power consumption, which could\ncomplement existing onboard imaging techniques. This paper motivates the use of\na scientific event camera by reconstructing the particle ejection episodes\nreported by the OSIRIS-REx mission in a photorealistic scene generator and in\nturn, simulating event-based observations. The resulting streams of\nspatiotemporal data support future work on event-based multi-object tracking.\n","authors":["Loïc J. Azzalini","Dario Izzo"],"pdf_url":"https://arxiv.org/pdf/2309.06819v1.pdf","comment":"6 pages, 3 figures, presented at the XXVII Italian Association of\n Aeronautics and Astronautics (AIDAA) Congress, 4-7 September 2023, Padova\n Italy"},{"id":"http://arxiv.org/abs/2309.06810v1","updated":"2023-09-13T09:00:45Z","published":"2023-09-13T09:00:45Z","title":"Leveraging SE(3) Equivariance for Learning 3D Geometric Shape Assembly","summary":" Shape assembly aims to reassemble parts (or fragments) into a complete\nobject, which is a common task in our daily life. Different from the semantic\npart assembly (e.g., assembling a chair's semantic parts like legs into a whole\nchair), geometric part assembly (e.g., assembling bowl fragments into a\ncomplete bowl) is an emerging task in computer vision and robotics. Instead of\nsemantic information, this task focuses on geometric information of parts. As\nthe both geometric and pose space of fractured parts are exceptionally large,\nshape pose disentanglement of part representations is beneficial to geometric\nshape assembly. In our paper, we propose to leverage SE(3) equivariance for\nsuch shape pose disentanglement. Moreover, while previous works in vision and\nrobotics only consider SE(3) equivariance for the representations of single\nobjects, we move a step forward and propose leveraging SE(3) equivariance for\nrepresentations considering multi-part correlations, which further boosts the\nperformance of the multi-part assembly. 
Experiments demonstrate the\nsignificance of SE(3) equivariance and our proposed method for geometric shape\nassembly. Project page: https://crtie.github.io/SE-3-part-assembly/\n","authors":["Ruihai Wu","Chenrui Tie","Yushi Du","Yan Zhao","Hao Dong"],"pdf_url":"https://arxiv.org/pdf/2309.06810v1.pdf","comment":"ICCV 2023, Project page: https://crtie.github.io/SE-3-part-assembly/\n , Code:\n https://github.com/crtie/Leveraging-SE-3-Equivariance-for-Learning-3D-Geometric-Shape-Assembly"},{"id":"http://arxiv.org/abs/2309.06809v1","updated":"2023-09-13T08:59:54Z","published":"2023-09-13T08:59:54Z","title":"TAP: Targeted Prompting for Task Adaptive Generation of Textual Training\n Instances for Visual Classification","summary":" Vision and Language Models (VLMs), such as CLIP, have enabled visual\nrecognition of a potentially unlimited set of categories described by text\nprompts. However, for the best visual recognition performance, these models\nstill require tuning to better fit the data distributions of the downstream\ntasks, in order to overcome the domain shift from the web-based pre-training\ndata. Recently, it has been shown that it is possible to effectively tune VLMs\nwithout any paired data, and in particular to effectively improve VLMs visual\nrecognition performance using text-only training data generated by Large\nLanguage Models (LLMs). In this paper, we dive deeper into this exciting\ntext-only VLM training approach and explore ways it can be significantly\nfurther improved taking the specifics of the downstream task into account when\nsampling text data from LLMs. In particular, compared to the SOTA text-only VLM\ntraining approach, we demonstrate up to 8.4% performance improvement in (cross)\ndomain-specific adaptation, up to 8.7% improvement in fine-grained recognition,\nand 3.1% overall average improvement in zero-shot classification compared to\nstrong baselines.\n","authors":["M. Jehanzeb Mirza","Leonid Karlinsky","Wei Lin","Horst Possegger","Rogerio Feris","Horst Bischof"],"pdf_url":"https://arxiv.org/pdf/2309.06809v1.pdf","comment":"Code is available at: https://github.com/jmiemirza/TAP"},{"id":"http://arxiv.org/abs/2309.06807v1","updated":"2023-09-13T08:54:22Z","published":"2023-09-13T08:54:22Z","title":"Bayesian uncertainty-weighted loss for improved generalisability on\n polyp segmentation task","summary":" While several previous studies have devised methods for segmentation of\npolyps, most of these methods are not rigorously assessed on multi-center\ndatasets. Variability due to appearance of polyps from one center to another,\ndifference in endoscopic instrument grades, and acquisition quality result in\nmethods with good performance on in-distribution test data, and poor\nperformance on out-of-distribution or underrepresented samples. Unfair models\nhave serious implications and pose a critical challenge to clinical\napplications. We adapt an implicit bias mitigation method which leverages\nBayesian epistemic uncertainties during training to encourage the model to\nfocus on underrepresented sample regions. We demonstrate the potential of this\napproach to improve generalisability without sacrificing state-of-the-art\nperformance on a challenging multi-center polyp segmentation dataset (PolypGen)\nwith different centers and image modalities.\n","authors":["Rebecca S. Stone","Pedro E. Chavarrias-Solano","Andrew J. Bulpitt","David C. 
Hogg","Sharib Ali"],"pdf_url":"https://arxiv.org/pdf/2309.06807v1.pdf","comment":"To be presented at the Fairness of AI in Medical Imaging (FAIMI)\n MICCAI 2023 Workshop and published in volumes of the Springer Lecture Notes\n Computer Science (LNCS) series"},{"id":"http://arxiv.org/abs/2308.00507v2","updated":"2023-09-13T08:51:58Z","published":"2023-08-01T12:46:02Z","title":"Improved Prognostic Prediction of Pancreatic Cancer Using Multi-Phase CT\n by Integrating Neural Distance and Texture-Aware Transformer","summary":" Pancreatic ductal adenocarcinoma (PDAC) is a highly lethal cancer in which\nthe tumor-vascular involvement greatly affects the resectability and, thus,\noverall survival of patients. However, current prognostic prediction methods\nfail to explicitly and accurately investigate relationships between the tumor\nand nearby important vessels. This paper proposes a novel learnable neural\ndistance that describes the precise relationship between the tumor and vessels\nin CT images of different patients, adopting it as a major feature for\nprognosis prediction. Besides, different from existing models that used CNNs or\nLSTMs to exploit tumor enhancement patterns on dynamic contrast-enhanced CT\nimaging, we improved the extraction of dynamic tumor-related texture features\nin multi-phase contrast-enhanced CT by fusing local and global features using\nCNN and transformer modules, further enhancing the features extracted across\nmulti-phase CT images. We extensively evaluated and compared the proposed\nmethod with existing methods in the multi-center (n=4) dataset with 1,070\npatients with PDAC, and statistical analysis confirmed its clinical\neffectiveness in the external test set consisting of three centers. The\ndeveloped risk marker was the strongest predictor of overall survival among\npreoperative factors and it has the potential to be combined with established\nclinical factors to select patients at higher risk who might benefit from\nneoadjuvant therapy.\n","authors":["Hexin Dong","Jiawen Yao","Yuxing Tang","Mingze Yuan","Yingda Xia","Jian Zhou","Hong Lu","Jingren Zhou","Bin Dong","Le Lu","Li Zhang","Zaiyi Liu","Yu Shi","Ling Zhang"],"pdf_url":"https://arxiv.org/pdf/2308.00507v2.pdf","comment":"MICCAI 2023"},{"id":"http://arxiv.org/abs/2309.06802v1","updated":"2023-09-13T08:50:00Z","published":"2023-09-13T08:50:00Z","title":"Dynamic NeRFs for Soccer Scenes","summary":" The long-standing problem of novel view synthesis has many applications,\nnotably in sports broadcasting. Photorealistic novel view synthesis of soccer\nactions, in particular, is of enormous interest to the broadcast industry. Yet\nonly a few industrial solutions have been proposed, and even fewer that achieve\nnear-broadcast quality of the synthetic replays. Except for their setup of\nmultiple static cameras around the playfield, the best proprietary systems\ndisclose close to no information about their inner workings. Leveraging\nmultiple static cameras for such a task indeed presents a challenge rarely\ntackled in the literature, for a lack of public datasets: the reconstruction of\na large-scale, mostly static environment, with small, fast-moving elements.\nRecently, the emergence of neural radiance fields has induced stunning progress\nin many novel view synthesis applications, leveraging deep learning principles\nto produce photorealistic results in the most challenging settings. 
In this\nwork, we investigate the feasibility of basing a solution to the task on\ndynamic NeRFs, i.e., neural models purposed to reconstruct general dynamic\ncontent. We compose synthetic soccer environments and conduct multiple\nexperiments using them, identifying key components that help reconstruct soccer\nscenes with dynamic NeRFs. We show that, although this approach cannot fully\nmeet the quality requirements for the target application, it suggests promising\navenues toward a cost-efficient, automatic solution. We also make our work\ndataset and code publicly available, with the goal to encourage further efforts\nfrom the research community on the task of novel view synthesis for dynamic\nsoccer scenes. For code, data, and video results, please see\nhttps://soccernerfs.isach.be.\n","authors":["Sacha Lewin","Maxime Vandegar","Thomas Hoyoux","Olivier Barnich","Gilles Louppe"],"pdf_url":"https://arxiv.org/pdf/2309.06802v1.pdf","comment":"Accepted at the 6th International ACM Workshop on Multimedia Content\n Analysis in Sports. 8 pages, 9 figures. Project page:\n https://soccernerfs.isach.be"},{"id":"http://arxiv.org/abs/2301.00545v3","updated":"2023-09-13T08:47:35Z","published":"2023-01-02T07:13:28Z","title":"Knockoffs-SPR: Clean Sample Selection in Learning with Noisy Labels","summary":" A noisy training set usually leads to the degradation of the generalization\nand robustness of neural networks. In this paper, we propose a novel\ntheoretically guaranteed clean sample selection framework for learning with\nnoisy labels. Specifically, we first present a Scalable Penalized Regression\n(SPR) method, to model the linear relation between network features and one-hot\nlabels. In SPR, the clean data are identified by the zero mean-shift parameters\nsolved in the regression model. We theoretically show that SPR can recover\nclean data under some conditions. Under general scenarios, the conditions may\nbe no longer satisfied; and some noisy data are falsely selected as clean data.\nTo solve this problem, we propose a data-adaptive method for Scalable Penalized\nRegression with Knockoff filters (Knockoffs-SPR), which is provable to control\nthe False-Selection-Rate (FSR) in the selected clean data. To improve the\nefficiency, we further present a split algorithm that divides the whole\ntraining set into small pieces that can be solved in parallel to make the\nframework scalable to large datasets. While Knockoffs-SPR can be regarded as a\nsample selection module for a standard supervised training pipeline, we further\ncombine it with a semi-supervised algorithm to exploit the support of noisy\ndata as unlabeled data. Experimental results on several benchmark datasets and\nreal-world noisy datasets show the effectiveness of our framework and validate\nthe theoretical results of Knockoffs-SPR. Our code and pre-trained models are\navailable at https://github.com/Yikai-Wang/Knockoffs-SPR.\n","authors":["Yikai Wang","Yanwei Fu","Xinwei Sun"],"pdf_url":"https://arxiv.org/pdf/2301.00545v3.pdf","comment":"update: refined theory and analysis, release code"},{"id":"http://arxiv.org/abs/2309.06792v1","updated":"2023-09-13T08:27:24Z","published":"2023-09-13T08:27:24Z","title":"Motion-Bias-Free Feature-Based SLAM","summary":" For SLAM to be safely deployed in unstructured real world environments, it\nmust possess several key properties that are not encompassed by conventional\nbenchmarks. 
In this paper we show that SLAM commutativity, that is, consistency\nin trajectory estimates on forward and reverse traverses of the same route, is\na significant issue for the state of the art. Current pipelines show a\nsignificant bias between forward and reverse directions of travel, that is in\naddition inconsistent regarding which direction of travel exhibits better\nperformance. In this paper we propose several contributions to feature-based\nSLAM pipelines that remedies the motion bias problem. In a comprehensive\nevaluation across four datasets, we show that our contributions implemented in\nORB-SLAM2 substantially reduce the bias between forward and backward motion and\nadditionally improve the aggregated trajectory error. Removing the SLAM motion\nbias has significant relevance for the wide range of robotics and computer\nvision applications where performance consistency is important.\n","authors":["Alejandro Fontan","Javier Civera","Michael Milford"],"pdf_url":"https://arxiv.org/pdf/2309.06792v1.pdf","comment":"BMVC 2023"},{"id":"http://arxiv.org/abs/2309.06067v2","updated":"2023-09-13T08:14:35Z","published":"2023-09-12T09:07:03Z","title":"Batch Implicit Neural Representation for MRI Parallel Reconstruction","summary":" Magnetic resonance imaging (MRI) always suffered from the problem of long\nacquisition time. MRI reconstruction is one solution to reduce scan time by\nskipping certain phase-encoding lines and then restoring high-quality images\nfrom undersampled measurements. Recently, implicit neural representation (INR)\nhas emerged as a new deep learning method that represents an object as a\ncontinuous function of spatial coordinates, and this function is normally\nparameterized by a multilayer perceptron (MLP). In this paper, we propose a\nnovel MRI reconstruction method based on INR, which represents the\nfully-sampled images as the function of pixel coordinates and prior feature\nvectors of undersampled images for overcoming the generalization problem of\nINR. Specifically, we introduce a scale-embedded encoder to produce\nscale-independent pixel-specific features from MR images with different\nundersampled scales and then concatenate with coordinates vectors to recover\nfully-sampled MR images via an MLP, thus achieving arbitrary scale\nreconstruction. The performance of the proposed method was assessed by\nexperimenting on publicly available MRI datasets and compared with other\nreconstruction methods. Our quantitative evaluation demonstrates the\nsuperiority of the proposed method over alternative reconstruction methods.\n","authors":["Hao Li","Yusheng Zhou","Jianan Liu","Xiling Liu","Tao Huang","Zhihan Lv"],"pdf_url":"https://arxiv.org/pdf/2309.06067v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.06118v2","updated":"2023-09-13T07:36:32Z","published":"2023-09-12T10:33:19Z","title":"C-RITNet: Set Infrared and Visible Image Fusion Free from Complementary\n Information Mining","summary":" Infrared and visible image fusion (IVIF) aims to extract and integrate the\ncomplementary information in two different modalities to generate high-quality\nfused images with salient targets and abundant texture details. However,\ncurrent image fusion methods go to great lengths to excavate complementary\nfeatures, which is generally achieved through two efforts. On the one hand, the\nfeature extraction network is expected to have excellent performance in\nextracting complementary information. 
On the other hand, complex fusion\nstrategies are often designed to aggregate the complementary information. In\nother words, enabling the network to perceive and extract complementary\ninformation is extremely challenging. Complicated fusion strategies, while\neffective, still run the risk of losing weak edge details. To this end, this\npaper rethinks the IVIF outside the box, proposing a complementary-redundant\ninformation transfer network (C-RITNet). It reasonably transfers complementary\ninformation into redundant one, which integrates both the shared and\ncomplementary features from two modalities. Hence, the proposed method is able\nto alleviate the challenges posed by the complementary information extraction\nand reduce the reliance on sophisticated fusion strategies. Specifically, to\nskillfully sidestep aggregating complementary information in IVIF, we first\ndesign the mutual information transfer (MIT) module to mutually represent\nfeatures from two modalities, roughly transferring complementary information\ninto redundant one. Then, a redundant information acquisition supervised by\nsource image (RIASSI) module is devised to further ensure the\ncomplementary-redundant information transfer after MIT. Meanwhile, we also\npropose a structure information preservation (SIP) module to guarantee that the\nedge structure information of the source images can be transferred to the\nfusion results.\n","authors":["Yafei Zhang","Keying Du","Huafeng Li","Zhengtao Yu","Yu Liu"],"pdf_url":"https://arxiv.org/pdf/2309.06118v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.14735v2","updated":"2023-09-13T07:21:44Z","published":"2023-07-27T09:43:06Z","title":"Test Time Adaptation for Blind Image Quality Assessment","summary":" While the design of blind image quality assessment (IQA) algorithms has\nimproved significantly, the distribution shift between the training and testing\nscenarios often leads to a poor performance of these methods at inference time.\nThis motivates the study of test time adaptation (TTA) techniques to improve\ntheir performance at inference time. Existing auxiliary tasks and loss\nfunctions used for TTA may not be relevant for quality-aware adaptation of the\npre-trained model. In this work, we introduce two novel quality-relevant\nauxiliary tasks at the batch and sample levels to enable TTA for blind IQA. In\nparticular, we introduce a group contrastive loss at the batch level and a\nrelative rank loss at the sample level to make the model quality aware and\nadapt to the target data. Our experiments reveal that even using a small batch\nof images from the test distribution helps achieve significant improvement in\nperformance by updating the batch normalization statistics of the source model.\n","authors":["Subhadeep Roy","Shankhanil Mitra","Soma Biswas","Rajiv Soundararajan"],"pdf_url":"https://arxiv.org/pdf/2307.14735v2.pdf","comment":"Accepted to ICCV 2023"},{"id":"http://arxiv.org/abs/2309.06751v1","updated":"2023-09-13T06:48:32Z","published":"2023-09-13T06:48:32Z","title":"Remote Sensing Object Detection Meets Deep Learning: A Meta-review of\n Challenges and Advances","summary":" Remote sensing object detection (RSOD), one of the most fundamental and\nchallenging tasks in the remote sensing field, has received longstanding\nattention. In recent years, deep learning techniques have demonstrated robust\nfeature representation capabilities and led to a big leap in the development of\nRSOD techniques. 
In this era of rapid technical evolution, this review aims to\npresent a comprehensive review of the recent achievements in deep learning\nbased RSOD methods. More than 300 papers are covered in this review. We\nidentify five main challenges in RSOD, including multi-scale object detection,\nrotated object detection, weak object detection, tiny object detection, and\nobject detection with limited supervision, and systematically review the\ncorresponding methods developed in a hierarchical division manner. We also\nreview the widely used benchmark datasets and evaluation metrics within the\nfield of RSOD, as well as the application scenarios for RSOD. Future research\ndirections are provided for further promoting the research in RSOD.\n","authors":["Xiangrong Zhang","Tianyang Zhang","Guanchun Wang","Peng Zhu","Xu Tang","Xiuping Jia","Licheng Jiao"],"pdf_url":"https://arxiv.org/pdf/2309.06751v1.pdf","comment":"Accepted with IEEE Geoscience and Remote Sensing Magazine. More than\n 300 papers relevant to the RSOD filed were reviewed in this survey"},{"id":"http://arxiv.org/abs/2309.06750v1","updated":"2023-09-13T06:46:27Z","published":"2023-09-13T06:46:27Z","title":"MFL-YOLO: An Object Detection Model for Damaged Traffic Signs","summary":" Traffic signs are important facilities to ensure traffic safety and smooth\nflow, but may be damaged due to many reasons, which poses a great safety\nhazard. Therefore, it is important to study a method to detect damaged traffic\nsigns. Existing object detection techniques for damaged traffic signs are still\nabsent. Since damaged traffic signs are closer in appearance to normal ones, it\nis difficult to capture the detailed local damage features of damaged traffic\nsigns using traditional object detection methods. In this paper, we propose an\nimproved object detection method based on YOLOv5s, namely MFL-YOLO (Mutual\nFeature Levels Loss enhanced YOLO). We designed a simple cross-level loss\nfunction so that each level of the model has its own role, which is beneficial\nfor the model to be able to learn more diverse features and improve the fine\ngranularity. The method can be applied as a plug-and-play module and it does\nnot increase the structural complexity or the computational complexity while\nimproving the accuracy. We also replaced the traditional convolution and CSP\nwith the GSConv and VoVGSCSP in the neck of YOLOv5s to reduce the scale and\ncomputational complexity. Compared with YOLOv5s, our MFL-YOLO improves 4.3 and\n5.1 in F1 scores and mAP, while reducing the FLOPs by 8.9%. The Grad-CAM heat\nmap visualization shows that our model can better focus on the local details of\nthe damaged traffic signs. In addition, we also conducted experiments on\nCCTSDB2021 and TT100K to further validate the generalization of our model.\n","authors":["Tengyang Chen","Jiangtao Ren"],"pdf_url":"https://arxiv.org/pdf/2309.06750v1.pdf","comment":"11 pages, 8 figures, 4 tables"},{"id":"http://arxiv.org/abs/2309.06747v1","updated":"2023-09-13T06:38:51Z","published":"2023-09-13T06:38:51Z","title":"Integrating GAN and Texture Synthesis for Enhanced Road Damage Detection","summary":" In the domain of traffic safety and road maintenance, precise detection of\nroad damage is crucial for ensuring safe driving and prolonging road\ndurability. 
However, current methods often fall short due to limited data.\nPrior attempts have used Generative Adversarial Networks to generate damage\nwith diverse shapes and manually integrate it into appropriate positions.\nHowever, the problem has not been well explored and is faced with two\nchallenges. First, they only enrich the location and shape of damage while\nneglect the diversity of severity levels, and the realism still needs further\nimprovement. Second, they require a significant amount of manual effort. To\naddress these challenges, we propose an innovative approach. In addition to\nusing GAN to generate damage with various shapes, we further employ texture\nsynthesis techniques to extract road textures. These two elements are then\nmixed with different weights, allowing us to control the severity of the\nsynthesized damage, which are then embedded back into the original images via\nPoisson blending. Our method ensures both richness of damage severity and a\nbetter alignment with the background. To save labor costs, we leverage\nstructural similarity for automated sample selection during embedding. Each\naugmented data of an original image contains versions with varying severity\nlevels. We implement a straightforward screening strategy to mitigate\ndistribution drift. Experiments are conducted on a public road damage dataset.\nThe proposed method not only eliminates the need for manual labor but also\nachieves remarkable enhancements, improving the mAP by 4.1% and the F1-score by\n4.5%.\n","authors":["Tengyang Chen","Jiangtao Ren"],"pdf_url":"https://arxiv.org/pdf/2309.06747v1.pdf","comment":"10 pages, 13 figures, 2 Tables"},{"id":"http://arxiv.org/abs/2303.06315v2","updated":"2023-09-13T06:35:44Z","published":"2023-03-11T05:23:20Z","title":"DETA: Denoised Task Adaptation for Few-Shot Learning","summary":" Test-time task adaptation in few-shot learning aims to adapt a pre-trained\ntask-agnostic model for capturing taskspecific knowledge of the test task, rely\nonly on few-labeled support samples. Previous approaches generally focus on\ndeveloping advanced algorithms to achieve the goal, while neglecting the\ninherent problems of the given support samples. In fact, with only a handful of\nsamples available, the adverse effect of either the image noise (a.k.a.\nX-noise) or the label noise (a.k.a. Y-noise) from support samples can be\nseverely amplified. To address this challenge, in this work we propose DEnoised\nTask Adaptation (DETA), a first, unified image- and label-denoising framework\northogonal to existing task adaptation approaches. Without extra supervision,\nDETA filters out task-irrelevant, noisy representations by taking advantage of\nboth global visual information and local region details of support samples. On\nthe challenging Meta-Dataset, DETA consistently improves the performance of a\nbroad spectrum of baseline methods applied on various pre-trained models.\nNotably, by tackling the overlooked image noise in Meta-Dataset, DETA\nestablishes new state-of-the-art results. 
Code is released at\nhttps://github.com/JimZAI/DETA.\n","authors":["Ji Zhang","Lianli Gao","Xu Luo","Hengtao Shen","Jingkuan Song"],"pdf_url":"https://arxiv.org/pdf/2303.06315v2.pdf","comment":"10 pages, 5 figures"},{"id":"http://arxiv.org/abs/1910.09455v2","updated":"2023-09-13T06:35:43Z","published":"2019-10-21T15:37:53Z","title":"Depth-wise Decomposition for Accelerating Separable Convolutions in\n Efficient Convolutional Neural Networks","summary":" Very deep convolutional neural networks (CNNs) have been firmly established\nas the primary methods for many computer vision tasks. However, most\nstate-of-the-art CNNs are large, which results in high inference latency.\nRecently, depth-wise separable convolution has been proposed for image\nrecognition tasks on computationally limited platforms such as robotics and\nself-driving cars. Though it is much faster than its counterpart, regular\nconvolution, accuracy is sacrificed. In this paper, we propose a novel\ndecomposition approach based on SVD, namely depth-wise decomposition, for\nexpanding regular convolutions into depthwise separable convolutions while\nmaintaining high accuracy. We show our approach can be further generalized to\nthe multi-channel and multi-layer cases, based on Generalized Singular Value\nDecomposition (GSVD) [59]. We conduct thorough experiments with the latest\nShuffleNet V2 model [47] on both random synthesized dataset and a large-scale\nimage recognition dataset: ImageNet [10]. Our approach outperforms channel\ndecomposition [73] on all datasets. More importantly, our approach improves the\nTop-1 accuracy of ShuffleNet V2 by ~2%.\n","authors":["Yihui He","Jianing Qian","Jianren Wang"],"pdf_url":"https://arxiv.org/pdf/1910.09455v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.06745v1","updated":"2023-09-13T06:31:35Z","published":"2023-09-13T06:31:35Z","title":"VEATIC: Video-based Emotion and Affect Tracking in Context Dataset","summary":" Human affect recognition has been a significant topic in psychophysics and\ncomputer vision. However, the currently published datasets have many\nlimitations. For example, most datasets contain frames that contain only\ninformation about facial expressions. Due to the limitations of previous\ndatasets, it is very hard to either understand the mechanisms for affect\nrecognition of humans or generalize well on common cases for computer vision\nmodels trained on those datasets. In this work, we introduce a brand new large\ndataset, the Video-based Emotion and Affect Tracking in Context Dataset\n(VEATIC), that can conquer the limitations of the previous datasets. VEATIC has\n124 video clips from Hollywood movies, documentaries, and home videos with\ncontinuous valence and arousal ratings of each frame via real-time annotation.\nAlong with the dataset, we propose a new computer vision task to infer the\naffect of the selected character via both context and character information in\neach video frame. Additionally, we propose a simple model to benchmark this new\ncomputer vision task. We also compare the performance of the pretrained model\nusing our dataset with other similar datasets. Experiments show the competing\nresults of our pretrained model via VEATIC, indicating the generalizability of\nVEATIC. Our dataset is available at https://veatic.github.io.\n","authors":["Zhihang Ren","Jefferson Ortega","Yifan Wang","Zhimin Chen","David Whitney","Yunhui Guo","Stella X. 
Yu"],"pdf_url":"https://arxiv.org/pdf/2309.06745v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.01582v3","updated":"2023-09-13T06:25:55Z","published":"2023-09-04T13:10:11Z","title":"Improving Visual Quality and Transferability of Adversarial Attacks on\n Face Recognition Simultaneously with Adversarial Restoration","summary":" Adversarial face examples possess two critical properties: Visual Quality and\nTransferability. However, existing approaches rarely address these properties\nsimultaneously, leading to subpar results. To address this issue, we propose a\nnovel adversarial attack technique known as Adversarial Restoration\n(AdvRestore), which enhances both visual quality and transferability of\nadversarial face examples by leveraging a face restoration prior. In our\napproach, we initially train a Restoration Latent Diffusion Model (RLDM)\ndesigned for face restoration. Subsequently, we employ the inference process of\nRLDM to generate adversarial face examples. The adversarial perturbations are\napplied to the intermediate features of RLDM. Additionally, by treating RLDM\nface restoration as a sibling task, the transferability of the generated\nadversarial face examples is further improved. Our experimental results\nvalidate the effectiveness of the proposed attack method.\n","authors":["Fengfan Zhou","Hefei Ling","Yuxuan Shi","Jiazhong Chen","Ping Li"],"pdf_url":"https://arxiv.org/pdf/2309.01582v3.pdf","comment":"\\copyright 2023 IEEE. Personal use of this material is permitted.\n Permission from IEEE must be obtained for all other uses, in any current or\n future media, including reprinting/republishing this material for advertising\n or promotional purposes, creating new collective works, for resale or\n redistribution to servers or lists, or reuse of any copyrighted component of\n this work in other works"},{"id":"http://arxiv.org/abs/2309.06742v1","updated":"2023-09-13T06:23:58Z","published":"2023-09-13T06:23:58Z","title":"MTD: Multi-Timestep Detector for Delayed Streaming Perception","summary":" Autonomous driving systems require real-time environmental perception to\nensure user safety and experience. Streaming perception is a task of reporting\nthe current state of the world, which is used to evaluate the delay and\naccuracy of autonomous driving systems. In real-world applications, factors\nsuch as hardware limitations and high temperatures inevitably cause delays in\nautonomous driving systems, resulting in the offset between the model output\nand the world state. In order to solve this problem, this paper propose the\nMulti- Timestep Detector (MTD), an end-to-end detector which uses dynamic\nrouting for multi-branch future prediction, giving model the ability to resist\ndelay fluctuations. A Delay Analysis Module (DAM) is proposed to optimize the\nexisting delay sensing method, continuously monitoring the model inference\nstack and calculating the delay trend. Moreover, a novel Timestep Branch Module\n(TBM) is constructed, which includes static flow and adaptive flow to\nadaptively predict specific timesteps according to the delay trend. 
The\nproposed method has been evaluated on the Argoverse-HD dataset, and the\nexperimental results show that it has achieved state-of-the-art performance\nacross various delay settings.\n","authors":["Yihui Huang","Ningjiang Chen"],"pdf_url":"https://arxiv.org/pdf/2309.06742v1.pdf","comment":"12 pages, accepted by PRCV 2023 (The 6th Chinese Conference on\n Pattern Recognition and Computer Vision)"},{"id":"http://arxiv.org/abs/2301.10908v4","updated":"2023-09-13T06:11:12Z","published":"2023-01-26T02:38:37Z","title":"Distilling Cognitive Backdoor Patterns within an Image","summary":" This paper proposes a simple method to distill and detect backdoor patterns\nwithin an image: \\emph{Cognitive Distillation} (CD). The idea is to extract the\n\"minimal essence\" from an input image responsible for the model's prediction.\nCD optimizes an input mask to extract a small pattern from the input image that\ncan lead to the same model output (i.e., logits or deep features). The\nextracted pattern can help understand the cognitive mechanism of a model on\nclean vs. backdoor images and is thus called a \\emph{Cognitive Pattern} (CP).\nUsing CD and the distilled CPs, we uncover an interesting phenomenon of\nbackdoor attacks: despite the various forms and sizes of trigger patterns used\nby different attacks, the CPs of backdoor samples are all surprisingly and\nsuspiciously small. One thus can leverage the learned mask to detect and remove\nbackdoor examples from poisoned training datasets. We conduct extensive\nexperiments to show that CD can robustly detect a wide range of advanced\nbackdoor attacks. We also show that CD can potentially be applied to help\ndetect potential biases from face datasets. Code is available at\n\\url{https://github.com/HanxunH/CognitiveDistillation}.\n","authors":["Hanxun Huang","Xingjun Ma","Sarah Erfani","James Bailey"],"pdf_url":"https://arxiv.org/pdf/2301.10908v4.pdf","comment":"ICLR2023"},{"id":"http://arxiv.org/abs/2309.06735v1","updated":"2023-09-13T05:48:35Z","published":"2023-09-13T05:48:35Z","title":"GelFlow: Self-supervised Learning of Optical Flow for Vision-Based\n Tactile Sensor Displacement Measurement","summary":" High-resolution multi-modality information acquired by vision-based tactile\nsensors can support more dexterous manipulations for robot fingers. Optical\nflow is low-level information directly obtained by vision-based tactile\nsensors, which can be transformed into other modalities like force, geometry\nand depth. Current vision-tactile sensors employ optical flow methods from\nOpenCV to estimate the deformation of markers in gels. However, these methods\nneed to be more precise for accurately measuring the displacement of markers\nduring large elastic deformation of the gel, as this can significantly impact\nthe accuracy of downstream tasks. This study proposes a self-supervised optical\nflow method based on deep learning to achieve high accuracy in displacement\nmeasurement for vision-based tactile sensors. The proposed method employs a\ncoarse-to-fine strategy to handle large deformations by constructing a\nmulti-scale feature pyramid from the input image. To better deal with the\nelastic deformation caused by the gel, the Helmholtz velocity decomposition\nconstraint combined with the elastic deformation constraint are adopted to\naddress the distortion rate and area change rate, respectively. A local flow\nfusion module is designed to smooth the optical flow, taking into account the\nprior knowledge of the blurred effect of gel deformation. 
We trained the\nproposed self-supervised network using an open-source dataset and compared it\nwith traditional and deep learning-based optical flow methods. The results show\nthat the proposed method achieved the highest displacement measurement\naccuracy, thereby demonstrating its potential for enabling more precise\nmeasurement of downstream tasks using vision-based tactile sensors.\n","authors":["Zhiyuan Zhang","Hua Yang","Zhouping Yin"],"pdf_url":"https://arxiv.org/pdf/2309.06735v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.06323v2","updated":"2023-09-13T05:43:53Z","published":"2023-09-12T15:33:09Z","title":"SAMPLING: Scene-adaptive Hierarchical Multiplane Images Representation\n for Novel View Synthesis from a Single Image","summary":" Recent novel view synthesis methods obtain promising results for relatively\nsmall scenes, e.g., indoor environments and scenes with a few objects, but tend\nto fail for unbounded outdoor scenes with a single image as input. In this\npaper, we introduce SAMPLING, a Scene-adaptive Hierarchical Multiplane Images\nRepresentation for Novel View Synthesis from a Single Image based on improved\nmultiplane images (MPI). Observing that depth distribution varies significantly\nfor unbounded outdoor scenes, we employ an adaptive-bins strategy for MPI to\narrange planes in accordance with each scene image. To represent intricate\ngeometry and multi-scale details, we further introduce a hierarchical\nrefinement branch, which results in high-quality synthesized novel views. Our\nmethod demonstrates considerable performance gains in synthesizing large-scale\nunbounded outdoor scenes using a single image on the KITTI dataset and\ngeneralizes well to the unseen Tanks and Temples dataset.The code and models\nwill soon be made available.\n","authors":["Xiaoyu Zhou","Zhiwei Lin","Xiaojun Shan","Yongtao Wang","Deqing Sun","Ming-Hsuan Yang"],"pdf_url":"https://arxiv.org/pdf/2309.06323v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.06728v1","updated":"2023-09-13T05:05:47Z","published":"2023-09-13T05:05:47Z","title":"Leveraging Foundation models for Unsupervised Audio-Visual Segmentation","summary":" Audio-Visual Segmentation (AVS) aims to precisely outline audible objects in\na visual scene at the pixel level. Existing AVS methods require fine-grained\nannotations of audio-mask pairs in supervised learning fashion. This limits\ntheir scalability since it is time consuming and tedious to acquire such\ncross-modality pixel level labels. To overcome this obstacle, in this work we\nintroduce unsupervised audio-visual segmentation with no need for task-specific\ndata annotations and model training. For tackling this newly proposed problem,\nwe formulate a novel Cross-Modality Semantic Filtering (CMSF) approach to\naccurately associate the underlying audio-mask pairs by leveraging the\noff-the-shelf multi-modal foundation models (e.g., detection [1], open-world\nsegmentation [2] and multi-modal alignment [3]). Guiding the proposal\ngeneration by either audio or visual cues, we design two training-free\nvariants: AT-GDINO-SAM and OWOD-BIND. Extensive experiments on the AVS-Bench\ndataset show that our unsupervised approach can perform well in comparison to\nprior art supervised counterparts across complex scenarios with multiple\nauditory objects. Particularly, in situations where existing supervised AVS\nmethods struggle with overlapping foreground objects, our models still excel in\naccurately segmenting overlapped auditory objects. 
Our code will be publicly\nreleased.\n","authors":["Swapnil Bhosale","Haosen Yang","Diptesh Kanojia","Xiatian Zhu"],"pdf_url":"https://arxiv.org/pdf/2309.06728v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.06724v1","updated":"2023-09-13T04:57:12Z","published":"2023-09-13T04:57:12Z","title":"Deep Nonparametric Convexified Filtering for Computational Photography,\n Image Synthesis and Adversarial Defense","summary":" We aim to provide a general framework of for computational photography that\nrecovers the real scene from imperfect images, via the Deep Nonparametric\nConvexified Filtering (DNCF). It is consists of a nonparametric deep network to\nresemble the physical equations behind the image formation, such as denoising,\nsuper-resolution, inpainting, and flash. DNCF has no parameterization dependent\non training data, therefore has a strong generalization and robustness to\nadversarial image manipulation. During inference, we also encourage the network\nparameters to be nonnegative and create a bi-convex function on the input and\nparameters, and this adapts to second-order optimization algorithms with\ninsufficient running time, having 10X acceleration over Deep Image Prior. With\nthese tools, we empirically verify its capability to defend image\nclassification deep networks against adversary attack algorithms in real-time.\n","authors":["Jianqiao Wangni"],"pdf_url":"https://arxiv.org/pdf/2309.06724v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.06721v1","updated":"2023-09-13T04:51:15Z","published":"2023-09-13T04:51:15Z","title":"Dynamic Spectrum Mixer for Visual Recognition","summary":" Recently, MLP-based vision backbones have achieved promising performance in\nseveral visual recognition tasks. However, the existing MLP-based methods\ndirectly aggregate tokens with static weights, leaving the adaptability to\ndifferent images untouched. Moreover, Recent research demonstrates that\nMLP-Transformer is great at creating long-range dependencies but ineffective at\ncatching high frequencies that primarily transmit local information, which\nprevents it from applying to the downstream dense prediction tasks, such as\nsemantic segmentation. To address these challenges, we propose a\ncontent-adaptive yet computationally efficient structure, dubbed Dynamic\nSpectrum Mixer (DSM). The DSM represents token interactions in the frequency\ndomain by employing the Discrete Cosine Transform, which can learn long-term\nspatial dependencies with log-linear complexity. Furthermore, a dynamic\nspectrum weight generation layer is proposed as the spectrum bands selector,\nwhich could emphasize the informative frequency bands while diminishing others.\nTo this end, the technique can efficiently learn detailed features from visual\ninput that contains both high- and low-frequency information. Extensive\nexperiments show that DSM is a powerful and adaptable backbone for a range of\nvisual recognition tasks. Particularly, DSM outperforms previous\ntransformer-based and MLP-based models, on image classification, object\ndetection, and semantic segmentation tasks, such as 83.8 \\% top-1 accuracy on\nImageNet, and 49.9 \\% mIoU on ADE20K.\n","authors":["Zhiqiang Hu","Tao Yu"],"pdf_url":"https://arxiv.org/pdf/2309.06721v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.06720v1","updated":"2023-09-13T04:49:49Z","published":"2023-09-13T04:49:49Z","title":"Deep Attentive Time Warping","summary":" Similarity measures for time series are important problems for time series\nclassification. 
To handle the nonlinear time distortions, Dynamic Time Warping\n(DTW) has been widely used. However, DTW is not learnable and suffers from a\ntrade-off between robustness against time distortion and discriminative power.\nIn this paper, we propose a neural network model for task-adaptive time\nwarping. Specifically, we use the attention model, called the bipartite\nattention model, to develop an explicit time warping mechanism with greater\ndistortion invariance. Unlike other learnable models using DTW for warping, our\nmodel predicts all local correspondences between two time series and is trained\nbased on metric learning, which enables it to learn the optimal data-dependent\nwarping for the target task. We also propose to induce pre-training of our\nmodel by DTW to improve the discriminative power. Extensive experiments\ndemonstrate the superior effectiveness of our model over DTW and its\nstate-of-the-art performance in online signature verification.\n","authors":["Shinnosuke Matsuo","Xiaomeng Wu","Gantugs Atarsaikhan","Akisato Kimura","Kunio Kashino","Brian Kenji Iwana","Seiichi Uchida"],"pdf_url":"https://arxiv.org/pdf/2309.06720v1.pdf","comment":"Accepted at Pattern Recognition"},{"id":"http://arxiv.org/abs/2309.06714v1","updated":"2023-09-13T04:31:00Z","published":"2023-09-13T04:31:00Z","title":"MPI-Flow: Learning Realistic Optical Flow with Multiplane Images","summary":" The accuracy of learning-based optical flow estimation models heavily relies\non the realism of the training datasets. Current approaches for generating such\ndatasets either employ synthetic data or generate images with limited realism.\nHowever, the domain gap of these data with real-world scenes constrains the\ngeneralization of the trained model to real-world applications. To address this\nissue, we investigate generating realistic optical flow datasets from\nreal-world images. Firstly, to generate highly realistic new images, we\nconstruct a layered depth representation, known as multiplane images (MPI),\nfrom single-view images. This allows us to generate novel view images that are\nhighly realistic. To generate optical flow maps that correspond accurately to\nthe new image, we calculate the optical flows of each plane using the camera\nmatrix and plane depths. We then project these layered optical flows into the\noutput optical flow map with volume rendering. Secondly, to ensure the realism\nof motion, we present an independent object motion module that can separate the\ncamera and dynamic object motion in MPI. This module addresses the deficiency\nin MPI-based single-view methods, where optical flow is generated only by\ncamera motion and does not account for any object movement. We additionally\ndevise a depth-aware inpainting module to merge new images with dynamic objects\nand address unnatural motion occlusions. We show the superior performance of\nour method through extensive experiments on real-world datasets. Moreover, our\napproach achieves state-of-the-art performance in both unsupervised and\nsupervised training of learning-based models. 
The code will be made publicly\navailable at: \\url{https://github.com/Sharpiless/MPI-Flow}.\n","authors":["Yingping Liang","Jiaming Liu","Debing Zhang","Ying Fu"],"pdf_url":"https://arxiv.org/pdf/2309.06714v1.pdf","comment":"Accepted to ICCV2023"},{"id":"http://arxiv.org/abs/1911.10737v5","updated":"2023-09-13T04:03:20Z","published":"2019-11-25T07:31:54Z","title":"Nearest Neighbor Sampling of Point Sets using Rays","summary":" We propose a new framework for the sampling, compression, and analysis of\ndistributions of point sets and other geometric objects embedded in Euclidean\nspaces. Our approach involves constructing a tensor called the RaySense sketch,\nwhich captures nearest neighbors from the underlying geometry of points along a\nset of rays. We explore various operations that can be performed on the\nRaySense sketch, leading to different properties and potential applications.\nStatistical information about the data set can be extracted from the sketch,\nindependent of the ray set. Line integrals on point sets can be efficiently\ncomputed using the sketch. We also present several examples illustrating\napplications of the proposed strategy in practical scenarios.\n","authors":["Liangchen Liu","Louis Ly","Colin Macdonald","Yen-Hsi Richard Tsai"],"pdf_url":"https://arxiv.org/pdf/1911.10737v5.pdf","comment":"48 pages, 14 figures, accepted to Communication on Applied\n Mathematics and Computation (CAMC), Focused Issue in Honor of Prof. Stanley\n Osher on the Occasion of His 80th Birthday. Fixed typos and improved\n notations"},{"id":"http://arxiv.org/abs/2309.06703v1","updated":"2023-09-13T04:02:38Z","published":"2023-09-13T04:02:38Z","title":"VLSlice: Interactive Vision-and-Language Slice Discovery","summary":" Recent work in vision-and-language demonstrates that large-scale pretraining\ncan learn generalizable models that are efficiently transferable to downstream\ntasks. While this may improve dataset-scale aggregate metrics, analyzing\nperformance around hand-crafted subgroups targeting specific bias dimensions\nreveals systemic undesirable behaviors. However, this subgroup analysis is\nfrequently stalled by annotation efforts, which require extensive time and\nresources to collect the necessary data. Prior art attempts to automatically\ndiscover subgroups to circumvent these constraints but typically leverages\nmodel behavior on existing task-specific annotations and rapidly degrades on\nmore complex inputs beyond \"tabular\" data, none of which study\nvision-and-language models. This paper presents VLSlice, an interactive system\nenabling user-guided discovery of coherent representation-level subgroups with\nconsistent visiolinguistic behavior, denoted as vision-and-language slices,\nfrom unlabeled image sets. We show that VLSlice enables users to quickly\ngenerate diverse high-coherency slices in a user study (n=22) and release the\ntool publicly.\n","authors":["Eric Slyman","Minsuk Kahng","Stefan Lee"],"pdf_url":"https://arxiv.org/pdf/2309.06703v1.pdf","comment":"Conference paper at ICCV 2023. 17 pages, 11 figures.\n https://ericslyman.com/vlslice/"},{"id":"http://arxiv.org/abs/2309.06701v1","updated":"2023-09-13T03:52:09Z","published":"2023-09-13T03:52:09Z","title":"Transparent Object Tracking with Enhanced Fusion Module","summary":" Accurate tracking of transparent objects, such as glasses, plays a critical\nrole in many robotic tasks such as robot-assisted living. 
Due to the adaptive\nand often reflective texture of such objects, traditional tracking algorithms\nthat rely on general-purpose learned features suffer from reduced performance.\nRecent research has proposed to instill transparency awareness into existing\ngeneral object trackers by fusing purpose-built features. However, with the\nexisting fusion techniques, the addition of new features causes a change in the\nlatent space, making it impossible to incorporate transparency awareness into\ntrackers with fixed latent spaces. For example, many current\ntransformer-based trackers are fully pre-trained and are sensitive to any\nlatent space perturbations. In this paper, we present a new feature fusion\ntechnique that integrates transparency information into a fixed feature space,\nenabling its use in a broader range of trackers. Our proposed fusion module,\ncomposed of a transformer encoder and an MLP module, leverages key query-based\ntransformations to embed the transparency information into the tracking\npipeline. We also present a new two-step training strategy for our fusion\nmodule to effectively merge transparency features. We propose a new tracker\narchitecture that uses our fusion techniques to achieve superior results for\ntransparent object tracking. Our proposed method achieves competitive results\nwith state-of-the-art trackers on TOTB, the largest transparent object\ntracking benchmark recently released. Our results and code implementation\nwill be made publicly available at https://github.com/kalyan0510/TOTEM.\n","authors":["Kalyan Garigapati","Erik Blasch","Jie Wei","Haibin Ling"],"pdf_url":"https://arxiv.org/pdf/2309.06701v1.pdf","comment":"IEEE IROS 2023"},{"id":"http://arxiv.org/abs/2307.11336v2","updated":"2023-09-13T03:10:09Z","published":"2023-07-21T03:50:23Z","title":"Character Time-series Matching For Robust License Plate Recognition","summary":" Automatic License Plate Recognition (ALPR) is becoming a popular study area\nand is applied in many fields such as transportation and smart cities. However,\nthere are still several limitations when applying many current methods to\npractical problems due to variation in real-world situations such as lighting\nchanges, unclear License Plate (LP) characters, and image quality. Most\nrecent ALPR algorithms process a single frame, which reduces accuracy when\nimage quality is poor. This paper presents methods to improve license\nplate recognition accuracy by tracking the license plate over multiple frames.\nFirst, the Adaptive License Plate Rotation algorithm is applied to correctly\nalign the detected license plate. Second, we propose a method called Character\nTime-series Matching to recognize license plate characters from many\nconsecutive frames. The proposed method achieves high performance on the\nUFPR-ALPR dataset, with \\boldmath$96.7\\%$ accuracy in real time on an RTX A5000\nGPU card. We also deploy the algorithm for the Vietnamese ALPR system. The\naccuracies for license plate detection and character recognition are 0.881 and\n0.979 $mAP^{test}$@.5, respectively.
The source code is available at\nhttps://github.com/chequanghuy/Character-Time-series-Matching.git\n","authors":["Quang Huy Che","Tung Do Thanh","Cuong Truong Van"],"pdf_url":"https://arxiv.org/pdf/2307.11336v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2211.16799v2","updated":"2023-09-13T02:48:16Z","published":"2022-11-30T07:33:14Z","title":"NOPE-SAC: Neural One-Plane RANSAC for Sparse-View Planar 3D\n Reconstruction","summary":" This paper studies the challenging two-view 3D reconstruction in a rigorous\nsparse-view configuration, which is suffering from insufficient correspondences\nin the input image pairs for camera pose estimation. We present a novel Neural\nOne-PlanE RANSAC framework (termed NOPE-SAC in short) that exerts excellent\ncapability to learn one-plane pose hypotheses from 3D plane correspondences.\nBuilding on the top of a siamese plane detection network, our NOPE-SAC first\ngenerates putative plane correspondences with a coarse initial pose. It then\nfeeds the learned 3D plane parameters of correspondences into shared MLPs to\nestimate the one-plane camera pose hypotheses, which are subsequently reweighed\nin a RANSAC manner to obtain the final camera pose. Because the neural\none-plane pose minimizes the number of plane correspondences for adaptive pose\nhypotheses generation, it enables stable pose voting and reliable pose\nrefinement in a few plane correspondences for the sparse-view inputs. In the\nexperiments, we demonstrate that our NOPE-SAC significantly improves the camera\npose estimation for the two-view inputs with severe viewpoint changes, setting\nseveral new state-of-the-art performances on two challenging benchmarks, i.e.,\nMatterPort3D and ScanNet, for sparse-view 3D reconstruction. The source code is\nreleased at https://github.com/IceTTTb/NopeSAC for reproducible research.\n","authors":["Bin Tan","Nan Xue","Tianfu Wu","Gui-Song Xia"],"pdf_url":"https://arxiv.org/pdf/2211.16799v2.pdf","comment":"Accepted to IEEE TPAMI; Code is available at\n https://github.com/IceTTTb/NopeSAC"},{"id":"http://arxiv.org/abs/2309.06680v1","updated":"2023-09-13T02:35:59Z","published":"2023-09-13T02:35:59Z","title":"STUPD: A Synthetic Dataset for Spatial and Temporal Relation Reasoning","summary":" Understanding relations between objects is crucial for understanding the\nsemantics of a visual scene. It is also an essential step in order to bridge\nvisual and language models. However, current state-of-the-art computer vision\nmodels still lack the ability to perform spatial reasoning well. Existing\ndatasets mostly cover a relatively small number of spatial relations, all of\nwhich are static relations that do not intrinsically involve motion. In this\npaper, we propose the Spatial and Temporal Understanding of Prepositions\nDataset (STUPD) -- a large-scale video dataset for understanding static and\ndynamic spatial relationships derived from prepositions of the English\nlanguage. The dataset contains 150K visual depictions (videos and images),\nconsisting of 30 distinct spatial prepositional senses, in the form of object\ninteraction simulations generated synthetically using Unity3D. In addition to\nspatial relations, we also propose 50K visual depictions across 10 temporal\nrelations, consisting of videos depicting event/time-point interactions. To our\nknowledge, no dataset exists that represents temporal relations through visual\nsettings. 
In this dataset, we also provide 3D information about object\ninteractions, such as frame-wise coordinates and descriptions of the objects\nused. The goal of this synthetic dataset is to help models perform better in\nvisual relationship detection in real-world settings. We demonstrate an\nincrease in the performance of various models over two real-world datasets\n(ImageNet-VidVRD and Spatial Senses) when pretrained on the STUPD dataset, in\ncomparison to other pretraining datasets.\n","authors":["Palaash Agrawal","Haidi Azaman","Cheston Tan"],"pdf_url":"https://arxiv.org/pdf/2309.06680v1.pdf","comment":"Submitted to Neurips Dataset track. 24 pages including citations and\n appendix"},{"id":"http://arxiv.org/abs/2309.06677v1","updated":"2023-09-13T02:24:37Z","published":"2023-09-13T02:24:37Z","title":"SHARM: Segmented Head Anatomical Reference Models","summary":" Reliable segmentation of anatomical tissues of the human head is a major step in\nseveral clinical applications such as brain mapping, surgery planning and\nassociated computational simulation studies. Segmentation is based on\nidentifying different anatomical structures by labeling different tissues\nthrough medical imaging modalities. The segmentation of brain structures is\ncommonly feasible, with several remarkable contributions mainly from a medical\nperspective; however, non-brain tissues are of less interest due to anatomical\ncomplexity and the difficulty of observing them using standard medical imaging\nprotocols. The lack of whole-head segmentation methods and the unavailability of\nlarge segmented human head datasets limit variability studies,\nespecially in the computational evaluation of electrical brain stimulation\n(neuromodulation), human protection from electromagnetic fields, and\nelectroencephalography, where non-brain tissues are of great importance.\n To fill this gap, this study provides the open-access Segmented Head\nAnatomical Reference Models (SHARM), consisting of 196 subjects. These models\nare segmented into 15 different tissues: skin, fat, muscle, skull cancellous\nbone, skull cortical bone, brain white matter, brain gray matter, cerebellum\nwhite matter, cerebellum gray matter, cerebrospinal fluid, dura, vitreous\nhumor, lens, mucous tissue and blood vessels. The segmented head models are\ngenerated from the open-access IXI MRI dataset using a convolutional neural\nnetwork structure named ForkNet+. Results indicate a high consistency between the\nstatistical characteristics of the different tissue distributions across age and\nreal measurements. SHARM is expected to be a useful benchmark not only for\nelectromagnetic dosimetry studies but also for different human head\nsegmentation applications.\n","authors":["Essam A. Rashed","Mohammad al-Shatouri","Ilkka Laakso","Akimasa Hirata"],"pdf_url":"https://arxiv.org/pdf/2309.06677v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.06670v1","updated":"2023-09-13T02:15:29Z","published":"2023-09-13T02:15:29Z","title":"ShaDocFormer: A Shadow-attentive Threshold Detector with Cascaded Fusion\n Refiner for Document Shadow Removal","summary":" Document shadow is a common issue that arises when capturing documents using\nmobile devices, and it significantly impacts readability. Current methods\nencounter various challenges, including inaccurate detection of shadow masks and\nestimation of illumination.
In this paper, we propose ShaDocFormer, a\nTransformer-based architecture that integrates traditional methodologies and\ndeep learning techniques to tackle the problem of document shadow removal. The\nShaDocFormer architecture comprises two components: the Shadow-attentive\nThreshold Detector (STD) and the Cascaded Fusion Refiner (CFR). The STD module\nemploys a traditional thresholding technique and leverages the attention\nmechanism of the Transformer to gather global information, thereby enabling\nprecise detection of shadow masks. The cascaded and aggregative structure of\nthe CFR module facilitates a coarse-to-fine restoration process for the entire\nimage. As a result, ShaDocFormer excels in accurately detecting and capturing\nvariations in both shadow and illumination, thereby enabling effective removal\nof shadows. Extensive experiments demonstrate that ShaDocFormer outperforms\ncurrent state-of-the-art methods in both qualitative and quantitative\nmeasurements.\n","authors":["Weiwen Chen","Shenghong Luo","Xuhang Chen","Zinuo Li","Shuqiang Wang","Chi-Man Pun"],"pdf_url":"https://arxiv.org/pdf/2309.06670v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2306.15142v3","updated":"2023-09-13T02:06:22Z","published":"2023-06-27T02:03:46Z","title":"LRANet: Towards Accurate and Efficient Scene Text Detection with\n Low-Rank Approximation Network","summary":" Recently, regression-based methods, which predict parameterized text shapes\nfor text localization, have gained popularity in scene text detection. However,\nthe existing parameterized text shape methods still have limitations in\nmodeling arbitrary-shaped texts due to ignoring the utilization of\ntext-specific shape information. Moreover, the time consumption of the entire\npipeline has been largely overlooked, leading to a suboptimal overall inference\nspeed. To address these issues, we first propose a novel parameterized text\nshape method based on low-rank approximation. Unlike other shape representation\nmethods that employ data-irrelevant parameterization, our approach utilizes\nsingular value decomposition and reconstructs the text shape using a few\neigenvectors learned from labeled text contours. By exploring the shape\ncorrelation among different text contours, our method achieves consistency,\ncompactness, simplicity, and robustness in shape representation. Next, we\npropose a dual assignment scheme for speed acceleration. It adopts a sparse\nassignment branch to accelerate the inference speed, and meanwhile, provides\nample supervised signals for training through a dense assignment branch.\nBuilding upon these designs, we implement an accurate and efficient\narbitrary-shaped text detector named LRANet. Extensive experiments are\nconducted on several challenging benchmarks, demonstrating the superior\naccuracy and efficiency of LRANet compared to state-of-the-art methods. 
Code\nwill be released soon.\n","authors":["Yuchen Su","Zhineng Chen","Zhiwen Shao","Yuning Du","Zhilong Ji","Jinfeng Bai","Yong Zhou","Yu-Gang Jiang"],"pdf_url":"https://arxiv.org/pdf/2306.15142v3.pdf","comment":"There were some errors in the experimental results of the first\n version, such as inaccurate measurement of FPS and low F-measure"},{"id":"http://arxiv.org/abs/2309.06660v1","updated":"2023-09-13T01:22:16Z","published":"2023-09-13T01:22:16Z","title":"Generalizable Neural Fields as Partially Observed Neural Processes","summary":" Neural fields, which represent signals as a function parameterized by a\nneural network, are a promising alternative to traditional discrete vector or\ngrid-based representations. Compared to discrete representations, neural\nrepresentations scale well with increasing resolution, are continuous, and\ncan be many times differentiable. However, given a dataset of signals that we\nwould like to represent, having to optimize a separate neural field for each\nsignal is inefficient and cannot capitalize on shared information or\nstructures among signals. Existing generalization methods view this as a\nmeta-learning problem and employ gradient-based meta-learning to learn an\ninitialization which is then fine-tuned with test-time optimization, or learn\nhypernetworks to produce the weights of a neural field. We instead propose a\nnew paradigm that views the large-scale training of neural representations as\npart of a partially observed neural process framework, and leverage neural\nprocess algorithms to solve this task. We demonstrate that this approach\noutperforms both state-of-the-art gradient-based meta-learning approaches and\nhypernetwork approaches.\n","authors":["Jeffrey Gu","Kuan-Chieh Wang","Serena Yeung"],"pdf_url":"https://arxiv.org/pdf/2309.06660v1.pdf","comment":"To appear ICCV 2023"},{"id":"http://arxiv.org/abs/2203.14092v3","updated":"2023-09-13T01:18:40Z","published":"2022-03-26T14:31:35Z","title":"A large scale multi-view RGBD visual affordance learning dataset","summary":" The physical and textural attributes of objects have been widely studied for\nrecognition, detection and segmentation tasks in computer vision. A number of\ndatasets, such as large scale ImageNet, have been proposed for feature learning\nusing data-hungry deep neural networks and for hand-crafted feature extraction.\nTo intelligently interact with objects, robots and intelligent machines need\nthe ability to infer beyond the traditional physical/textural attributes, and\nunderstand/learn visual cues, called visual affordances, for affordance\nrecognition, detection and segmentation. To date, there is no publicly available\nlarge dataset for visual affordance understanding and learning. In this paper,\nwe introduce a large scale multi-view RGBD visual affordance learning dataset,\na benchmark of 47210 RGBD images from 37 object categories, annotated with 15\nvisual affordance categories. To the best of our knowledge, this is the first-ever\nand largest multi-view RGBD visual affordance learning dataset. We\nbenchmark the proposed dataset for affordance segmentation and recognition\ntasks using popular Vision Transformer and Convolutional Neural Networks.\nSeveral state-of-the-art deep learning networks are each evaluated for\naffordance recognition and segmentation tasks. Our experimental results\nshowcase the challenging nature of the dataset and present definite prospects\nfor new and robust affordance learning algorithms.
The dataset is publicly\navailable at https://sites.google.com/view/afaqshah/dataset.\n","authors":["Zeyad Khalifa","Syed Afaq Ali Shah"],"pdf_url":"https://arxiv.org/pdf/2203.14092v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2303.16203v3","updated":"2023-09-13T01:16:45Z","published":"2023-03-28T17:59:56Z","title":"Your Diffusion Model is Secretly a Zero-Shot Classifier","summary":" The recent wave of large-scale text-to-image diffusion models has\ndramatically increased our text-based image generation abilities. These models\ncan generate realistic images for a staggering variety of prompts and exhibit\nimpressive compositional generalization abilities. Almost all use cases thus\nfar have solely focused on sampling; however, diffusion models can also provide\nconditional density estimates, which are useful for tasks beyond image\ngeneration. In this paper, we show that the density estimates from large-scale\ntext-to-image diffusion models like Stable Diffusion can be leveraged to\nperform zero-shot classification without any additional training. Our\ngenerative approach to classification, which we call Diffusion Classifier,\nattains strong results on a variety of benchmarks and outperforms alternative\nmethods of extracting knowledge from diffusion models. Although a gap remains\nbetween generative and discriminative approaches on zero-shot recognition\ntasks, our diffusion-based approach has significantly stronger multimodal\ncompositional reasoning ability than competing discriminative approaches.\nFinally, we use Diffusion Classifier to extract standard classifiers from\nclass-conditional diffusion models trained on ImageNet. Our models achieve\nstrong classification performance using only weak augmentations and exhibit\nqualitatively better \"effective robustness\" to distribution shift. Overall, our\nresults are a step toward using generative over discriminative models for\ndownstream tasks. Results and visualizations at\nhttps://diffusion-classifier.github.io/\n","authors":["Alexander C. Li","Mihir Prabhudesai","Shivam Duggal","Ellis Brown","Deepak Pathak"],"pdf_url":"https://arxiv.org/pdf/2303.16203v3.pdf","comment":"In ICCV 2023. Website at https://diffusion-classifier.github.io/"},{"id":"http://arxiv.org/abs/2309.06062v2","updated":"2023-09-13T01:03:30Z","published":"2023-09-12T09:00:17Z","title":"Selection of contributing factors for predicting landslide\n susceptibility using machine learning and deep learning models","summary":" Landslides are a common natural disaster that can cause casualties, property\nsafety threats and economic losses. Therefore, it is important to understand or\npredict the probability of landslide occurrence at potentially risky sites. A\ncommonly used means is to carry out a landslide susceptibility assessment based\non a landslide inventory and a set of landslide contributing factors. This can\nbe readily achieved using machine learning (ML) models such as logistic\nregression (LR), support vector machine (SVM), random forest (RF), extreme\ngradient boosting (Xgboost), or deep learning (DL) models such as convolutional\nneural network (CNN) and long short time memory (LSTM). As the input data for\nthese models, landslide contributing factors have varying influences on\nlandslide occurrence. Therefore, it is logically feasible to select more\nimportant contributing factors and eliminate less relevant ones, with the aim\nof increasing the prediction accuracy of these models. 
However, selecting more\nimportant factors is still a challenging task and there is no generally\naccepted method. Furthermore, the effects of factor selection using various\nmethods on the prediction accuracy of ML and DL models are unclear. In this\nstudy, the impact of the selection of contributing factors on the accuracy of\nlandslide susceptibility predictions using ML and DL models was investigated.\nFour methods for selecting contributing factors were considered for all the\naforementioned ML and DL models, which included Information Gain Ratio (IGR),\nRecursive Feature Elimination (RFE), Particle Swarm Optimization (PSO), Least\nAbsolute Shrinkage and Selection Operators (LASSO) and Harris Hawk Optimization\n(HHO). In addition, autoencoder-based factor selection methods for DL models\nwere also investigated. To assess their performances, an exhaustive approach\nwas adopted,...\n","authors":["Cheng Chen","Lei Fan"],"pdf_url":"https://arxiv.org/pdf/2309.06062v2.pdf","comment":"Stochastic Environmental Research and Risk Assessment"},{"id":"http://arxiv.org/abs/2309.06652v1","updated":"2023-09-13T00:38:59Z","published":"2023-09-13T00:38:59Z","title":"Event-Driven Imaging in Turbid Media: A Confluence of Optoelectronics\n and Neuromorphic Computation","summary":" In this paper a new optical-computational method is introduced to unveil\nimages of targets whose visibility is severely obscured by light scattering in\ndense, turbid media. The targets of interest are taken to be dynamic in that\ntheir optical properties are time-varying whether stationary in space or\nmoving. The scheme, to our knowledge the first of its kind, is human vision\ninspired whereby diffuse photons collected from the turbid medium are first\ntransformed to spike trains by a dynamic vision sensor as in the retina, and\nimage reconstruction is then performed by a neuromorphic computing approach\nmimicking the brain. We combine benchtop experimental data in both reflection\n(backscattering) and transmission geometries with support from physics-based\nsimulations to develop a neuromorphic computational model and then apply this\nfor image reconstruction of different MNIST characters and image sets by a\ndedicated deep spiking neural network algorithm. Image reconstruction is\nachieved under conditions of turbidity where an original image is\nunintelligible to the human eye or a digital video camera, yet clearly and\nquantifiable identifiable when using the new neuromorphic computational\napproach.\n","authors":["Ning Zhang","Timothy Shea","Arto Nurmikko"],"pdf_url":"https://arxiv.org/pdf/2309.06652v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2305.15021v2","updated":"2023-09-13T23:46:22Z","published":"2023-05-24T11:04:30Z","title":"EmbodiedGPT: Vision-Language Pre-Training via Embodied Chain of Thought","summary":" Embodied AI is a crucial frontier in robotics, capable of planning and\nexecuting action sequences for robots to accomplish long-horizon tasks in\nphysical environments. In this work, we introduce EmbodiedGPT, an end-to-end\nmulti-modal foundation model for embodied AI, empowering embodied agents with\nmulti-modal understanding and execution capabilities. To achieve this, we have\nmade the following efforts: (i) We craft a large-scale embodied planning\ndataset, termed EgoCOT. 
The dataset consists of carefully selected videos from\nthe Ego4D dataset, along with corresponding high-quality language instructions.\nSpecifically, we generate a sequence of sub-goals with the \"Chain of Thoughts\"\nmode for effective embodied planning. (ii) We introduce an efficient training\napproach to EmbodiedGPT for high-quality plan generation, by adapting a 7B\nlarge language model (LLM) to the EgoCOT dataset via prefix tuning. (iii) We\nintroduce a paradigm for extracting task-related features from LLM-generated\nplanning queries to form a closed loop between high-level planning and\nlow-level control. Extensive experiments show the effectiveness of EmbodiedGPT\non embodied tasks, including embodied planning, embodied control, visual\ncaptioning, and visual question answering. Notably, EmbodiedGPT significantly\nenhances the success rate of the embodied control task by extracting more\neffective features. It has achieved a remarkable 1.6 times increase in success\nrate on the Franka Kitchen benchmark and a 1.3 times increase on the Meta-World\nbenchmark, compared to the BLIP-2 baseline fine-tuned with the Ego4D dataset.\n","authors":["Yao Mu","Qinglong Zhang","Mengkang Hu","Wenhai Wang","Mingyu Ding","Jun Jin","Bin Wang","Jifeng Dai","Yu Qiao","Ping Luo"],"pdf_url":"https://arxiv.org/pdf/2305.15021v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.07332v1","updated":"2023-09-13T22:04:50Z","published":"2023-09-13T22:04:50Z","title":"Reliability-based cleaning of noisy training labels with inductive\n conformal prediction in multi-modal biomedical data mining","summary":" Accurately labeling biomedical data presents a challenge. Traditional\nsemi-supervised learning methods often under-utilize available unlabeled data.\nTo address this, we propose a novel reliability-based training data cleaning\nmethod employing inductive conformal prediction (ICP). This method capitalizes\non a small set of accurately labeled training data and leverages ICP-calculated\nreliability metrics to rectify mislabeled data and outliers within vast\nquantities of noisy training data. The efficacy of the method is validated\nacross three classification tasks within distinct modalities: filtering\ndrug-induced-liver-injury (DILI) literature with title and abstract, predicting\nICU admission of COVID-19 patients through CT radiomics and electronic health\nrecords, and subtyping breast cancer using RNA-sequencing data. Varying levels\nof noise to the training labels were introduced through label permutation.\nResults show significant enhancements in classification performance: accuracy\nenhancement in 86 out of 96 DILI experiments (up to 11.4%), AUROC and AUPRC\nenhancements in all 48 COVID-19 experiments (up to 23.8% and 69.8%), and\naccuracy and macro-average F1 score improvements in 47 out of 48 RNA-sequencing\nexperiments (up to 74.6% and 89.0%). Our method offers the potential to\nsubstantially boost classification performance in multi-modal biomedical\nmachine learning tasks. 
Importantly, it accomplishes this without necessitating\nan excessive volume of meticulously curated training data.\n","authors":["Xianghao Zhan","Qinmei Xu","Yuanning Zheng","Guangming Lu","Olivier Gevaert"],"pdf_url":"https://arxiv.org/pdf/2309.07332v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.07330v1","updated":"2023-09-13T22:01:36Z","published":"2023-09-13T22:01:36Z","title":"Automated Assessment of Critical View of Safety in Laparoscopic\n Cholecystectomy","summary":" Cholecystectomy (gallbladder removal) is one of the most common procedures in\nthe US, with more than 1.2M procedures annually. Compared with classical open\ncholecystectomy, laparoscopic cholecystectomy (LC) is associated with\nsignificantly shorter recovery period, and hence is the preferred method.\nHowever, LC is also associated with an increase in bile duct injuries (BDIs),\nresulting in significant morbidity and mortality. The primary cause of BDIs\nfrom LCs is misidentification of the cystic duct with the bile duct. Critical\nview of safety (CVS) is the most effective of safety protocols, which is said\nto be achieved during the surgery if certain criteria are met. However, due to\nsuboptimal understanding and implementation of CVS, the BDI rates have remained\nstable over the last three decades. In this paper, we develop deep-learning\ntechniques to automate the assessment of CVS in LCs. An innovative aspect of\nour research is on developing specialized learning techniques by incorporating\ndomain knowledge to compensate for the limited training data available in\npractice. In particular, our CVS assessment process involves a fusion of two\nsegmentation maps followed by an estimation of a certain region of interest\nbased on anatomical structures close to the gallbladder, and then finally\ndetermination of each of the three CVS criteria via rule-based assessment of\nstructural information. We achieved a gain of over 11.8% in mIoU on relevant\nclasses with our two-stream semantic segmentation approach when compared to a\nsingle-model baseline, and 1.84% in mIoU with our proposed Sobel loss function\nwhen compared to a Transformer-based baseline model. For CVS criteria, we\nachieved up to 16% improvement and, for the overall CVS assessment, we achieved\n5% improvement in balanced accuracy compared to DeepCVS under the same\nexperiment settings.\n","authors":["Yunfan Li","Himanshu Gupta","Haibin Ling","IV Ramakrishnan","Prateek Prasanna","Georgios Georgakis","Aaron Sasson"],"pdf_url":"https://arxiv.org/pdf/2309.07330v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.02561v2","updated":"2023-09-13T21:40:56Z","published":"2023-09-05T20:21:03Z","title":"Physically Grounded Vision-Language Models for Robotic Manipulation","summary":" Recent advances in vision-language models (VLMs) have led to improved\nperformance on tasks such as visual question answering and image captioning.\nConsequently, these models are now well-positioned to reason about the physical\nworld, particularly within domains such as robotic manipulation. However,\ncurrent VLMs are limited in their understanding of the physical concepts (e.g.,\nmaterial, fragility) of common objects, which restricts their usefulness for\nrobotic manipulation tasks that involve interaction and physical reasoning\nabout such objects. To address this limitation, we propose PhysObjects, an\nobject-centric dataset of 39.6K crowd-sourced and 417K automated physical\nconcept annotations of common household objects. 
We demonstrate that\nfine-tuning a VLM on PhysObjects improves its understanding of physical object\nconcepts, including generalization to held-out concepts, by capturing human\npriors of these concepts from visual appearance. We incorporate this\nphysically-grounded VLM in an interactive framework with a large language\nmodel-based robotic planner, and show improved planning performance on tasks\nthat require reasoning about physical object concepts, compared to baselines\nthat do not leverage physically-grounded VLMs. We additionally illustrate the\nbenefits of our physically-grounded VLM on a real robot, where it improves task\nsuccess rates. We release our dataset and provide further details and\nvisualizations of our results at https://iliad.stanford.edu/pg-vlm/.\n","authors":["Jensen Gao","Bidipta Sarkar","Fei Xia","Ted Xiao","Jiajun Wu","Brian Ichter","Anirudha Majumdar","Dorsa Sadigh"],"pdf_url":"https://arxiv.org/pdf/2309.02561v2.pdf","comment":"Updated generalization results on held-out concepts"},{"id":"http://arxiv.org/abs/2309.07322v1","updated":"2023-09-13T21:21:50Z","published":"2023-09-13T21:21:50Z","title":"$\\texttt{NePhi}$: Neural Deformation Fields for Approximately\n Diffeomorphic Medical Image Registration","summary":" This work proposes $\\texttt{NePhi}$, a neural deformation model which results\nin approximately diffeomorphic transformations. In contrast to the predominant\nvoxel-based approaches, $\\texttt{NePhi}$ represents deformations functionally\nwhich allows for memory-efficient training and inference. This is of particular\nimportance for large volumetric registrations. Further, while medical image\nregistration approaches representing transformation maps via multi-layer\nperceptrons have been proposed, $\\texttt{NePhi}$ facilitates both pairwise\noptimization-based registration $\\textit{as well as}$ learning-based\nregistration via predicted or optimized global and local latent codes. Lastly,\nas deformation regularity is a highly desirable property for most medical image\nregistration tasks, $\\texttt{NePhi}$ makes use of gradient inverse consistency\nregularization which empirically results in approximately diffeomorphic\ntransformations. We show the performance of $\\texttt{NePhi}$ on two 2D\nsynthetic datasets as well as on real 3D lung registration. Our results show\nthat $\\texttt{NePhi}$ can achieve similar accuracies as voxel-based\nrepresentations in a single-resolution registration setting while using less\nmemory and allowing for faster instance-optimization.\n","authors":["Lin Tian","Soumyadip Sengupta","Hastings Greer","Raúl San José Estépar","Marc Niethammer"],"pdf_url":"https://arxiv.org/pdf/2309.07322v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.07297v1","updated":"2023-09-13T20:47:29Z","published":"2023-09-13T20:47:29Z","title":"Multi-Modal Hybrid Learning and Sequential Training for RGB-T Saliency\n Detection","summary":" RGB-T saliency detection has emerged as an important computer vision task,\nidentifying conspicuous objects in challenging scenes such as dark\nenvironments. However, existing methods neglect the characteristics of\ncross-modal features and rely solely on network structures to fuse RGB and\nthermal features. To address this, we first propose a Multi-Modal Hybrid loss\n(MMHL) that comprises supervised and self-supervised loss functions. 
The\nsupervised loss component of MMHL distinctly utilizes semantic features from\ndifferent modalities, while the self-supervised loss component reduces the\ndistance between RGB and thermal features. We further consider both spatial and\nchannel information during feature fusion and propose the Hybrid Fusion Module\nto effectively fuse RGB and thermal features. Lastly, instead of jointly\ntraining the network with cross-modal features, we implement a sequential\ntraining strategy which performs training only on RGB images in the first stage\nand then learns cross-modal features in the second stage. This training\nstrategy improves saliency detection performance without computational\noverhead. Results from performance evaluation and ablation studies demonstrate\nthe superior performance achieved by the proposed method compared with the\nexisting state-of-the-art methods.\n","authors":["Guangyu Ren","Jitesh Joshi","Youngjun Cho"],"pdf_url":"https://arxiv.org/pdf/2309.07297v1.pdf","comment":"8 Pages main text, 3 pages supplementary information, 12 figures"},{"id":"http://arxiv.org/abs/2211.02048v4","updated":"2023-09-13T20:32:50Z","published":"2022-11-03T17:59:55Z","title":"Efficient Spatially Sparse Inference for Conditional GANs and Diffusion\n Models","summary":" During image editing, existing deep generative models tend to re-synthesize\nthe entire output from scratch, including the unedited regions. This leads to a\nsignificant waste of computation, especially for minor editing operations. In\nthis work, we present Spatially Sparse Inference (SSI), a general-purpose\ntechnique that selectively performs computation for edited regions and\naccelerates various generative models, including both conditional GANs and\ndiffusion models. Our key observation is that users prone to gradually edit the\ninput image. This motivates us to cache and reuse the feature maps of the\noriginal image. Given an edited image, we sparsely apply the convolutional\nfilters to the edited regions while reusing the cached features for the\nunedited areas. Based on our algorithm, we further propose Sparse Incremental\nGenerative Engine (SIGE) to convert the computation reduction to latency\nreduction on off-the-shelf hardware. With about $1\\%$-area edits, SIGE\naccelerates DDPM by $3.0\\times$ on NVIDIA RTX 3090 and $4.6\\times$ on Apple M1\nPro GPU, Stable Diffusion by $7.2\\times$ on 3090, and GauGAN by $5.6\\times$ on\n3090 and $5.2\\times$ on M1 Pro GPU. Compared to our conference version, we\nextend SIGE to accommodate attention layers and apply it to Stable Diffusion.\nAdditionally, we offer support for Apple M1 Pro GPU and include more results\nwith large and sequential edits.\n","authors":["Muyang Li","Ji Lin","Chenlin Meng","Stefano Ermon","Song Han","Jun-Yan Zhu"],"pdf_url":"https://arxiv.org/pdf/2211.02048v4.pdf","comment":"NeurIPS 2022 T-PAMI 2023 Website: https://www.cs.cmu.edu/~sige/ Code:\n https://github.com/lmxyy/sige"},{"id":"http://arxiv.org/abs/2309.07293v1","updated":"2023-09-13T20:28:54Z","published":"2023-09-13T20:28:54Z","title":"GAN-based Algorithm for Efficient Image Inpainting","summary":" Global pandemic due to the spread of COVID-19 has post challenges in a new\ndimension on facial recognition, where people start to wear masks. Under such\ncondition, the authors consider utilizing machine learning in image inpainting\nto tackle the problem, by complete the possible face that is originally covered\nin mask. 
In particular, autoencoder has great potential on retaining important,\ngeneral features of the image as well as the generative power of the generative\nadversarial network (GAN). The authors implement a combination of the two\nmodels, context encoders and explain how it combines the power of the two\nmodels and train the model with 50,000 images of influencers faces and yields a\nsolid result that still contains space for improvements. Furthermore, the\nauthors discuss some shortcomings with the model, their possible improvements,\nas well as some area of study for future investigation for applicative\nperspective, as well as directions to further enhance and refine the model.\n","authors":["Zhengyang Han","Zehao Jiang","Yuan Ju"],"pdf_url":"https://arxiv.org/pdf/2309.07293v1.pdf","comment":"6 pages, 3 figures"},{"id":"http://arxiv.org/abs/2309.07277v1","updated":"2023-09-13T19:33:26Z","published":"2023-09-13T19:33:26Z","title":"Unbiased Face Synthesis With Diffusion Models: Are We There Yet?","summary":" Text-to-image diffusion models have achieved widespread popularity due to\ntheir unprecedented image generation capability. In particular, their ability\nto synthesize and modify human faces has spurred research into using generated\nface images in both training data augmentation and model performance\nassessments. In this paper, we study the efficacy and shortcomings of\ngenerative models in the context of face generation. Utilizing a combination of\nqualitative and quantitative measures, including embedding-based metrics and\nuser studies, we present a framework to audit the characteristics of generated\nfaces conditioned on a set of social attributes. We applied our framework on\nfaces generated through state-of-the-art text-to-image diffusion models. We\nidentify several limitations of face image generation that include faithfulness\nto the text prompt, demographic disparities, and distributional shifts.\nFurthermore, we present an analytical model that provides insights into how\ntraining data selection contributes to the performance of generative models.\n","authors":["Harrison Rosenberg","Shimaa Ahmed","Guruprasad V Ramesh","Ramya Korlakai Vinayak","Kassem Fawaz"],"pdf_url":"https://arxiv.org/pdf/2309.07277v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.07268v1","updated":"2023-09-13T19:18:18Z","published":"2023-09-13T19:18:18Z","title":"So you think you can track?","summary":" This work introduces a multi-camera tracking dataset consisting of 234 hours\nof video data recorded concurrently from 234 overlapping HD cameras covering a\n4.2 mile stretch of 8-10 lane interstate highway near Nashville, TN. The video\nis recorded during a period of high traffic density with 500+ objects typically\nvisible within the scene and typical object longevities of 3-15 minutes. GPS\ntrajectories from 270 vehicle passes through the scene are manually corrected\nin the video data to provide a set of ground-truth trajectories for\nrecall-oriented tracking metrics, and object detections are provided for each\ncamera in the scene (159 million total before cross-camera fusion). 
Initial\nbenchmarking of tracking-by-detection algorithms is performed against the GPS\ntrajectories, and a best HOTA of only 9.5% is obtained (best recall 75.9% at\nIOU 0.1, 47.9 average IDs per ground truth object), indicating the benchmarked\ntrackers do not perform sufficiently well at the long temporal and spatial\ndurations required for traffic scene understanding.\n","authors":["Derek Gloudemans","Gergely Zachár","Yanbing Wang","Junyi Ji","Matt Nice","Matt Bunting","William Barbour","Jonathan Sprinkle","Benedetto Piccoli","Maria Laura Delle Monache","Alexandre Bayen","Benjamin Seibold","Daniel B. Work"],"pdf_url":"https://arxiv.org/pdf/2309.07268v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.07255v1","updated":"2023-09-13T18:43:14Z","published":"2023-09-13T18:43:14Z","title":"Automated segmentation of rheumatoid arthritis immunohistochemistry\n stained synovial tissue","summary":" Rheumatoid Arthritis (RA) is a chronic, autoimmune disease which primarily\naffects the joint's synovial tissue. It is a highly heterogeneous disease, with\nwide cellular and molecular variability observed in synovial tissues. Over the\nlast two decades, the methods available for their study have advanced\nconsiderably. In particular, Immunohistochemistry stains are well suited to\nhighlighting the functional organisation of samples. Yet, analysis of\nIHC-stained synovial tissue samples is still overwhelmingly done manually and\nsemi-quantitatively by expert pathologists. This is because in addition to the\nfragmented nature of IHC stained synovial tissue, there exist wide variations\nin intensity and colour, strong clinical centre batch effect, as well as the\npresence of many undesirable artefacts present in gigapixel Whole Slide Images\n(WSIs), such as water droplets, pen annotation, folded tissue, blurriness, etc.\nThere is therefore a strong need for a robust, repeatable automated tissue\nsegmentation algorithm which can cope with this variability and provide support\nto imaging pipelines. We train a UNET on a hand-curated, heterogeneous\nreal-world multi-centre clinical dataset R4RA, which contains multiple types of\nIHC staining. The model obtains a DICE score of 0.865 and successfully segments\ndifferent types of IHC staining, as well as dealing with variance in colours,\nintensity and common WSIs artefacts from the different clinical centres. It can\nbe used as the first step in an automated image analysis pipeline for synovial\ntissue samples stained with IHC, increasing speed, reproducibility and\nrobustness.\n","authors":["Amaya Gallagher-Syed","Abbas Khan","Felice Rivellese","Costantino Pitzalis","Myles J. Lewis","Gregory Slabaugh","Michael R. Barnes"],"pdf_url":"https://arxiv.org/pdf/2309.07255v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.07254v1","updated":"2023-09-13T18:43:13Z","published":"2023-09-13T18:43:13Z","title":"Mitigate Replication and Copying in Diffusion Models with Generalized\n Caption and Dual Fusion Enhancement","summary":" While diffusion models demonstrate a remarkable capability for generating\nhigh-quality images, their tendency to `replicate' training data raises privacy\nconcerns. Although recent research suggests that this replication may stem from\nthe insufficient generalization of training data captions and duplication of\ntraining images, effective mitigation strategies remain elusive. 
To address\nthis gap, our paper first introduces a generality score that measures the\ncaption generality and employ large language model (LLM) to generalize training\ncaptions. Subsequently, we leverage generalized captions and propose a novel\ndual fusion enhancement approach to mitigate the replication of diffusion\nmodels. Our empirical results demonstrate that our proposed methods can\nsignificantly reduce replication by 43.5% compared to the original diffusion\nmodel while maintaining the diversity and quality of generations.\n","authors":["Chenghao Li","Dake Chen","Yuke Zhang","Peter A. Beerel"],"pdf_url":"https://arxiv.org/pdf/2309.07254v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.07243v1","updated":"2023-09-13T18:28:04Z","published":"2023-09-13T18:28:04Z","title":"LInKs \"Lifting Independent Keypoints\" -- Partial Pose Lifting for\n Occlusion Handling with Improved Accuracy in 2D-3D Human Pose Estimation","summary":" We present LInKs, a novel unsupervised learning method to recover 3D human\nposes from 2D kinematic skeletons obtained from a single image, even when\nocclusions are present. Our approach follows a unique two-step process, which\ninvolves first lifting the occluded 2D pose to the 3D domain, followed by\nfilling in the occluded parts using the partially reconstructed 3D coordinates.\nThis lift-then-fill approach leads to significantly more accurate results\ncompared to models that complete the pose in 2D space alone. Additionally, we\nimprove the stability and likelihood estimation of normalising flows through a\ncustom sampling function replacing PCA dimensionality reduction previously used\nin prior work. Furthermore, we are the first to investigate if different parts\nof the 2D kinematic skeleton can be lifted independently which we find by\nitself reduces the error of current lifting approaches. We attribute this to\nthe reduction of long-range keypoint correlations. In our detailed evaluation,\nwe quantify the error under various realistic occlusion scenarios, showcasing\nthe versatility and applicability of our model. Our results consistently\ndemonstrate the superiority of handling all types of occlusions in 3D space\nwhen compared to others that complete the pose in 2D space. Our approach also\nexhibits consistent accuracy in scenarios without occlusion, as evidenced by a\n7.9% reduction in reconstruction error compared to prior works on the Human3.6M\ndataset. Furthermore, our method excels in accurately retrieving complete 3D\nposes even in the presence of occlusions, making it highly applicable in\nsituations where complete 2D pose information is unavailable.\n","authors":["Peter Hardy","Hansung Kim"],"pdf_url":"https://arxiv.org/pdf/2309.07243v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2011.08790v5","updated":"2023-09-13T18:25:59Z","published":"2020-11-17T17:36:16Z","title":"P1AC: Revisiting Absolute Pose From a Single Affine Correspondence","summary":" Affine correspondences have traditionally been used to improve feature\nmatching over wide baselines. While recent work has successfully used affine\ncorrespondences to solve various relative camera pose estimation problems, less\nattention has been given to their use in absolute pose estimation. We introduce\nthe first general solution to the problem of estimating the pose of a\ncalibrated camera given a single observation of an oriented point and an affine\ncorrespondence. 
The advantage of our approach (P1AC) is that it requires only a\nsingle correspondence, in comparison to the traditional point-based approach\n(P3P), significantly reducing the combinatorics in robust estimation. P1AC\nprovides a general solution that removes restrictive assumptions made in prior\nwork and is applicable to large-scale image-based localization. We propose a\nminimal solution to the P1AC problem and evaluate our novel solver on synthetic\ndata, showing its numerical stability and performance under various types of\nnoise. On standard image-based localization benchmarks we show that P1AC\nachieves more accurate results than the widely used P3P algorithm. Code for our\nmethod is available at https://github.com/jonathanventura/P1AC/ .\n","authors":["Jonathan Ventura","Zuzana Kukelova","Torsten Sattler","Dániel Baráth"],"pdf_url":"https://arxiv.org/pdf/2011.08790v5.pdf","comment":"ICCV 2023 (with corrections in eqs. 6 and 13)"},{"id":"http://arxiv.org/abs/2304.14408v2","updated":"2023-09-13T18:24:00Z","published":"2023-03-16T17:30:51Z","title":"Autocharacterization: Automated and Scalable Semiconductor Property\n Estimation from High-throughput Experiments using Computer Vision","summary":" High-throughput materials synthesis methods have risen in popularity due to\ntheir potential to accelerate the design and discovery of novel functional\nmaterials, such as solution-processed semiconductors. After synthesis, key\nmaterial properties must be measured and characterized to validate discovery\nand provide feedback to optimization cycles. However, with the boom in\ndevelopment of high-throughput synthesis tools that champion production rates\nup to $10^4$ samples per hour with flexible form factors, most sample\ncharacterization methods are either slow (conventional rates of $10^1$ samples\nper hour, approximately 1000x slower) or rigid (e.g., designed for\nstandard-size microplates), resulting in a bottleneck that impedes the\nmaterials-design process. To overcome this challenge, we propose a set of\nautomated material property characterization (autocharacterization) tools that\nleverage the adaptive, parallelizable, and scalable nature of computer vision\nto accelerate the throughput of characterization by 85x compared to the\nnon-automated workflow. We demonstrate a generalizable composition mapping tool\nfor high-throughput synthesized binary material systems as well as two scalable\nautocharacterization algorithms that (1) autonomously compute the band gap of\n200 unique compositions in 6 minutes and (2) autonomously compute the degree of\ndegradation in 200 unique compositions in 20 minutes, generating ultra-high\ncompositional resolution trends of band gap and stability. We demonstrate that\nthe developed band gap and degradation detection autocharacterization methods\nachieve 98.5% accuracy and 96.9% accuracy, respectively, on the\nFA$_{1-x}$MA$_{x}$PbI$_3$, $0\\leq x \\leq 1$ perovskite semiconductor system.\n","authors":["Alexander E. 
Siemenn","Eunice Aissi","Fang Sheng","Armi Tiihonen","Hamide Kavak","Basita Das","Tonio Buonassisi"],"pdf_url":"https://arxiv.org/pdf/2304.14408v2.pdf","comment":"Manuscript 18 pages; Supplemental 20 pages"}],"Information Retrieval":[{"id":"http://arxiv.org/abs/2309.07015v1","updated":"2023-09-13T15:17:29Z","published":"2023-09-13T15:17:29Z","title":"Résumé Parsing as Hierarchical Sequence Labeling: An Empirical Study","summary":" Extracting information from r\\'esum\\'es is typically formulated as a\ntwo-stage problem, where the document is first segmented into sections and then\neach section is processed individually to extract the target entities. Instead,\nwe cast the whole problem as sequence labeling in two levels -- lines and\ntokens -- and study model architectures for solving both tasks simultaneously.\nWe build high-quality r\\'esum\\'e parsing corpora in English, French, Chinese,\nSpanish, German, Portuguese, and Swedish. Based on these corpora, we present\nexperimental results that demonstrate the effectiveness of the proposed models\nfor the information extraction task, outperforming approaches introduced in\nprevious work. We conduct an ablation study of the proposed architectures. We\nalso analyze both model performance and resource efficiency, and describe the\ntrade-offs for model deployment in the context of a production environment.\n","authors":["Federico Retyk","Hermenegildo Fabregat","Juan Aizpuru","Mariana Taglio","Rabih Zbib"],"pdf_url":"https://arxiv.org/pdf/2309.07015v1.pdf","comment":"RecSys in HR'23: The 3rd Workshop on Recommender Systems for Human\n Resources, in conjunction with the 17th ACM Conference on Recommender\n Systems, September 18--22, 2023, Singapore, Singapore"},{"id":"http://arxiv.org/abs/2309.06930v1","updated":"2023-09-13T13:03:44Z","published":"2023-09-13T13:03:44Z","title":"Modeling Dislocation Dynamics Data Using Semantic Web Technologies","summary":" Research in the field of Materials Science and Engineering focuses on the\ndesign, synthesis, properties, and performance of materials. An important class\nof materials that is widely investigated are crystalline materials, including\nmetals and semiconductors. Crystalline material typically contains a distinct\ntype of defect called \"dislocation\". This defect significantly affects various\nmaterial properties, including strength, fracture toughness, and ductility.\nResearchers have devoted a significant effort in recent years to understanding\ndislocation behavior through experimental characterization techniques and\nsimulations, e.g., dislocation dynamics simulations. This paper presents how\ndata from dislocation dynamics simulations can be modeled using semantic web\ntechnologies through annotating data with ontologies. We extend the already\nexisting Dislocation Ontology by adding missing concepts and aligning it with\ntwo other domain-related ontologies (i.e., the Elementary Multi-perspective\nMaterial Ontology and the Materials Design Ontology) allowing for representing\nthe dislocation simulation data efficiently. Moreover, we show a real-world use\ncase by representing the discrete dislocation dynamics data as a knowledge\ngraph (DisLocKG) that illustrates the relationship between them. 
We also\ndeveloped a SPARQL endpoint that brings extensive flexibility to query\nDisLocKG.\n","authors":["Ahmad Zainul Ihsan","Said Fathalla","Stefan Sandfeld"],"pdf_url":"https://arxiv.org/pdf/2309.06930v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.06912v1","updated":"2023-09-13T12:22:14Z","published":"2023-09-13T12:22:14Z","title":"Multi-behavior Recommendation with SVD Graph Neural Networks","summary":" Graph Neural Networks (GNNs) have been extensively employed in the field of\nrecommender systems, offering users personalized recommendations and yielding\nremarkable outcomes. Recently, GNNs incorporating contrastive learning have\ndemonstrated promising performance in handling the sparse data problem of\nrecommender systems. However, existing contrastive learning methods still\nhave limitations in addressing the cold-start problem and resisting noise\ninterference, especially for multi-behavior recommendation. To mitigate the\naforementioned issues, the present research posits a GNN-based multi-behavior\nrecommendation model MB-SVD that utilizes Singular Value Decomposition (SVD)\ngraphs to enhance model performance. In particular, MB-SVD considers user\npreferences under different behaviors, improving recommendation effectiveness\nwhile better addressing the cold-start problem. Our model introduces an\ninnovative methodology, which subsumes a multi-behavior contrastive learning\nparadigm to proficiently discern the intricate interconnections among\nheterogeneous manifestations of user behavior and generates SVD graphs to\nautomate the distillation of crucial multi-behavior self-supervised information\nfor robust graph augmentation. Furthermore, the SVD-based framework reduces the\nembedding dimensions and computational load. Thorough experimentation showcases\nthe remarkable performance of our proposed MB-SVD approach in multi-behavior\nrecommendation endeavors across diverse real-world datasets.\n","authors":["Shengxi Fu","Qianqian Ren"],"pdf_url":"https://arxiv.org/pdf/2309.06912v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.06908v1","updated":"2023-09-13T12:10:54Z","published":"2023-09-13T12:10:54Z","title":"Towards the TopMost: A Topic Modeling System Toolkit","summary":" Topic models have been proposed for decades with various applications and\nrecently refreshed by neural variational inference. However, these topic\nmodels adopt totally distinct dataset, implementation, and evaluation settings,\nwhich hinders their quick utilization and fair comparisons. This greatly\nhinders the research progress of topic models. To address these issues, in this\npaper we propose a Topic Modeling System Toolkit (TopMost). Compared to\nexisting toolkits, TopMost stands out by covering a wider range of topic\nmodeling scenarios including complete lifecycles with dataset pre-processing,\nmodel training, testing, and evaluations. The highly cohesive and decoupled\nmodular design of TopMost enables quick utilization, fair comparisons, and\nflexible extensions of different topic models. This can facilitate the research\nand applications of topic models. 
Our code, tutorials, and documentation are\navailable at https://github.com/bobxwu/topmost.\n","authors":["Xiaobao Wu","Fengjun Pan","Anh Tuan Luu"],"pdf_url":"https://arxiv.org/pdf/2309.06908v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.06882v1","updated":"2023-09-13T11:16:52Z","published":"2023-09-13T11:16:52Z","title":"ProMap: Datasets for Product Mapping in E-commerce","summary":" The goal of product mapping is to decide, whether two listings from two\ndifferent e-shops describe the same products. Existing datasets of matching and\nnon-matching pairs of products, however, often suffer from incomplete product\ninformation or contain only very distant non-matching products. Therefore,\nwhile predictive models trained on these datasets achieve good results on them,\nin practice, they are unusable as they cannot distinguish very similar but\nnon-matching pairs of products. This paper introduces two new datasets for\nproduct mapping: ProMapCz consisting of 1,495 Czech product pairs and ProMapEn\nconsisting of 1,555 English product pairs of matching and non-matching products\nmanually scraped from two pairs of e-shops. The datasets contain both images\nand textual descriptions of the products, including their specifications,\nmaking them one of the most complete datasets for product mapping.\nAdditionally, the non-matching products were selected in two phases, creating\ntwo types of non-matches -- close non-matches and medium non-matches. Even the\nmedium non-matches are pairs of products that are much more similar than\nnon-matches in other datasets -- for example, they still need to have the same\nbrand and similar name and price. After simple data preprocessing, several\nmachine learning algorithms were trained on these and two the other datasets to\ndemonstrate the complexity and completeness of ProMap datasets. ProMap datasets\nare presented as a golden standard for further research of product mapping\nfilling the gaps in existing ones.\n","authors":["Kateřina Macková","Martin Pilát"],"pdf_url":"https://arxiv.org/pdf/2309.06882v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2301.02817v3","updated":"2023-09-13T09:00:48Z","published":"2023-01-07T09:36:35Z","title":"Cost-optimal Seeding Strategy During a Botanical Pandemic in\n Domesticated Fields","summary":" Context: Botanical pandemics cause enormous economic damage and food shortage\naround the globe. However, since botanical pandemics are here to stay in the\nshort-medium term, domesticated field owners can strategically seed their\nfields to optimize each session's economic profit. Objective: Given the\npathogen's epidemiological properties, we aim to find an economically optimal\ngrid-based seeding strategy for field owners and policymakers. Methods: We\npropose a novel epidemiological-economic mathematical model that describes the\neconomic profit from a field of plants during a botanical pandemic. We describe\nthe epidemiological dynamics using a spatio-temporal extended\nSusceptible-Infected-Recovered epidemiological model with a non-linear output\nepidemiological model. Results and Conclusions: We provide an algorithm to\nobtain an optimal grid-formed seeding strategy to maximize economic profit,\ngiven field and pathogen properties. In addition, we implement the proposed\nmodel in realistic settings, analyzing the sensitivity of the economic profit\nas a function of several epidemiological and economic properties. 
We show that\nthe recovery and basic infection rates have a similar economic influence.\nUnintuitively, we show that in the context of a botanical pandemic, a larger farm\ndoes not promise higher economic profit. Significance: Our results demonstrate\na significant benefit of using the proposed seeding strategy and shed more\nlight on the dynamics of the botanical pandemic in domesticated fields.\n","authors":["Teddy Lazebnik"],"pdf_url":"https://arxiv.org/pdf/2301.02817v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.06789v1","updated":"2023-09-13T08:22:56Z","published":"2023-09-13T08:22:56Z","title":"An Image Dataset for Benchmarking Recommender Systems with Raw Pixels","summary":" Recommender systems (RS) have achieved significant success by leveraging\nexplicit identification (ID) features. However, the full potential of content\nfeatures, especially the pure image pixel features, remains relatively\nunexplored. The limited availability of large, diverse, and content-driven\nimage recommendation datasets has hindered the use of raw images as item\nrepresentations. In this regard, we present PixelRec, a massive image-centric\nrecommendation dataset that includes approximately 200 million user-image\ninteractions, 30 million users, and 400,000 high-quality cover images. By\nproviding direct access to raw image pixels, PixelRec enables recommendation\nmodels to learn item representation directly from them. To demonstrate its\nutility, we begin by presenting the results of several classical pure ID-based\nbaseline models, termed IDNet, trained on PixelRec. Then, to show the\neffectiveness of the dataset's image features, we substitute the itemID\nembeddings (from IDNet) with a powerful vision encoder that represents items\nusing their raw image pixels. This new model is dubbed PixelNet. Our findings\nindicate that even in standard, non-cold start recommendation settings where\nIDNet is recognized as highly effective, PixelNet can already perform equally\nwell or even better than IDNet. Moreover, PixelNet has several other notable\nadvantages over IDNet, such as being more effective in cold-start and\ncross-domain recommendation scenarios. These results underscore the importance\nof visual features in PixelRec. We believe that PixelRec can serve as a\ncritical resource and testing ground for research on recommendation models that\nemphasize image pixel content. The dataset, code, and leaderboard will be\navailable at https://github.com/website-pixelrec/PixelRec.\n","authors":["Yu Cheng","Yunzhu Pan","Jiaqi Zhang","Yongxin Ni","Aixin Sun","Fajie Yuan"],"pdf_url":"https://arxiv.org/pdf/2309.06789v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.06748v1","updated":"2023-09-13T06:40:24Z","published":"2023-09-13T06:40:24Z","title":"CONVERSER: Few-Shot Conversational Dense Retrieval with Synthetic Data\n Generation","summary":" Conversational search provides a natural interface for information retrieval\n(IR). Recent approaches have demonstrated promising results in applying dense\nretrieval to conversational IR. However, training dense retrievers requires\nlarge amounts of in-domain paired data. This hinders the development of\nconversational dense retrievers, as abundant in-domain conversations are\nexpensive to collect. In this paper, we propose CONVERSER, a framework for\ntraining conversational dense retrievers with at most 6 examples of in-domain\ndialogues. 
Specifically, we utilize the in-context learning capability of large\nlanguage models to generate conversational queries given a passage in the\nretrieval corpus. Experimental results on conversational retrieval benchmarks\nOR-QuAC and TREC CAsT 19 show that the proposed CONVERSER achieves comparable\nperformance to fully-supervised models, demonstrating the effectiveness of our\nproposed framework in few-shot conversational dense retrieval. All source code\nand generated datasets are available at https://github.com/MiuLab/CONVERSER\n","authors":["Chao-Wei Huang","Chen-Yu Hsu","Tsu-Yuan Hsu","Chen-An Li","Yun-Nung Chen"],"pdf_url":"https://arxiv.org/pdf/2309.06748v1.pdf","comment":"Accepted to SIGDIAL 2023"},{"id":"http://arxiv.org/abs/2309.06175v2","updated":"2023-09-13T03:53:43Z","published":"2023-09-12T12:37:37Z","title":"AKEM: Aligning Knowledge Base to Queries with Ensemble Model for Entity\n Recognition and Linking","summary":" This paper presents a novel approach to address the Entity Recognition and\nLinking Challenge at NLPCC 2015. The task involves extracting named entity\nmentions from short search queries and linking them to entities within a\nreference Chinese knowledge base. To tackle this problem, we first expand the\nexisting knowledge base and utilize external knowledge to identify candidate\nentities, thereby improving the recall rate. Next, we extract features from the\ncandidate entities and utilize Support Vector Regression and Multiple Additive\nRegression Tree as scoring functions to filter the results. Additionally, we\napply rules to further refine the results and enhance precision. Our method is\ncomputationally efficient and achieves an F1 score of 0.535.\n","authors":["Di Lu","Zhongping Liang","Caixia Yuan","Xiaojie Wang"],"pdf_url":"https://arxiv.org/pdf/2309.06175v2.pdf","comment":null}],"Machine Learning":[{"id":"http://arxiv.org/abs/2309.07120v1","updated":"2023-09-13T17:57:21Z","published":"2023-09-13T17:57:21Z","title":"Sight Beyond Text: Multi-Modal Training Enhances LLMs in Truthfulness\n and Ethics","summary":" Multi-modal large language models (MLLMs) are trained based on large language\nmodels (LLM), with an enhanced capability to comprehend multi-modal inputs and\ngenerate textual responses. While they excel in multi-modal tasks, the pure NLP\nabilities of MLLMs are often underestimated and left untested. In this study,\nwe get out of the box and unveil an intriguing characteristic of MLLMs -- our\npreliminary results suggest that visual instruction tuning, a prevailing\nstrategy for transitioning LLMs into MLLMs, unexpectedly and interestingly\nhelps models attain both improved truthfulness and ethical alignment in the\npure NLP context. For example, a visual-instruction-tuned LLaMA2 7B model\nsurpasses the performance of the LLaMA2-chat 7B model, fine-tuned with over one\nmillion human annotations, on TruthfulQA-mc and Ethics benchmarks. Further\nanalysis reveals that the improved alignment can be attributed to the superior\ninstruction quality inherent to visual-text data. 
In releasing our code at\ngithub.com/UCSC-VLAA/Sight-Beyond-Text, we aspire to foster further exploration\ninto the intrinsic value of visual-text synergies and, in a broader scope,\nmulti-modal interactions in alignment research.\n","authors":["Haoqin Tu","Bingchen Zhao","Chen Wei","Cihang Xie"],"pdf_url":"https://arxiv.org/pdf/2309.07120v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.07117v1","updated":"2023-09-13T17:55:11Z","published":"2023-09-13T17:55:11Z","title":"PILOT: A Pre-Trained Model-Based Continual Learning Toolbox","summary":" While traditional machine learning can effectively tackle a wide range of\nproblems, it primarily operates within a closed-world setting, which presents\nlimitations when dealing with streaming data. As a solution, incremental\nlearning emerges to address real-world scenarios involving new data's arrival.\nRecently, pre-training has made significant advancements and garnered the\nattention of numerous researchers. The strong performance of these pre-trained\nmodels (PTMs) presents a promising avenue for developing continual learning\nalgorithms that can effectively adapt to real-world scenarios. Consequently,\nexploring the utilization of PTMs in incremental learning has become essential.\nThis paper introduces a pre-trained model-based continual learning toolbox\nknown as PILOT. On the one hand, PILOT implements some state-of-the-art\nclass-incremental learning algorithms based on pre-trained models, such as L2P,\nDualPrompt, and CODA-Prompt. On the other hand, PILOT also fits typical\nclass-incremental learning algorithms (e.g., DER, FOSTER, and MEMO) within the\ncontext of pre-trained models to evaluate their effectiveness.\n","authors":["Hai-Long Sun","Da-Wei Zhou","Han-Jia Ye","De-Chuan Zhan"],"pdf_url":"https://arxiv.org/pdf/2309.07117v1.pdf","comment":"Code is available at https://github.com/sun-hailong/LAMDA-PILOT"},{"id":"http://arxiv.org/abs/2309.07115v1","updated":"2023-09-13T17:45:41Z","published":"2023-09-13T17:45:41Z","title":"Weakly-Supervised Multi-Task Learning for Audio-Visual Speaker\n Verification","summary":" In this paper, we present a methodology for achieving robust multimodal\nperson representations optimized for open-set audio-visual speaker\nverification. Distance Metric Learning (DML) approaches have typically\ndominated this problem space, owing to strong performance on new and unseen\nclasses. In our work, we explored multitask learning techniques to further\nboost performance of the DML approach and show that an auxiliary task with weak\nlabels can increase the compactness of the learned speaker representation. We\nalso extend the Generalized end-to-end loss (GE2E) to multimodal inputs and\ndemonstrate that it can achieve competitive performance in an audio-visual\nspace. Finally, we introduce a non-synchronous audio-visual sampling random\nstrategy during training time that has shown to improve generalization. 
Our\nnetwork achieves state of the art performance for speaker verification,\nreporting 0.244%, 0.252%, 0.441% Equal Error Rate (EER) on the three official\ntrial lists of VoxCeleb1-O/E/H, which is to our knowledge, the best published\nresults on VoxCeleb1-E and VoxCeleb1-H.\n","authors":["Anith Selvakumar","Homa Fashandi"],"pdf_url":"https://arxiv.org/pdf/2309.07115v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.07113v1","updated":"2023-09-13T17:37:19Z","published":"2023-09-13T17:37:19Z","title":"Contrastive Deep Encoding Enables Uncertainty-aware\n Machine-learning-assisted Histopathology","summary":" Deep neural network models can learn clinically relevant features from\nmillions of histopathology images. However generating high-quality annotations\nto train such models for each hospital, each cancer type, and each diagnostic\ntask is prohibitively laborious. On the other hand, terabytes of training data\n-- while lacking reliable annotations -- are readily available in the public\ndomain in some cases. In this work, we explore how these large datasets can be\nconsciously utilized to pre-train deep networks to encode informative\nrepresentations. We then fine-tune our pre-trained models on a fraction of\nannotated training data to perform specific downstream tasks. We show that our\napproach can reach the state-of-the-art (SOTA) for patch-level classification\nwith only 1-10% randomly selected annotations compared to other SOTA\napproaches. Moreover, we propose an uncertainty-aware loss function, to\nquantify the model confidence during inference. Quantified uncertainty helps\nexperts select the best instances to label for further training. Our\nuncertainty-aware labeling reaches the SOTA with significantly fewer\nannotations compared to random labeling. Last, we demonstrate how our\npre-trained encoders can surpass current SOTA for whole-slide image\nclassification with weak supervision. Our work lays the foundation for data and\ntask-agnostic pre-trained deep networks with quantified uncertainty.\n","authors":["Nirhoshan Sivaroopan","Chamuditha Jayanga","Chalani Ekanayake","Hasindri Watawana","Jathurshan Pradeepkumar","Mithunjha Anandakumar","Ranga Rodrigo","Chamira U. S. Edussooriya","Dushan N. Wadduwage"],"pdf_url":"https://arxiv.org/pdf/2309.07113v1.pdf","comment":"18 pages, 8 figures"},{"id":"http://arxiv.org/abs/2306.03218v2","updated":"2023-09-13T17:36:51Z","published":"2023-06-05T20:08:19Z","title":"Optimal transport for automatic alignment of untargeted metabolomic data","summary":" Untargeted metabolomic profiling through liquid chromatography-mass\nspectrometry (LC-MS) measures a vast array of metabolites within biospecimens,\nadvancing drug development, disease diagnosis, and risk prediction. However,\nthe low throughput of LC-MS poses a major challenge for biomarker discovery,\nannotation, and experimental comparison, necessitating the merging of multiple\ndatasets. Current data pooling methods encounter practical limitations due to\ntheir vulnerability to data variations and hyperparameter dependence. Here we\nintroduce GromovMatcher, a flexible and user-friendly algorithm that\nautomatically combines LC-MS datasets using optimal transport. By capitalizing\non feature intensity correlation structures, GromovMatcher delivers superior\nalignment accuracy and robustness compared to existing approaches. This\nalgorithm scales to thousands of features requiring minimal hyperparameter\ntuning. 
Applying our method to experimental patient studies of liver and\npancreatic cancer, we discover shared metabolic features related to patient\nalcohol intake, demonstrating how GromovMatcher facilitates the search for\nbiomarkers associated with lifestyle risk factors linked to several cancer\ntypes.\n","authors":["Marie Breeur","George Stepaniants","Pekka Keski-Rahkonen","Philippe Rigollet","Vivian Viallon"],"pdf_url":"https://arxiv.org/pdf/2306.03218v2.pdf","comment":"43 pages, 11 figures"},{"id":"http://arxiv.org/abs/2309.07110v1","updated":"2023-09-13T17:32:21Z","published":"2023-09-13T17:32:21Z","title":"Data Augmentation via Subgroup Mixup for Improving Fairness","summary":" In this work, we propose data augmentation via pairwise mixup across\nsubgroups to improve group fairness. Many real-world applications of machine\nlearning systems exhibit biases across certain groups due to\nunder-representation or training data that reflects societal biases. Inspired\nby the successes of mixup for improving classification performance, we develop\na pairwise mixup scheme to augment training data and encourage fair and\naccurate decision boundaries for all subgroups. Data augmentation for group\nfairness allows us to add new samples of underrepresented groups to balance\nsubpopulations. Furthermore, our method allows us to use the generalization\nability of mixup to improve both fairness and accuracy. We compare our proposed\nmixup to existing data augmentation and bias mitigation approaches on both\nsynthetic simulations and real-world benchmark fair classification data,\ndemonstrating that we are able to achieve fair outcomes with robust if not\nimproved accuracy.\n","authors":["Madeline Navarro","Camille Little","Genevera I. Allen","Santiago Segarra"],"pdf_url":"https://arxiv.org/pdf/2309.07110v1.pdf","comment":"5 pages, 2 figures, 1 table"},{"id":"http://arxiv.org/abs/2309.07108v1","updated":"2023-09-13T17:26:36Z","published":"2023-09-13T17:26:36Z","title":"Characterizing Speed Performance of Multi-Agent Reinforcement Learning","summary":" Multi-Agent Reinforcement Learning (MARL) has achieved significant success in\nlarge-scale AI systems and big-data applications such as smart grids,\nsurveillance, etc. Existing advancements in MARL algorithms focus on improving\nthe rewards obtained by introducing various mechanisms for inter-agent\ncooperation. However, these optimizations are usually compute- and\nmemory-intensive, thus leading to suboptimal speed performance in end-to-end\ntraining time. In this work, we analyze the speed performance (i.e.,\nlatency-bounded throughput) as the key metric in MARL implementations.\nSpecifically, we first introduce a taxonomy of MARL algorithms from an\nacceleration perspective categorized by (1) training scheme and (2)\ncommunication method. Using our taxonomy, we identify three state-of-the-art\nMARL algorithms - Multi-Agent Deep Deterministic Policy Gradient (MADDPG),\nTarget-oriented Multi-agent Communication and Cooperation (ToM2C), and\nNetworked Multi-Agent RL (NeurComm) - as target benchmark algorithms, and\nprovide a systematic analysis of their performance bottlenecks on a homogeneous\nmulti-core CPU platform. 
We justify the need for MARL latency-bounded\nthroughput to be a key performance metric in future literature while also\naddressing opportunities for parallelization and acceleration.\n","authors":["Samuel Wiggins","Yuan Meng","Rajgopal Kannan","Viktor Prasanna"],"pdf_url":"https://arxiv.org/pdf/2309.07108v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2205.03202v5","updated":"2023-09-13T17:08:02Z","published":"2022-05-06T13:29:14Z","title":"Perseus: A Simple and Optimal High-Order Method for Variational\n Inequalities","summary":" This paper settles an open and challenging question pertaining to the design\nof simple and optimal high-order methods for solving smooth and monotone\nvariational inequalities (VIs). A VI involves finding $x^\\star \\in \\mathcal{X}$\nsuch that $\\langle F(x), x - x^\\star\\rangle \\geq 0$ for all $x \\in\n\\mathcal{X}$. We consider the setting in which $F$ is smooth with up to\n$(p-1)^{th}$-order derivatives. For $p = 2$, the cubic regularized Newton\nmethod was extended to VIs with a global rate of $O(\\epsilon^{-1})$. An\nimproved rate of $O(\\epsilon^{-2/3}\\log\\log(1/\\epsilon))$ can be obtained via\nan alternative second-order method, but this method requires a nontrivial\nline-search procedure as an inner loop. Similarly, high-order methods based on\nline-search procedures have been shown to achieve a rate of\n$O(\\epsilon^{-2/(p+1)}\\log\\log(1/\\epsilon))$. As emphasized by Nesterov,\nhowever, such procedures do not necessarily imply practical applicability in\nlarge-scale applications, and it would be desirable to complement these results\nwith a simple high-order VI method that retains the optimality of the more\ncomplex methods. We propose a $p^{th}$-order method that does \\textit{not}\nrequire any line search procedure and provably converges to a weak solution at\na rate of $O(\\epsilon^{-2/(p+1)})$. We prove that our $p^{th}$-order method is\noptimal in the monotone setting by establishing a matching lower bound under a\ngeneralized linear span assumption. Our method with restarting attains a linear\nrate for smooth and strictly monotone VIs and a local superlinear rate for\nsmooth and strongly monotone VIs. Our method also achieves a global rate of\n$O(\\epsilon^{-2/p})$ for solving smooth and nonmonotone VIs satisfying the\nMinty condition and when augmented with restarting it attains a global linear\nand local superlinear rate for smooth and nonmonotone VIs satisfying the\nstrictly/strong Minty condition.\n","authors":["Tianyi Lin","Michael. I. Jordan"],"pdf_url":"https://arxiv.org/pdf/2205.03202v5.pdf","comment":"Improve the paper significantly; 40 pages"},{"id":"http://arxiv.org/abs/2309.07085v1","updated":"2023-09-13T16:53:48Z","published":"2023-09-13T16:53:48Z","title":"Mitigating Group Bias in Federated Learning for Heterogeneous Devices","summary":" Federated Learning is emerging as a privacy-preserving model training\napproach in distributed edge applications. As such, most edge deployments are\nheterogeneous in nature i.e., their sensing capabilities and environments vary\nacross deployments. This edge heterogeneity violates the independence and\nidentical distribution (IID) property of local data across clients and produces\nbiased global models i.e. models that contribute to unfair decision-making and\ndiscrimination against a particular community or a group. 
Existing bias\nmitigation techniques only focus on bias generated from label heterogeneity in\nnon-IID data without accounting for domain variations due to feature\nheterogeneity and do not address the global group-fairness property.\n Our work proposes a group-fair FL framework that minimizes group-bias while\npreserving privacy and without resource utilization overhead. Our main idea is\nto leverage average conditional probabilities to compute cross-domain group\n\textit{importance weights} derived from heterogeneous training data to\noptimize the performance of the worst-performing group using a modified\nmultiplicative weights update method. Additionally, we propose regularization\ntechniques to minimize the difference between the worst and best-performing\ngroups while making sure through our thresholding mechanism to strike a balance\nbetween bias reduction and group performance degradation. Our evaluation of\nhuman emotion recognition and image classification benchmarks assesses the fair\ndecision-making of our framework in real-world heterogeneous settings.\n","authors":["Khotso Selialia","Yasra Chandio","Fatima M. Anwar"],"pdf_url":"https://arxiv.org/pdf/2309.07085v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.05519v2","updated":"2023-09-13T16:49:34Z","published":"2023-09-11T15:02:25Z","title":"NExT-GPT: Any-to-Any Multimodal LLM","summary":" While Multimodal Large Language Models (MM-LLMs) have recently made exciting\nstrides, they mostly fall prey to the limitation of only input-side multimodal\nunderstanding, without the ability to produce content in multiple modalities.\nAs we humans always perceive the world and communicate with people through\nvarious modalities, developing any-to-any MM-LLMs capable of accepting and\ndelivering content in any modality becomes essential to human-level AI. To fill\nthe gap, we present an end-to-end general-purpose any-to-any MM-LLM system,\nNExT-GPT. We connect an LLM with multimodal adaptors and different diffusion\ndecoders, enabling NExT-GPT to perceive inputs and generate outputs in\narbitrary combinations of text, images, videos, and audio. By leveraging the\nexisting well-trained highly-performing encoders and decoders, NExT-GPT is\ntuned with only a small number of parameters (1%) of certain projection layers,\nwhich not only benefits low-cost training but also facilitates convenient\nexpansion to more potential modalities. Moreover, we introduce a\nmodality-switching instruction tuning (MosIT) and manually curate a\nhigh-quality dataset for MosIT, based on which NExT-GPT is empowered with\ncomplex cross-modal semantic understanding and content generation. Overall, our\nresearch showcases the promising possibility of building an AI agent capable of\nmodeling universal modalities, paving the way for more human-like AI research\nin the community. Project page: https://next-gpt.github.io/\n","authors":["Shengqiong Wu","Hao Fei","Leigang Qu","Wei Ji","Tat-Seng Chua"],"pdf_url":"https://arxiv.org/pdf/2309.05519v2.pdf","comment":"work in progress"},{"id":"http://arxiv.org/abs/2302.09656v3","updated":"2023-09-13T16:40:35Z","published":"2023-02-19T19:03:26Z","title":"Imprecise Bayesian Neural Networks","summary":" Uncertainty quantification and robustness to distribution shifts are\nimportant goals in machine learning and artificial intelligence. Although\nBayesian Neural Networks (BNNs) allow for uncertainty in the predictions to be\nassessed, different sources of uncertainty are indistinguishable. 
We present\nImprecise Bayesian Neural Networks (IBNNs); they generalize and overcome some\nof the drawbacks of standard BNNs. These latter are trained using a single\nprior and likelihood distributions, whereas IBNNs are trained using credal\nprior and likelihood sets. They allow to distinguish between aleatoric and\nepistemic uncertainties, and to quantify them. In addition, IBNNs are more\nrobust than BNNs to prior and likelihood misspecification, and to distribution\nshift. They can also be used to compute sets of outcomes that enjoy\nprobabilistic guarantees. We apply IBNNs to two case studies. One, for motion\nprediction in autonomous driving scenarios, and two, to model blood glucose and\ninsulin dynamics for artificial pancreas control. We show that IBNNs performs\nbetter when compared to an ensemble of BNNs benchmark.\n","authors":["Michele Caprio","Souradeep Dutta","Kuk Jin Jang","Vivian Lin","Radoslav Ivanov","Oleg Sokolsky","Insup Lee"],"pdf_url":"https://arxiv.org/pdf/2302.09656v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.07072v1","updated":"2023-09-13T16:33:27Z","published":"2023-09-13T16:33:27Z","title":"The Boundaries of Verifiable Accuracy, Robustness, and Generalisation in\n Deep Learning","summary":" In this work, we assess the theoretical limitations of determining guaranteed\nstability and accuracy of neural networks in classification tasks. We consider\nclassical distribution-agnostic framework and algorithms minimising empirical\nrisks and potentially subjected to some weights regularisation. We show that\nthere is a large family of tasks for which computing and verifying ideal stable\nand accurate neural networks in the above settings is extremely challenging, if\nat all possible, even when such ideal solutions exist within the given class of\nneural architectures.\n","authors":["Alexander Bastounis","Alexander N. Gorban","Anders C. Hansen","Desmond J. Higham","Danil Prokhorov","Oliver Sutton","Ivan Y. Tyukin","Qinghua Zhou"],"pdf_url":"https://arxiv.org/pdf/2309.07072v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.06851v2","updated":"2023-09-13T16:21:31Z","published":"2023-08-13T22:03:35Z","title":"Optimizing Offensive Gameplan in the National Basketball Association\n with Machine Learning","summary":" Throughout the analytical revolution that has occurred in the NBA, the\ndevelopment of specific metrics and formulas has given teams, coaches, and\nplayers a new way to see the game. However - the question arises - how can we\nverify any metrics? One method would simply be eyeball approximation (trying\nout many different gameplans) and/or trial and error - an estimation-based and\ncostly approach. Another approach is to try to model already existing metrics\nwith a unique set of features using machine learning techniques. The key to\nthis approach is that with these features that are selected, we can try to\ngauge the effectiveness of these features combined, rather than using\nindividual analysis in simple metric evaluation. If we have an accurate model,\nit can particularly help us determine the specifics of gameplan execution. In\nthis paper, the statistic ORTG (Offensive Rating, developed by Dean Oliver) was\nfound to have a correlation with different NBA playtypes using both a linear\nregression model and a neural network regression model, although ultimately, a\nneural network worked slightly better than linear regression. 
Using the\naccuracy of the models as a justification, the next step was to optimize the\noutput of the model with test examples, which would demonstrate the combination\nof features to best achieve a highly functioning offense.\n","authors":["Eamon Mukhopadhyay"],"pdf_url":"https://arxiv.org/pdf/2308.06851v2.pdf","comment":"6 pages, 4 figures. Revision: Corrected text and citation formatting\n issues"},{"id":"http://arxiv.org/abs/2303.00028v5","updated":"2023-09-13T16:21:22Z","published":"2023-02-28T19:10:12Z","title":"Efficient Sensor Placement from Regression with Sparse Gaussian\n Processes in Continuous and Discrete Spaces","summary":" The sensor placement problem is a common problem that arises when monitoring\ncorrelated phenomena, such as temperature and precipitation. Existing\napproaches to this problem typically use discrete optimization methods, which\nare computationally expensive and cannot scale to large problems. We address\nthe sensor placement problem in correlated environments by reducing it to a\nregression problem that can be efficiently solved using sparse Gaussian\nprocesses (SGPs). Our approach can handle both discrete sensor placement\nproblems-where sensors are limited to a subset of a given set of locations-and\ncontinuous sensor placement problems-where sensors can be placed anywhere in a\nbounded continuous region. Our experimental results on three real-world\ndatasets show that our approach generates sensor placements that result in\nreconstruction quality that is consistently on par or better than the prior\nstate-of-the-art approach while being significantly faster. Our computationally\nefficient approach enables both large-scale sensor placement and fast robotic\nsensor placement for informative path planning algorithms.\n","authors":["Kalvik Jakkala","Srinivas Akella"],"pdf_url":"https://arxiv.org/pdf/2303.00028v5.pdf","comment":"preprint"},{"id":"http://arxiv.org/abs/2309.07056v1","updated":"2023-09-13T16:13:54Z","published":"2023-09-13T16:13:54Z","title":"Deep Quantum Graph Dreaming: Deciphering Neural Network Insights into\n Quantum Experiments","summary":" Despite their promise to facilitate new scientific discoveries, the\nopaqueness of neural networks presents a challenge in interpreting the logic\nbehind their findings. Here, we use an eXplainable-AI (XAI) technique called\n$inception$ or $deep$ $dreaming$, which has been invented in machine learning\nfor computer vision. We use this technique to explore what neural networks\nlearn about quantum optics experiments. Our story begins by training a deep\nneural network on the properties of quantum systems. Once trained, we \"invert\"\nthe neural network -- effectively asking how it imagines a quantum system with\na specific property, and how it would continuously modify the quantum system to\nchange a property. We find that the network can shift the initial distribution\nof properties of the quantum system, and we can conceptualize the learned\nstrategies of the neural network. Interestingly, we find that, in the first\nlayers, the neural network identifies simple properties, while in the deeper\nones, it can identify complex quantum structures and even quantum entanglement.\nThis is reminiscent of long-understood properties known in computer vision,\nwhich we now identify in a complex natural science task. 
Our approach could be\nuseful in a more interpretable way to develop new advanced AI-based scientific\ndiscovery techniques in quantum physics.\n","authors":["Tareq Jaouni","Sören Arlt","Carlos Ruiz-Gonzalez","Ebrahim Karimi","Xuemei Gu","Mario Krenn"],"pdf_url":"https://arxiv.org/pdf/2309.07056v1.pdf","comment":"10 pages, 6 figures. Comments welcome!"},{"id":"http://arxiv.org/abs/2304.11075v2","updated":"2023-09-13T16:12:56Z","published":"2023-04-20T14:42:54Z","title":"Spaiche: Extending State-of-the-Art ASR Models to Swiss German Dialects","summary":" Recent breakthroughs in NLP largely increased the presence of ASR systems in\nour daily lives. However, for many low-resource languages, ASR models still\nneed to be improved due in part to the difficulty of acquiring pertinent data.\nThis project aims to help advance research in ASR models for Swiss German\ndialects, by providing insights about the performance of state-of-the-art ASR\nmodels on recently published Swiss German speech datasets. We propose a novel\nloss that takes into account the semantic distance between the predicted and\nthe ground-truth labels. We outperform current state-of-the-art results by\nfine-tuning OpenAI's Whisper model on Swiss-German datasets.\n","authors":["Clement Sicard","Kajetan Pyszkowski","Victor Gillioz"],"pdf_url":"https://arxiv.org/pdf/2304.11075v2.pdf","comment":"8 pages, SwissText conference"},{"id":"http://arxiv.org/abs/2309.00997v2","updated":"2023-09-13T16:07:28Z","published":"2023-09-02T17:48:42Z","title":"Switch and Conquer: Efficient Algorithms By Switching Stochastic\n Gradient Oracles For Decentralized Saddle Point Problems","summary":" We consider a class of non-smooth strongly convex-strongly concave saddle\npoint problems in a decentralized setting without a central server. To solve a\nconsensus formulation of problems in this class, we develop an inexact primal\ndual hybrid gradient (inexact PDHG) procedure that allows generic gradient\ncomputation oracles to update the primal and dual variables. We first\ninvestigate the performance of inexact PDHG with stochastic variance reduction\ngradient (SVRG) oracle. Our numerical study uncovers a significant phenomenon\nof initial conservative progress of iterates of IPDHG with SVRG oracle. To\ntackle this, we develop a simple and effective switching idea, where a\ngeneralized stochastic gradient (GSG) computation oracle is employed to hasten\nthe iterates' progress to a saddle point solution during the initial phase of\nupdates, followed by a switch to the SVRG oracle at an appropriate juncture.\nThe proposed algorithm is named Decentralized Proximal Switching Stochastic\nGradient method with Compression (C-DPSSG), and is proven to converge to an\n$\\epsilon$-accurate saddle point solution with linear rate. Apart from\ndelivering highly accurate solutions, our study reveals that utilizing the best\nconvergence phases of GSG and SVRG oracles makes C-DPSSG well suited for\nobtaining solutions of low/medium accuracy faster, useful for certain\napplications. Numerical experiments on two benchmark machine learning\napplications show C-DPSSG's competitive performance which validate our\ntheoretical findings. The codes used in the experiments can be found\n\\href{https://github.com/chhavisharma123/C-DPSSG-CDC2023}{here}.\n","authors":["Chhavi Sharma","Vishnu Narayanan","P. 
Balamurugan"],"pdf_url":"https://arxiv.org/pdf/2309.00997v2.pdf","comment":"arXiv admin note: substantial text overlap with arXiv:2205.14452"},{"id":"http://arxiv.org/abs/2309.07049v1","updated":"2023-09-13T15:59:02Z","published":"2023-09-13T15:59:02Z","title":"An Extreme Learning Machine-Based Method for Computational PDEs in\n Higher Dimensions","summary":" We present two effective methods for solving high-dimensional partial\ndifferential equations (PDE) based on randomized neural networks. Motivated by\nthe universal approximation property of this type of networks, both methods\nextend the extreme learning machine (ELM) approach from low to high dimensions.\nWith the first method the unknown solution field in $d$ dimensions is\nrepresented by a randomized feed-forward neural network, in which the\nhidden-layer parameters are randomly assigned and fixed while the output-layer\nparameters are trained. The PDE and the boundary/initial conditions, as well as\nthe continuity conditions (for the local variant of the method), are enforced\non a set of random interior/boundary collocation points. The resultant linear\nor nonlinear algebraic system, through its least squares solution, provides the\ntrained values for the network parameters. With the second method the\nhigh-dimensional PDE problem is reformulated through a constrained expression\nbased on an Approximate variant of the Theory of Functional Connections\n(A-TFC), which avoids the exponential growth in the number of terms of TFC as\nthe dimension increases. The free field function in the A-TFC constrained\nexpression is represented by a randomized neural network and is trained by a\nprocedure analogous to the first method. We present ample numerical simulations\nfor a number of high-dimensional linear/nonlinear stationary/dynamic PDEs to\ndemonstrate their performance. These methods can produce accurate solutions to\nhigh-dimensional PDEs, in particular with their errors reaching levels not far\nfrom the machine accuracy for relatively lower dimensions. Compared with the\nphysics-informed neural network (PINN) method, the current method is both\ncost-effective and more accurate for high-dimensional PDEs.\n","authors":["Yiran Wang","Suchuan Dong"],"pdf_url":"https://arxiv.org/pdf/2309.07049v1.pdf","comment":"38 pages, 17 tables, 25 figures"},{"id":"http://arxiv.org/abs/2309.02422v2","updated":"2023-09-13T15:56:07Z","published":"2023-09-05T17:51:00Z","title":"Maximum Mean Discrepancy Meets Neural Networks: The\n Radon-Kolmogorov-Smirnov Test","summary":" Maximum mean discrepancy (MMD) refers to a general class of nonparametric\ntwo-sample tests that are based on maximizing the mean difference over samples\nfrom one distribution $P$ versus another $Q$, over all choices of data\ntransformations $f$ living in some function space $\\mathcal{F}$. Inspired by\nrecent work that connects what are known as functions of $\\textit{Radon bounded\nvariation}$ (RBV) and neural networks (Parhi and Nowak, 2021, 2023), we study\nthe MMD defined by taking $\\mathcal{F}$ to be the unit ball in the RBV space of\na given smoothness order $k \\geq 0$. This test, which we refer to as the\n$\\textit{Radon-Kolmogorov-Smirnov}$ (RKS) test, can be viewed as a\ngeneralization of the well-known and classical Kolmogorov-Smirnov (KS) test to\nmultiple dimensions and higher orders of smoothness. 
It is also intimately\nconnected to neural networks: we prove that the witness in the RKS test -- the\nfunction $f$ achieving the maximum mean difference -- is always a ridge spline\nof degree $k$, i.e., a single neuron in a neural network. This allows us to\nleverage the power of modern deep learning toolkits to (approximately) optimize\nthe criterion that underlies the RKS test. We prove that the RKS test has\nasymptotically full power at distinguishing any distinct pair $P \\not= Q$ of\ndistributions, derive its asymptotic null distribution, and carry out extensive\nexperiments to elucidate the strengths and weakenesses of the RKS test versus\nthe more traditional kernel MMD test.\n","authors":["Seunghoon Paik","Michael Celentano","Alden Green","Ryan J. Tibshirani"],"pdf_url":"https://arxiv.org/pdf/2309.02422v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.07030v1","updated":"2023-09-13T15:36:39Z","published":"2023-09-13T15:36:39Z","title":"Optimal transport distances for directed, weighted graphs: a case study\n with cell-cell communication networks","summary":" Comparing graphs of optimal transport has recently gained significant\nattention, as the distances induced by optimal transport provide both a\nprincipled metric between graphs as well as an interpretable description of the\nassociated changes between graphs in terms of a transport plan. As the lack of\nsymmetry introduces challenges in the typically considered formulations,\noptimal transport distances for graphs have mostly been developed for\nundirected graphs. Here, we propose two distance measures to compare directed\ngraphs based on variants of optimal transport: (i) an earth movers distance\n(Wasserstein) and (ii) a Gromov-Wasserstein (GW) distance. We evaluate these\ntwo distances and discuss their relative performance for both simulated graph\ndata and real-world directed cell-cell communication graphs, inferred from\nsingle-cell RNA-seq data.\n","authors":["James S. Nagai","Ivan G. Costa","Michael T. Schaub"],"pdf_url":"https://arxiv.org/pdf/2309.07030v1.pdf","comment":"5 pages, 1 figure"},{"id":"http://arxiv.org/abs/2212.01378v2","updated":"2023-09-13T15:07:01Z","published":"2022-12-02T18:59:04Z","title":"ColD Fusion: Collaborative Descent for Distributed Multitask Finetuning","summary":" We propose a new paradigm to continually evolve pretrained models, denoted\nColD Fusion. It provides the benefits of multitask learning but leverages\ndistributed computation with limited communication and eliminates the need for\nshared data. Consequentially, ColD Fusion can give rise to a synergistic loop,\nwhere finetuned models can be recycled to continually improve the pretrained\nmodel they are based upon. We show that ColD Fusion yields comparable benefits\nto multitask training by producing a model that (a) attains strong performance\non all of the datasets it was trained on; and (b) is a better starting point\nfor finetuning on unseen datasets. We show that ColD Fusion outperforms RoBERTa\nand even previous multitask models. 
Specifically, when training and testing on\n35 diverse datasets, ColD Fusion-based model outperforms RoBERTa by 2.33 points\non average without any changes to the architecture.\n","authors":["Shachar Don-Yehiya","Elad Venezian","Colin Raffel","Noam Slonim","Yoav Katz","Leshem Choshen"],"pdf_url":"https://arxiv.org/pdf/2212.01378v2.pdf","comment":"ACL 23"},{"id":"http://arxiv.org/abs/2305.06695v3","updated":"2023-09-13T14:39:32Z","published":"2023-05-11T10:04:27Z","title":"Deep Visual-Genetic Biometrics for Taxonomic Classification of Rare\n Species","summary":" Visual as well as genetic biometrics are routinely employed to identify\nspecies and individuals in biological applications. However, no attempts have\nbeen made in this domain to computationally enhance visual classification of\nrare classes with little image data via genetics. In this paper, we thus\npropose aligned visual-genetic inference spaces with the aim to implicitly\nencode cross-domain associations for improved performance. We demonstrate for\nthe first time that such alignment can be achieved via deep embedding models\nand that the approach is directly applicable to boosting long-tailed\nrecognition (LTR) particularly for rare species. We experimentally demonstrate\nthe efficacy of the concept via application to microscopic imagery of 30k+\nplanktic foraminifer shells across 32 species when used together with\nindependent genetic data samples. Most importantly for practitioners, we show\nthat visual-genetic alignment can significantly benefit visual-only recognition\nof the rarest species. Technically, we pre-train a visual ResNet50 deep\nlearning model using triplet loss formulations to create an initial embedding\nspace. We re-structure this space based on genetic anchors embedded via a\nSequence Graph Transform (SGT) and linked to visual data by cross-domain cosine\nalignment. We show that an LTR approach improves the state-of-the-art across\nall benchmarks and that adding our visual-genetic alignment improves per-class\nand particularly rare tail class benchmarks significantly further. We conclude\nthat visual-genetic alignment can be a highly effective tool for complementing\nvisual biological data containing rare classes. The concept proposed may serve\nas an important future tool for integrating genetics and imageomics towards a\nmore complete scientific representation of taxonomic spaces and life itself.\nCode, weights, and data splits are published for full reproducibility.\n","authors":["Tayfun Karaderi","Tilo Burghardt","Raphael Morard","Daniela Schmidt"],"pdf_url":"https://arxiv.org/pdf/2305.06695v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.06991v1","updated":"2023-09-13T14:36:26Z","published":"2023-09-13T14:36:26Z","title":"Unsupervised Contrast-Consistent Ranking with Language Models","summary":" Language models contain ranking-based knowledge and are powerful solvers of\nin-context ranking tasks. For instance, they may have parametric knowledge\nabout the ordering of countries by size or may be able to rank reviews by\nsentiment. Recent work focuses on pairwise, pointwise, and listwise prompting\ntechniques to elicit a language model's ranking knowledge. However, we find\nthat even with careful calibration and constrained decoding, prompting-based\ntechniques may not always be self-consistent in the rankings they produce. This\nmotivates us to explore an alternative approach that is inspired by an\nunsupervised probing method called Contrast-Consistent Search (CCS). 
The idea\nis to train a probing model guided by a logical constraint: a model's\nrepresentation of a statement and its negation must be mapped to contrastive\ntrue-false poles consistently across multiple statements. We hypothesize that\nsimilar constraints apply to ranking tasks where all items are related via\nconsistent pairwise or listwise comparisons. To this end, we extend the binary\nCCS method to Contrast-Consistent Ranking (CCR) by adapting existing ranking\nmethods such as the Max-Margin Loss, Triplet Loss, and Ordinal Regression\nobjective. Our results confirm that, for the same language model, CCR probing\noutperforms prompting and even performs on a par with prompting much larger\nlanguage models.\n","authors":["Niklas Stoehr","Pengxiang Cheng","Jing Wang","Daniel Preotiuc-Pietro","Rajarshi Bhowmik"],"pdf_url":"https://arxiv.org/pdf/2309.06991v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.06981v1","updated":"2023-09-13T14:15:54Z","published":"2023-09-13T14:15:54Z","title":"MASTERKEY: Practical Backdoor Attack Against Speaker Verification\n Systems","summary":" Speaker Verification (SV) is widely deployed in mobile systems to\nauthenticate legitimate users by using their voice traits. In this work, we\npropose a backdoor attack MASTERKEY, to compromise the SV models. Different\nfrom previous attacks, we focus on a real-world practical setting where the\nattacker possesses no knowledge of the intended victim. To design MASTERKEY, we\ninvestigate the limitation of existing poisoning attacks against unseen\ntargets. Then, we optimize a universal backdoor that is capable of attacking\narbitrary targets. Next, we embed the speaker's characteristics and semantics\ninformation into the backdoor, making it imperceptible. Finally, we estimate\nthe channel distortion and integrate it into the backdoor. We validate our\nattack on 6 popular SV models. Specifically, we poison a total of 53 models and\nuse our trigger to attack 16,430 enrolled speakers, composed of 310 target\nspeakers enrolled in 53 poisoned models. Our attack achieves 100% attack\nsuccess rate with a 15% poison rate. By decreasing the poison rate to 3%, the\nattack success rate remains around 50%. We validate our attack in 3 real-world\nscenarios and successfully demonstrate the attack through both over-the-air and\nover-the-telephony-line scenarios.\n","authors":["Hanqing Guo","Xun Chen","Junfeng Guo","Li Xiao","Qiben Yan"],"pdf_url":"https://arxiv.org/pdf/2309.06981v1.pdf","comment":"Accepted by Mobicom 2023"},{"id":"http://arxiv.org/abs/2309.06979v1","updated":"2023-09-13T14:15:03Z","published":"2023-09-13T14:15:03Z","title":"Auto-Regressive Next-Token Predictors are Universal Learners","summary":" Large language models display remarkable capabilities in logical and\nmathematical reasoning, allowing them to solve complex tasks. Interestingly,\nthese abilities emerge in networks trained on the simple task of next-token\nprediction. In this work, we present a theoretical framework for studying\nauto-regressive next-token predictors. We demonstrate that even simple models\nsuch as linear next-token predictors, trained on Chain-of-Thought (CoT) data,\ncan approximate any function efficiently computed by a Turing machine. We\nintroduce a new complexity measure -- length complexity -- which measures the\nnumber of intermediate tokens in a CoT sequence required to approximate some\ntarget function, and analyze the interplay between length complexity and other\nnotions of complexity. 
Finally, we show experimentally that simple next-token\npredictors, such as linear networks and shallow Multi-Layer Perceptrons (MLPs),\ndisplay non-trivial performance on text generation and arithmetic tasks. Our\nresults demonstrate that the power of language models can be attributed, to a\ngreat extent, to the auto-regressive next-token training scheme, and not\nnecessarily to a particular choice of architecture.\n","authors":["Eran Malach"],"pdf_url":"https://arxiv.org/pdf/2309.06979v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.06973v1","updated":"2023-09-13T14:05:50Z","published":"2023-09-13T14:05:50Z","title":"DNNShifter: An Efficient DNN Pruning System for Edge Computing","summary":" Deep neural networks (DNNs) underpin many machine learning applications.\nProduction quality DNN models achieve high inference accuracy by training\nmillions of DNN parameters which has a significant resource footprint. This\npresents a challenge for resources operating at the extreme edge of the\nnetwork, such as mobile and embedded devices that have limited computational\nand memory resources. To address this, models are pruned to create lightweight,\nmore suitable variants for these devices. Existing pruning methods are unable\nto provide similar quality models compared to their unpruned counterparts\nwithout significant time costs and overheads or are limited to offline use\ncases. Our work rapidly derives suitable model variants while maintaining the\naccuracy of the original model. The model variants can be swapped quickly when\nsystem and network conditions change to match workload demand. This paper\npresents DNNShifter, an end-to-end DNN training, spatial pruning, and model\nswitching system that addresses the challenges mentioned above. At the heart of\nDNNShifter is a novel methodology that prunes sparse models using structured\npruning. The pruned model variants generated by DNNShifter are smaller in size\nand thus faster than dense and sparse model predecessors, making them suitable\nfor inference at the edge while retaining near similar accuracy as of the\noriginal dense model. DNNShifter generates a portfolio of model variants that\ncan be swiftly interchanged depending on operational conditions. DNNShifter\nproduces pruned model variants up to 93x faster than conventional training\nmethods. Compared to sparse models, the pruned model variants are up to 5.14x\nsmaller and have a 1.67x inference latency speedup, with no compromise to\nsparse model accuracy. In addition, DNNShifter has up to 11.9x lower overhead\nfor switching models and up to 3.8x lower memory utilisation than existing\napproaches.\n","authors":["Bailey J. Eccles","Philip Rodgers","Peter Kilpatrick","Ivor Spence","Blesson Varghese"],"pdf_url":"https://arxiv.org/pdf/2309.06973v1.pdf","comment":"14 pages, 7 figures, 5 tables"},{"id":"http://arxiv.org/abs/2309.06969v1","updated":"2023-09-13T14:04:15Z","published":"2023-09-13T14:04:15Z","title":"Setting the Right Expectations: Algorithmic Recourse Over Time","summary":" Algorithmic systems are often called upon to assist in high-stakes decision\nmaking. In light of this, algorithmic recourse, the principle wherein\nindividuals should be able to take action against an undesirable outcome made\nby an algorithmic system, is receiving growing attention. The bulk of the\nliterature on algorithmic recourse to-date focuses primarily on how to provide\nrecourse to a single individual, overlooking a critical element: the effects of\na continuously changing context. 
Disregarding these effects on recourse is a\nsignificant oversight, since, in almost all cases, recourse consists of an\nindividual making a first, unfavorable attempt, and then being given an\nopportunity to make one or several attempts at a later date - when the context\nmight have changed. This can create false expectations, as initial recourse\nrecommendations may become less reliable over time due to model drift and\ncompetition for access to the favorable outcome between individuals.\n In this work we propose an agent-based simulation framework for studying the\neffects of a continuously changing environment on algorithmic recourse. In\nparticular, we identify two main effects that can alter the reliability of\nrecourse for individuals represented by the agents: (1) competition with other\nagents acting upon recourse, and (2) competition with new agents entering the\nenvironment. Our findings highlight that only a small set of specific\nparameterizations result in algorithmic recourse that is reliable for agents\nover time. Consequently, we argue that substantial additional work is needed to\nunderstand recourse reliability over time, and to develop recourse methods that\nreward agents' effort.\n","authors":["Joao Fonseca","Andrew Bell","Carlo Abrate","Francesco Bonchi","Julia Stoyanovich"],"pdf_url":"https://arxiv.org/pdf/2309.06969v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2211.03231v3","updated":"2023-09-13T14:00:55Z","published":"2022-11-06T22:38:13Z","title":"A Spectral Analysis of Graph Neural Networks on Dense and Sparse Graphs","summary":" In this work we propose a random graph model that can produce graphs at\ndifferent levels of sparsity. We analyze how sparsity affects the graph\nspectra, and thus the performance of graph neural networks (GNNs) in node\nclassification on dense and sparse graphs. We compare GNNs with spectral\nmethods known to provide consistent estimators for community detection on dense\ngraphs, a closely related task. We show that GNNs can outperform spectral\nmethods on sparse graphs, and illustrate these results with numerical examples\non both synthetic and real graphs.\n","authors":["Luana Ruiz","Ningyuan Huang","Soledad Villar"],"pdf_url":"https://arxiv.org/pdf/2211.03231v3.pdf","comment":"Extended version of ICASSP 2024 submission"},{"id":"http://arxiv.org/abs/2309.06956v1","updated":"2023-09-13T13:42:52Z","published":"2023-09-13T13:42:52Z","title":"Implicit Neural Multiple Description for DNA-based data storage","summary":" DNA exhibits remarkable potential as a data storage solution due to its\nimpressive storage density and long-term stability, stemming from its inherent\nbiomolecular structure. However, developing this novel medium comes with its\nown set of challenges, particularly in addressing errors arising from storage\nand biological manipulations. These challenges are further conditioned by the\nstructural constraints of DNA sequences and cost considerations. In response to\nthese limitations, we have pioneered a novel compression scheme and a\ncutting-edge Multiple Description Coding (MDC) technique utilizing neural\nnetworks for DNA data storage. Our MDC method introduces an innovative approach\nto encoding data into DNA, specifically designed to withstand errors\neffectively. Notably, our new compression scheme overperforms classic image\ncompression methods for DNA-data storage. Furthermore, our approach exhibits\nsuperiority over conventional MDC methods reliant on auto-encoders. 
Its\ndistinctive strengths lie in its ability to bypass the need for extensive model\ntraining and its enhanced adaptability for fine-tuning redundancy levels.\nExperimental results demonstrate that our solution competes favorably with the\nlatest DNA data storage methods in the field, offering superior compression\nrates and robust noise resilience.\n","authors":["Trung Hieu Le","Xavier Pic","Jeremy Mateos","Marc Antonini"],"pdf_url":"https://arxiv.org/pdf/2309.06956v1.pdf","comment":"Xavier Pic and Trung Hieu Le are both equal contributors and primary\n authors"},{"id":"http://arxiv.org/abs/2309.06943v1","updated":"2023-09-13T13:26:10Z","published":"2023-09-13T13:26:10Z","title":"Effect of hyperparameters on variable selection in random forests","summary":" Random forests (RFs) are well suited for prediction modeling and variable\nselection in high-dimensional omics studies. The effect of hyperparameters of\nthe RF algorithm on prediction performance and variable importance estimation\nhave previously been investigated. However, how hyperparameters impact RF-based\nvariable selection remains unclear. We evaluate the effects on the Vita and the\nBoruta variable selection procedures based on two simulation studies utilizing\ntheoretical distributions and empirical gene expression data. We assess the\nability of the procedures to select important variables (sensitivity) while\ncontrolling the false discovery rate (FDR). Our results show that the\nproportion of splitting candidate variables (mtry.prop) and the sample fraction\n(sample.fraction) for the training dataset influence the selection procedures\nmore than the drawing strategy of the training datasets and the minimal\nterminal node size. A suitable setting of the RF hyperparameters depends on the\ncorrelation structure in the data. For weakly correlated predictor variables,\nthe default value of mtry is optimal, but smaller values of sample.fraction\nresult in larger sensitivity. In contrast, the difference in sensitivity of the\noptimal compared to the default value of sample.fraction is negligible for\nstrongly correlated predictor variables, whereas smaller values than the\ndefault are better in the other settings. In conclusion, the default values of\nthe hyperparameters will not always be suitable for identifying important\nvariables. Thus, adequate values differ depending on whether the aim of the\nstudy is optimizing prediction performance or variable selection.\n","authors":["Cesaire J. K. Fouodo","Lea L. Kronziel","Inke R. König","Silke Szymczak"],"pdf_url":"https://arxiv.org/pdf/2309.06943v1.pdf","comment":"18 pages, 2 figures + 2 figures in appendix, 3 tables"},{"id":"http://arxiv.org/abs/2309.06938v1","updated":"2023-09-13T13:20:17Z","published":"2023-09-13T13:20:17Z","title":"Collectionless Artificial Intelligence","summary":" By and large, the professional handling of huge data collections is regarded\nas a fundamental ingredient of the progress of machine learning and of its\nspectacular results in related disciplines, with a growing agreement on risks\nconnected to the centralization of such data collections. This paper sustains\nthe position that the time has come for thinking of new learning protocols\nwhere machines conquer cognitive skills in a truly human-like context centered\non environmental interactions. 
This comes with specific restrictions on the\nlearning protocol according to the collectionless principle, which states that,\nat each time instant, data acquired from the environment is processed with the\npurpose of contributing to update the current internal representation of the\nenvironment, and that the agent is not given the privilege of recording the\ntemporal stream. Basically, there is neither permission to store the temporal\ninformation coming from the sensors, thus promoting the development of\nself-organized memorization skills at a more abstract level, instead of relying\non bare storage to simulate learning dynamics that are typical of offline\nlearning algorithms. This purposely extreme position is intended to stimulate\nthe development of machines that learn to dynamically organize the information\nby following human-based schemes. The proposition of this challenge suggests\ndeveloping new foundations on computational processes of learning and reasoning\nthat might open the doors to a truly orthogonal competitive track on AI\ntechnologies that avoid data accumulation by design, thus offering a framework\nwhich is better suited concerning privacy issues, control and customizability.\nFinally, pushing towards massively distributed computation, the collectionless\napproach to AI will likely reduce the concentration of power in companies and\ngovernments, thus better facing geopolitical issues.\n","authors":["Marco Gori","Stefano Melacci"],"pdf_url":"https://arxiv.org/pdf/2309.06938v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.06930v1","updated":"2023-09-13T13:03:44Z","published":"2023-09-13T13:03:44Z","title":"Modeling Dislocation Dynamics Data Using Semantic Web Technologies","summary":" Research in the field of Materials Science and Engineering focuses on the\ndesign, synthesis, properties, and performance of materials. An important class\nof materials that is widely investigated are crystalline materials, including\nmetals and semiconductors. Crystalline material typically contains a distinct\ntype of defect called \"dislocation\". This defect significantly affects various\nmaterial properties, including strength, fracture toughness, and ductility.\nResearchers have devoted a significant effort in recent years to understanding\ndislocation behavior through experimental characterization techniques and\nsimulations, e.g., dislocation dynamics simulations. This paper presents how\ndata from dislocation dynamics simulations can be modeled using semantic web\ntechnologies through annotating data with ontologies. We extend the already\nexisting Dislocation Ontology by adding missing concepts and aligning it with\ntwo other domain-related ontologies (i.e., the Elementary Multi-perspective\nMaterial Ontology and the Materials Design Ontology) allowing for representing\nthe dislocation simulation data efficiently. Moreover, we show a real-world use\ncase by representing the discrete dislocation dynamics data as a knowledge\ngraph (DisLocKG) that illustrates the relationship between them. 
We also\ndeveloped a SPARQL endpoint that brings extensive flexibility to query\nDisLocKG.\n","authors":["Ahmad Zainul Ihsan","Said Fathalla","Stefan Sandfeld"],"pdf_url":"https://arxiv.org/pdf/2309.06930v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.06921v1","updated":"2023-09-13T12:41:45Z","published":"2023-09-13T12:41:45Z","title":"Investigating the Impact of Action Representations in Policy Gradient\n Algorithms","summary":" Reinforcement learning~(RL) is a versatile framework for learning to solve\ncomplex real-world tasks. However, influences on the learning performance of RL\nalgorithms are often poorly understood in practice. We discuss different\nanalysis techniques and assess their effectiveness for investigating the impact\nof action representations in RL. Our experiments demonstrate that the action\nrepresentation can significantly influence the learning performance on popular\nRL benchmark tasks. The analysis results indicate that some of the performance\ndifferences can be attributed to changes in the complexity of the optimization\nlandscape. Finally, we discuss open challenges of analysis techniques for RL\nalgorithms.\n","authors":["Jan Schneider","Pierre Schumacher","Daniel Häufle","Bernhard Schölkopf","Dieter Büchler"],"pdf_url":"https://arxiv.org/pdf/2309.06921v1.pdf","comment":"Published at the Workshop on effective Representations, Abstractions,\n and Priors for Robot Learning (RAP4Robots) at ICRA 2023"},{"id":"http://arxiv.org/abs/2309.06917v1","updated":"2023-09-13T12:30:03Z","published":"2023-09-13T12:30:03Z","title":"Continual Learning with Dirichlet Generative-based Rehearsal","summary":" Recent advancements in data-driven task-oriented dialogue systems (ToDs)\nstruggle with incremental learning due to computational constraints and\ntime-consuming issues. Continual Learning (CL) attempts to solve this by\navoiding intensive pre-training, but it faces the problem of catastrophic\nforgetting (CF). While generative-based rehearsal CL methods have made\nsignificant strides, generating pseudo samples that accurately reflect the\nunderlying task-specific distribution is still a challenge. In this paper, we\npresent Dirichlet Continual Learning (DCL), a novel generative-based rehearsal\nstrategy for CL. Unlike the traditionally used Gaussian latent variable in the\nConditional Variational Autoencoder (CVAE), DCL leverages the flexibility and\nversatility of the Dirichlet distribution to model the latent prior variable.\nThis enables it to efficiently capture sentence-level features of previous\ntasks and effectively guide the generation of pseudo samples. In addition, we\nintroduce Jensen-Shannon Knowledge Distillation (JSKD), a robust logit-based\nknowledge distillation method that enhances knowledge transfer during pseudo\nsample generation. Our experiments confirm the efficacy of our approach in both\nintent detection and slot-filling tasks, outperforming state-of-the-art\nmethods.\n","authors":["Min Zeng","Wei Xue","Qifeng Liu","Yike Guo"],"pdf_url":"https://arxiv.org/pdf/2309.06917v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.04339v2","updated":"2023-09-13T12:22:11Z","published":"2023-09-08T14:08:19Z","title":"Online Submodular Maximization via Online Convex Optimization","summary":" We study monotone submodular maximization under general matroid constraints\nin the online setting. 
We prove that online optimization of a large class of\nsubmodular functions, namely, weighted threshold potential functions, reduces\nto online convex optimization (OCO). This is precisely because functions in\nthis class admit a concave relaxation; as a result, OCO policies, coupled with\nan appropriate rounding scheme, can be used to achieve sublinear regret in the\ncombinatorial setting. We show that our reduction extends to many different\nversions of the online learning problem, including the dynamic regret, bandit,\nand optimistic-learning settings.\n","authors":["Tareq Si-Salem","Gözde Özcan","Iasonas Nikolaou","Evimaria Terzi","Stratis Ioannidis"],"pdf_url":"https://arxiv.org/pdf/2309.04339v2.pdf","comment":"Under review"},{"id":"http://arxiv.org/abs/2106.16239v7","updated":"2023-09-13T12:12:58Z","published":"2021-06-30T17:49:55Z","title":"Fixed points of nonnegative neural networks","summary":" We use fixed point theory to analyze nonnegative neural networks, which we\ndefine as neural networks that map nonnegative vectors to nonnegative vectors.\nWe first show that nonnegative neural networks with nonnegative weights and\nbiases can be recognized as monotonic and (weakly) scalable functions within\nthe framework of nonlinear Perron-Frobenius theory. This fact enables us to\nprovide conditions for the existence of fixed points of nonnegative neural\nnetworks having inputs and outputs of the same dimension, and these conditions\nare weaker than those recently obtained using arguments in convex analysis.\nFurthermore, we prove that the shape of the fixed point set of nonnegative\nneural networks with nonnegative weights and biases is an interval, which under\nmild conditions degenerates to a point. These results are then used to obtain\nthe existence of fixed points of more general nonnegative neural networks. From\na practical perspective, our results contribute to the understanding of the\nbehavior of autoencoders, and the main theoretical results are verified in\nnumerical simulations using the Modified National Institute of Standards and\nTechnology (MNIST) dataset.\n","authors":["Tomasz J. Piotrowski","Renato L. G. Cavalcante","Mateusz Gabor"],"pdf_url":"https://arxiv.org/pdf/2106.16239v7.pdf","comment":"36 pages"},{"id":"http://arxiv.org/abs/2309.05077v2","updated":"2023-09-13T12:12:50Z","published":"2023-09-10T16:55:59Z","title":"Generalization error bounds for iterative learning algorithms with\n bounded updates","summary":" This paper explores the generalization characteristics of iterative learning\nalgorithms with bounded updates for non-convex loss functions, employing\ninformation-theoretic techniques. Our key contribution is a novel bound for the\ngeneralization error of these algorithms with bounded updates, extending beyond\nthe scope of previous works that only focused on Stochastic Gradient Descent\n(SGD). Our approach introduces two main novelties: 1) we reformulate the mutual\ninformation as the uncertainty of updates, providing a new perspective, and 2)\ninstead of using the chaining rule of mutual information, we employ a variance\ndecomposition technique to decompose information across iterations, allowing\nfor a simpler surrogate process. We analyze our generalization bound under\nvarious settings and demonstrate improved bounds when the model dimension\nincreases at the same rate as the number of training data samples. To bridge\nthe gap between theory and practice, we also examine the previously observed\nscaling behavior in large language models. 
Ultimately, our work takes a further\nstep for developing practical generalization theories.\n","authors":["Jingwen Fu","Nanning Zheng"],"pdf_url":"https://arxiv.org/pdf/2309.05077v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.06908v1","updated":"2023-09-13T12:10:54Z","published":"2023-09-13T12:10:54Z","title":"Towards the TopMost: A Topic Modeling System Toolkit","summary":" Topic models have been proposed for decades with various applications and\nrecently refreshed by the neural variational inference. However, these topic\nmodels adopt totally distinct dataset, implementation, and evaluation settings,\nwhich hinders their quick utilization and fair comparisons. This greatly\nhinders the research progress of topic models. To address these issues, in this\npaper we propose a Topic Modeling System Toolkit (TopMost). Compared to\nexisting toolkits, TopMost stands out by covering a wider range of topic\nmodeling scenarios including complete lifecycles with dataset pre-processing,\nmodel training, testing, and evaluations. The highly cohesive and decoupled\nmodular design of TopMost enables quick utilization, fair comparisons, and\nflexible extensions of different topic models. This can facilitate the research\nand applications of topic models. Our code, tutorials, and documentation are\navailable at https://github.com/bobxwu/topmost.\n","authors":["Xiaobao Wu","Fengjun Pan","Anh Tuan Luu"],"pdf_url":"https://arxiv.org/pdf/2309.06908v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.12510v4","updated":"2023-09-13T11:49:30Z","published":"2023-07-24T03:52:11Z","title":"An Empirical Evaluation of Temporal Graph Benchmark","summary":" In this paper, we conduct an empirical evaluation of Temporal Graph Benchmark\n(TGB) by extending our Dynamic Graph Library (DyGLib) to TGB. Compared with\nTGB, we include eleven popular dynamic graph learning methods for more\nexhaustive comparisons. Through the experiments, we find that (1) different\nmodels depict varying performance across various datasets, which is in line\nwith previous observations; (2) the performance of some baselines can be\nsignificantly improved over the reported results in TGB when using DyGLib. This\nwork aims to ease the researchers' efforts in evaluating various dynamic graph\nlearning methods on TGB and attempts to offer results that can be directly\nreferenced in the follow-up research. All the used resources in this project\nare publicly available at https://github.com/yule-BUAA/DyGLib_TGB. This work is\nin progress, and feedback from the community is welcomed for improvements.\n","authors":["Le Yu"],"pdf_url":"https://arxiv.org/pdf/2307.12510v4.pdf","comment":"in progress, more results are added"},{"id":"http://arxiv.org/abs/2305.12143v2","updated":"2023-09-13T11:49:29Z","published":"2023-05-20T09:01:33Z","title":"Learning Horn Envelopes via Queries from Large Language Models","summary":" We investigate an approach for extracting knowledge from trained neural\nnetworks based on Angluin's exact learning model with membership and\nequivalence queries to an oracle. In this approach, the oracle is a trained\nneural network. We consider Angluin's classical algorithm for learning Horn\ntheories and study the necessary changes to make it applicable to learn from\nneural networks. In particular, we have to consider that trained neural\nnetworks may not behave as Horn oracles, meaning that their underlying target\ntheory may not be Horn. 
We propose a new algorithm that aims at extracting the\n\"tightest Horn approximation\" of the target theory and that is guaranteed to\nterminate in exponential time (in the worst case) and in polynomial time if the\ntarget has polynomially many non-Horn examples. To showcase the applicability\nof the approach, we perform experiments on pre-trained language models and\nextract rules that expose occupation-based gender biases.\n","authors":["Sophie Blum","Raoul Koudijs","Ana Ozaki","Samia Touileb"],"pdf_url":"https://arxiv.org/pdf/2305.12143v2.pdf","comment":"35 pages, 2 figures; manuscript accepted for publication in the\n International Journal of Approximate Reasoning (IJAR)"},{"id":"http://arxiv.org/abs/2309.06896v1","updated":"2023-09-13T11:45:21Z","published":"2023-09-13T11:45:21Z","title":"Domain-Aware Augmentations for Unsupervised Online General Continual\n Learning","summary":" Continual Learning has been challenging, especially when dealing with\nunsupervised scenarios such as Unsupervised Online General Continual Learning\n(UOGCL), where the learning agent has no prior knowledge of class boundaries or\ntask change information. While previous research has focused on reducing\nforgetting in supervised setups, recent studies have shown that self-supervised\nlearners are more resilient to forgetting. This paper proposes a novel approach\nthat enhances memory usage for contrastive learning in UOGCL by defining and\nusing stream-dependent data augmentations together with some implementation\ntricks. Our proposed method is simple yet effective, achieves state-of-the-art\nresults compared to other unsupervised approaches in all considered setups, and\nreduces the gap between supervised and unsupervised continual learning. Our\ndomain-aware augmentation procedure can be adapted to other replay-based\nmethods, making it a promising strategy for continual learning.\n","authors":["Nicolas Michel","Romain Negrel","Giovanni Chierchia","Jean-François Bercher"],"pdf_url":"https://arxiv.org/pdf/2309.06896v1.pdf","comment":"Accepted to BMVC'23"},{"id":"http://arxiv.org/abs/2309.06895v1","updated":"2023-09-13T11:37:04Z","published":"2023-09-13T11:37:04Z","title":"MagiCapture: High-Resolution Multi-Concept Portrait Customization","summary":" Large-scale text-to-image models including Stable Diffusion are capable of\ngenerating high-fidelity photorealistic portrait images. There is an active\nresearch area dedicated to personalizing these models, aiming to synthesize\nspecific subjects or styles using provided sets of reference images. However,\ndespite the plausible results from these personalization methods, they tend to\nproduce images that often fall short of realism and are not yet on a\ncommercially viable level. This is particularly noticeable in portrait image\ngeneration, where any unnatural artifact in human faces is easily discernible\ndue to our inherent human bias. To address this, we introduce MagiCapture, a\npersonalization method for integrating subject and style concepts to generate\nhigh-resolution portrait images using just a few subject and style references.\nFor instance, given a handful of random selfies, our fine-tuned model can\ngenerate high-quality portrait images in specific styles, such as passport or\nprofile photos. The main challenge with this task is the absence of ground\ntruth for the composed concepts, leading to a reduction in the quality of the\nfinal output and an identity shift of the source subject. 
To address these\nissues, we present a novel Attention Refocusing loss coupled with auxiliary\npriors, both of which facilitate robust learning within this weakly supervised\nlearning setting. Our pipeline also includes additional post-processing steps\nto ensure the creation of highly realistic outputs. MagiCapture outperforms\nother baselines in both quantitative and qualitative evaluations and can also\nbe generalized to other non-human objects.\n","authors":["Junha Hyung","Jaeyo Shin","Jaegul Choo"],"pdf_url":"https://arxiv.org/pdf/2309.06895v1.pdf","comment":"8 pages, 7 figures"},{"id":"http://arxiv.org/abs/2309.06891v1","updated":"2023-09-13T11:28:27Z","published":"2023-09-13T11:28:27Z","title":"Keep It SimPool: Who Said Supervised Transformers Suffer from Attention\n Deficit?","summary":" Convolutional networks and vision transformers have different forms of\npairwise interactions, pooling across layers and pooling at the end of the\nnetwork. Does the latter really need to be different? As a by-product of\npooling, vision transformers provide spatial attention for free, but this is\nmost often of low quality unless self-supervised, which is not well studied. Is\nsupervision really the problem?\n In this work, we develop a generic pooling framework and then we formulate a\nnumber of existing methods as instantiations. By discussing the properties of\neach group of methods, we derive SimPool, a simple attention-based pooling\nmechanism as a replacement of the default one for both convolutional and\ntransformer encoders. We find that, whether supervised or self-supervised, this\nimproves performance on pre-training and downstream tasks and provides\nattention maps delineating object boundaries in all cases. One could thus call\nSimPool universal. To our knowledge, we are the first to obtain attention maps\nin supervised transformers of at least as good quality as self-supervised,\nwithout explicit losses or modifying the architecture. Code at:\nhttps://github.com/billpsomas/simpool.\n","authors":["Bill Psomas","Ioannis Kakogeorgiou","Konstantinos Karantzalos","Yannis Avrithis"],"pdf_url":"https://arxiv.org/pdf/2309.06891v1.pdf","comment":"ICCV 2023. Code and models: https://github.com/billpsomas/simpool"},{"id":"http://arxiv.org/abs/2309.06882v1","updated":"2023-09-13T11:16:52Z","published":"2023-09-13T11:16:52Z","title":"ProMap: Datasets for Product Mapping in E-commerce","summary":" The goal of product mapping is to decide, whether two listings from two\ndifferent e-shops describe the same products. Existing datasets of matching and\nnon-matching pairs of products, however, often suffer from incomplete product\ninformation or contain only very distant non-matching products. Therefore,\nwhile predictive models trained on these datasets achieve good results on them,\nin practice, they are unusable as they cannot distinguish very similar but\nnon-matching pairs of products. This paper introduces two new datasets for\nproduct mapping: ProMapCz consisting of 1,495 Czech product pairs and ProMapEn\nconsisting of 1,555 English product pairs of matching and non-matching products\nmanually scraped from two pairs of e-shops. The datasets contain both images\nand textual descriptions of the products, including their specifications,\nmaking them one of the most complete datasets for product mapping.\nAdditionally, the non-matching products were selected in two phases, creating\ntwo types of non-matches -- close non-matches and medium non-matches. 
Even the\nmedium non-matches are pairs of products that are much more similar than\nnon-matches in other datasets -- for example, they still need to have the same\nbrand and similar name and price. After simple data preprocessing, several\nmachine learning algorithms were trained on these and two the other datasets to\ndemonstrate the complexity and completeness of ProMap datasets. ProMap datasets\nare presented as a golden standard for further research of product mapping\nfilling the gaps in existing ones.\n","authors":["Kateřina Macková","Martin Pilát"],"pdf_url":"https://arxiv.org/pdf/2309.06882v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2305.06908v2","updated":"2023-09-13T10:50:02Z","published":"2023-05-11T15:51:46Z","title":"CoMoSpeech: One-Step Speech and Singing Voice Synthesis via Consistency\n Model","summary":" Denoising diffusion probabilistic models (DDPMs) have shown promising\nperformance for speech synthesis. However, a large number of iterative steps\nare required to achieve high sample quality, which restricts the inference\nspeed. Maintaining sample quality while increasing sampling speed has become a\nchallenging task. In this paper, we propose a \"Co\"nsistency \"Mo\"del-based\n\"Speech\" synthesis method, CoMoSpeech, which achieve speech synthesis through a\nsingle diffusion sampling step while achieving high audio quality. The\nconsistency constraint is applied to distill a consistency model from a\nwell-designed diffusion-based teacher model, which ultimately yields superior\nperformances in the distilled CoMoSpeech. Our experiments show that by\ngenerating audio recordings by a single sampling step, the CoMoSpeech achieves\nan inference speed more than 150 times faster than real-time on a single NVIDIA\nA100 GPU, which is comparable to FastSpeech2, making diffusion-sampling based\nspeech synthesis truly practical. Meanwhile, objective and subjective\nevaluations on text-to-speech and singing voice synthesis show that the\nproposed teacher models yield the best audio quality, and the one-step sampling\nbased CoMoSpeech achieves the best inference speed with better or comparable\naudio quality to other conventional multi-step diffusion model baselines. Audio\nsamples are available at https://comospeech.github.io/.\n","authors":["Zhen Ye","Wei Xue","Xu Tan","Jie Chen","Qifeng Liu","Yike Guo"],"pdf_url":"https://arxiv.org/pdf/2305.06908v2.pdf","comment":"Accepted to ACM MM 2023"},{"id":"http://arxiv.org/abs/2309.03581v2","updated":"2023-09-13T10:39:24Z","published":"2023-09-07T09:22:05Z","title":"Interactive Hyperparameter Optimization in Multi-Objective Problems via\n Preference Learning","summary":" Hyperparameter optimization (HPO) is important to leverage the full potential\nof machine learning (ML). In practice, users are often interested in\nmulti-objective (MO) problems, i.e., optimizing potentially conflicting\nobjectives, like accuracy and energy consumption. To tackle this, the vast\nmajority of MO-ML algorithms return a Pareto front of non-dominated machine\nlearning models to the user. Optimizing the hyperparameters of such algorithms\nis non-trivial as evaluating a hyperparameter configuration entails evaluating\nthe quality of the resulting Pareto front. In literature, there are known\nindicators that assess the quality of a Pareto front (e.g., hypervolume, R2) by\nquantifying different properties (e.g., volume, proximity to a reference\npoint). However, choosing the indicator that leads to the desired Pareto front\nmight be a hard task for a user. 
In this paper, we propose a human-centered\ninteractive HPO approach tailored towards multi-objective ML leveraging\npreference learning to extract desiderata from users that guide the\noptimization. Instead of relying on the user guessing the most suitable\nindicator for their needs, our approach automatically learns an appropriate\nindicator. Concretely, we leverage pairwise comparisons of distinct Pareto\nfronts to learn such an appropriate quality indicator. Then, we optimize the\nhyperparameters of the underlying MO-ML algorithm towards this learned\nindicator using a state-of-the-art HPO approach. In an experimental study\ntargeting the environmental impact of ML, we demonstrate that our approach\nleads to substantially better Pareto fronts compared to optimizing based on a\nwrong indicator pre-selected by the user, and performs comparable in the case\nof an advanced user knowing which indicator to pick.\n","authors":["Joseph Giovanelli","Alexander Tornede","Tanja Tornede","Marius Lindauer"],"pdf_url":"https://arxiv.org/pdf/2309.03581v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.05282v2","updated":"2023-09-13T10:38:28Z","published":"2023-09-11T07:37:10Z","title":"Can you text what is happening? Integrating pre-trained language\n encoders into trajectory prediction models for autonomous driving","summary":" In autonomous driving tasks, scene understanding is the first step towards\npredicting the future behavior of the surrounding traffic participants. Yet,\nhow to represent a given scene and extract its features are still open research\nquestions. In this study, we propose a novel text-based representation of\ntraffic scenes and process it with a pre-trained language encoder.\n First, we show that text-based representations, combined with classical\nrasterized image representations, lead to descriptive scene embeddings. Second,\nwe benchmark our predictions on the nuScenes dataset and show significant\nimprovements compared to baselines. Third, we show in an ablation study that a\njoint encoder of text and rasterized images outperforms the individual encoders\nconfirming that both representations have their complementary strengths.\n","authors":["Ali Keysan","Andreas Look","Eitan Kosman","Gonca Gürsun","Jörg Wagner","Yu Yao","Barbara Rakitsch"],"pdf_url":"https://arxiv.org/pdf/2309.05282v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.06869v1","updated":"2023-09-13T10:26:08Z","published":"2023-09-13T10:26:08Z","title":"Dynamic control of self-assembly of quasicrystalline structures through\n reinforcement learning","summary":" We propose reinforcement learning to control the dynamical self-assembly of\nthe dodecagonal quasicrystal (DDQC) from patchy particles. The patchy particles\nhave anisotropic interactions with other particles and form DDQC. However,\ntheir structures at steady states are significantly influenced by the kinetic\npathways of their structural formation. We estimate the best policy of\ntemperature control trained by the Q-learning method and demonstrate that we\ncan generate DDQC with few defects using the estimated policy. The temperature\nschedule obtained by reinforcement learning can reproduce the desired structure\nmore efficiently than the conventional pre-fixed temperature schedule, such as\nannealing. To clarify the success of the learning, we also analyse a simple\nmodel describing the kinetics of structural changes through the motion in a\ntriple-well potential. 
We have found that reinforcement learning autonomously\ndiscovers the critical temperature at which structural fluctuations enhance the\nchance of forming a globally stable state. The estimated policy guides the\nsystem toward the critical temperature to assist the formation of DDQC.\n","authors":["Uyen Tu Lieu","Natsuhiko Yoshinaga"],"pdf_url":"https://arxiv.org/pdf/2309.06869v1.pdf","comment":"10 pages, 11 figures"},{"id":"http://arxiv.org/abs/2306.17100v2","updated":"2023-09-13T10:12:09Z","published":"2023-06-29T16:57:22Z","title":"RL4CO: an Extensive Reinforcement Learning for Combinatorial\n Optimization Benchmark","summary":" We introduce RL4CO, an extensive reinforcement learning (RL) for\ncombinatorial optimization (CO) benchmark. RL4CO employs state-of-the-art\nsoftware libraries as well as best practices in implementation, such as\nmodularity and configuration management, to be efficient and easily modifiable\nby researchers for adaptations of neural network architecture, environments,\nand RL algorithms. Contrary to the existing focus on specific tasks like the\ntraveling salesman problem (TSP) for performance assessment, we underline the\nimportance of scalability and generalization capabilities for diverse CO tasks.\nWe also systematically benchmark zero-shot generalization, sample efficiency,\nand adaptability to changes in data distributions of various models. Our\nexperiments show that some recent SOTA methods fall behind their predecessors\nwhen evaluated using these metrics, suggesting the necessity for a more\nbalanced view of the performance of neural CO (NCO) solvers. We hope RL4CO will\nencourage the exploration of novel solutions to complex real-world tasks,\nallowing the NCO community to compare with existing methods through a\nstandardized interface that decouples the science from software engineering. We\nmake our library publicly available at https://github.com/kaist-silab/rl4co.\n","authors":["Federico Berto","Chuanbo Hua","Junyoung Park","Minsu Kim","Hyeonah Kim","Jiwoo Son","Haeyeon Kim","Joungho Kim","Jinkyoo Park"],"pdf_url":"https://arxiv.org/pdf/2306.17100v2.pdf","comment":"Added several improvements to the writing; add search methods; new\n results"},{"id":"http://arxiv.org/abs/2006.04178v2","updated":"2023-09-13T09:57:18Z","published":"2020-06-07T15:12:25Z","title":"Neural Vortex Method: from Finite Lagrangian Particles to Infinite\n Dimensional Eulerian Dynamics","summary":" In the field of fluid numerical analysis, there has been a long-standing\nproblem: lacking of a rigorous mathematical tool to map from a continuous flow\nfield to discrete vortex particles, hurdling the Lagrangian particles from\ninheriting the high resolution of a large-scale Eulerian solver. To tackle this\nchallenge, we propose a novel learning-based framework, the Neural Vortex\nMethod (NVM), which builds a neural-network description of the Lagrangian\nvortex structures and their interaction dynamics to reconstruct the\nhigh-resolution Eulerian flow field in a physically-precise manner. The key\ncomponents of our infrastructure consist of two networks: a vortex\nrepresentation network to identify the Lagrangian vortices from a grid-based\nvelocity field and a vortex interaction network to learn the underlying\ngoverning dynamics of these finite structures. 
By embedding these two networks\nwith a vorticity-to-velocity Poisson solver and training its parameters using\nthe high-fidelity data obtained from high-resolution direct numerical\nsimulation, we can predict the accurate fluid dynamics on a precision level\nthat was infeasible for all the previous conventional vortex methods (CVMs). To\nthe best of our knowledge, our method is the first approach that can utilize\nmotions of finite particles to learn infinite dimensional dynamic systems. We\ndemonstrate the efficacy of our method in generating highly accurate prediction\nresults, with low computational cost, of the leapfrogging vortex rings system,\nthe turbulence system, and the systems governed by Euler equations with\ndifferent external forces.\n","authors":["Shiying Xiong","Xingzhe He","Yunjin Tong","Yitong Deng","Bo Zhu"],"pdf_url":"https://arxiv.org/pdf/2006.04178v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.06838v1","updated":"2023-09-13T09:39:42Z","published":"2023-09-13T09:39:42Z","title":"Supervised Machine Learning and Physics based Machine Learning approach\n for prediction of peak temperature distribution in Additive Friction Stir\n Deposition of Aluminium Alloy","summary":" Additive friction stir deposition (AFSD) is a novel solid-state additive\nmanufacturing technique that circumvents issues of porosity, cracking, and\nproperties anisotropy that plague traditional powder bed fusion and directed\nenergy deposition approaches. However, correlations between process parameters,\nthermal profiles, and resulting microstructure in AFSD remain poorly\nunderstood. This hinders process optimization for properties. This work employs\na cutting-edge framework combining supervised machine learning (SML) and\nphysics-informed neural networks (PINNs) to predict peak temperature\ndistribution in AFSD from process parameters. Eight regression algorithms were\nimplemented for SML modeling, while four PINNs leveraged governing equations\nfor transport, wave propagation, heat transfer, and quantum mechanics. Across\nmultiple statistical measures, ensemble techniques like gradient boosting\nproved superior for SML, with lowest MSE of 165.78. The integrated ML approach\nwas also applied to classify deposition quality from process factors, with\nlogistic regression delivering robust accuracy. By fusing data-driven learning\nand fundamental physics, this dual methodology provides comprehensive insights\ninto tailoring microstructure through thermal management in AFSD. The work\ndemonstrates the power of bridging statistical and physics-based modeling for\nelucidating AM process-property relationships.\n","authors":["Akshansh Mishra"],"pdf_url":"https://arxiv.org/pdf/2309.06838v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.06835v1","updated":"2023-09-13T09:34:21Z","published":"2023-09-13T09:34:21Z","title":"Safe Reinforcement Learning with Dual Robustness","summary":" Reinforcement learning (RL) agents are vulnerable to adversarial\ndisturbances, which can deteriorate task performance or compromise safety\nspecifications. Existing methods either address safety requirements under the\nassumption of no adversary (e.g., safe RL) or only focus on robustness against\nperformance adversaries (e.g., robust RL). Learning one policy that is both\nsafe and robust remains a challenging open problem. 
The difficulty is how to\ntackle two intertwined aspects in the worst cases: feasibility and optimality.\nOptimality is only valid inside a feasible region, while identification of\nmaximal feasible region must rely on learning the optimal policy. To address\nthis issue, we propose a systematic framework to unify safe RL and robust RL,\nincluding problem formulation, iteration scheme, convergence analysis and\npractical algorithm design. This unification is built upon constrained\ntwo-player zero-sum Markov games. A dual policy iteration scheme is proposed,\nwhich simultaneously optimizes a task policy and a safety policy. The\nconvergence of this iteration scheme is proved. Furthermore, we design a deep\nRL algorithm for practical implementation, called dually robust actor-critic\n(DRAC). The evaluations with safety-critical benchmarks demonstrate that DRAC\nachieves high performance and persistent safety under all scenarios (no\nadversary, safety adversary, performance adversary), outperforming all\nbaselines significantly.\n","authors":["Zeyang Li","Chuxiong Hu","Yunan Wang","Yujie Yang","Shengbo Eben Li"],"pdf_url":"https://arxiv.org/pdf/2309.06835v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.06828v1","updated":"2023-09-13T09:22:49Z","published":"2023-09-13T09:22:49Z","title":"UniBrain: Universal Brain MRI Diagnosis with Hierarchical\n Knowledge-enhanced Pre-training","summary":" Magnetic resonance imaging~(MRI) have played a crucial role in brain disease\ndiagnosis, with which a range of computer-aided artificial intelligence methods\nhave been proposed. However, the early explorations usually focus on the\nlimited types of brain diseases in one study and train the model on the data in\na small scale, yielding the bottleneck of generalization. Towards a more\neffective and scalable paradigm, we propose a hierarchical knowledge-enhanced\npre-training framework for the universal brain MRI diagnosis, termed as\nUniBrain. Specifically, UniBrain leverages a large-scale dataset of 24,770\nimaging-report pairs from routine diagnostics. Different from previous\npre-training techniques for the unitary vision or textual feature, or with the\nbrute-force alignment between vision and language information, we leverage the\nunique characteristic of report information in different granularity to build a\nhierarchical alignment mechanism, which strengthens the efficiency in feature\nlearning. Our UniBrain is validated on three real world datasets with severe\nclass imbalance and the public BraTS2019 dataset. It not only consistently\noutperforms all state-of-the-art diagnostic methods by a large margin and\nprovides a superior grounding performance but also shows comparable performance\ncompared to expert radiologists on certain disease types.\n","authors":["Jiayu Lei","Lisong Dai","Haoyun Jiang","Chaoyi Wu","Xiaoman Zhang","Yao Zhang","Jiangchao Yao","Weidi Xie","Yanyong Zhang","Yuehua Li","Ya Zhang","Yanfeng Wang"],"pdf_url":"https://arxiv.org/pdf/2309.06828v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.06814v1","updated":"2023-09-13T09:05:09Z","published":"2023-09-13T09:05:09Z","title":"Comparative Analysis of Contextual Relation Extraction based on Deep\n Learning Models","summary":" Contextual Relation Extraction (CRE) is mainly used for constructing a\nknowledge graph with a help of ontology. It performs various tasks such as\nsemantic search, query answering, and textual entailment. Relation extraction\nidentifies the entities from raw texts and the relations among them. 
An\nefficient and accurate CRE system is essential for creating domain knowledge in\nthe biomedical industry. Existing Machine Learning and Natural Language\nProcessing (NLP) techniques are not suitable to predict complex relations from\nsentences that consist of more than two relations and unspecified entities\nefficiently. In this work, deep learning techniques have been used to identify\nthe appropriate semantic relation based on the context from multiple sentences.\nEven though various machine learning models have been used for relation\nextraction, they provide better results only for binary relations, i.e.,\nrelations occurred exactly between the two entities in a sentence. Machine\nlearning models are not suited for complex sentences that consist of the words\nthat have various meanings. To address these issues, hybrid deep learning\nmodels have been used to extract the relations from complex sentence\neffectively. This paper explores the analysis of various deep learning models\nthat are used for relation extraction.\n","authors":["R. Priyadharshini","G. Jeyakodi","P. Shanthi Bala"],"pdf_url":"https://arxiv.org/pdf/2309.06814v1.pdf","comment":"This Paper Presented in the International Conference on FOSS\n Approaches towards Computational Intelligence and Language TTechnolog on\n February 2023, Thiruvananthapuram"},{"id":"http://arxiv.org/abs/2308.00507v2","updated":"2023-09-13T08:51:58Z","published":"2023-08-01T12:46:02Z","title":"Improved Prognostic Prediction of Pancreatic Cancer Using Multi-Phase CT\n by Integrating Neural Distance and Texture-Aware Transformer","summary":" Pancreatic ductal adenocarcinoma (PDAC) is a highly lethal cancer in which\nthe tumor-vascular involvement greatly affects the resectability and, thus,\noverall survival of patients. However, current prognostic prediction methods\nfail to explicitly and accurately investigate relationships between the tumor\nand nearby important vessels. This paper proposes a novel learnable neural\ndistance that describes the precise relationship between the tumor and vessels\nin CT images of different patients, adopting it as a major feature for\nprognosis prediction. Besides, different from existing models that used CNNs or\nLSTMs to exploit tumor enhancement patterns on dynamic contrast-enhanced CT\nimaging, we improved the extraction of dynamic tumor-related texture features\nin multi-phase contrast-enhanced CT by fusing local and global features using\nCNN and transformer modules, further enhancing the features extracted across\nmulti-phase CT images. We extensively evaluated and compared the proposed\nmethod with existing methods in the multi-center (n=4) dataset with 1,070\npatients with PDAC, and statistical analysis confirmed its clinical\neffectiveness in the external test set consisting of three centers. 
The\ndeveloped risk marker was the strongest predictor of overall survival among\npreoperative factors and it has the potential to be combined with established\nclinical factors to select patients at higher risk who might benefit from\nneoadjuvant therapy.\n","authors":["Hexin Dong","Jiawen Yao","Yuxing Tang","Mingze Yuan","Yingda Xia","Jian Zhou","Hong Lu","Jingren Zhou","Bin Dong","Le Lu","Li Zhang","Zaiyi Liu","Yu Shi","Ling Zhang"],"pdf_url":"https://arxiv.org/pdf/2308.00507v2.pdf","comment":"MICCAI 2023"},{"id":"http://arxiv.org/abs/2309.06805v1","updated":"2023-09-13T08:51:19Z","published":"2023-09-13T08:51:19Z","title":"FedDIP: Federated Learning with Extreme Dynamic Pruning and Incremental\n Regularization","summary":" Federated Learning (FL) has been successfully adopted for distributed\ntraining and inference of large-scale Deep Neural Networks (DNNs). However,\nDNNs are characterized by an extremely large number of parameters, thus,\nyielding significant challenges in exchanging these parameters among\ndistributed nodes and managing the memory. Although recent DNN compression\nmethods (e.g., sparsification, pruning) tackle such challenges, they do not\nholistically consider an adaptively controlled reduction of parameter exchange\nwhile maintaining high accuracy levels. We, therefore, contribute with a novel\nFL framework (coined FedDIP), which combines (i) dynamic model pruning with\nerror feedback to eliminate redundant information exchange, which contributes\nto significant performance improvement, with (ii) incremental regularization\nthat can achieve \\textit{extreme} sparsity of models. We provide convergence\nanalysis of FedDIP and report on a comprehensive performance and comparative\nassessment against state-of-the-art methods using benchmark data sets and DNN\nmodels. Our results showcase that FedDIP not only controls the model sparsity\nbut efficiently achieves similar or better performance compared to other model\npruning methods adopting incremental regularization during distributed model\ntraining. The code is available at: https://github.com/EricLoong/feddip.\n","authors":["Qianyu Long","Christos Anagnostopoulos","Shameem Puthiya Parambath","Daning Bi"],"pdf_url":"https://arxiv.org/pdf/2309.06805v1.pdf","comment":"Accepted for publication at ICDM 2023 (Full version in arxiv). The\n associated code is available at https://github.com/EricLoong/feddip"},{"id":"http://arxiv.org/abs/2309.06800v1","updated":"2023-09-13T08:48:00Z","published":"2023-09-13T08:48:00Z","title":"Uncertainty-aware Traffic Prediction under Missing Data","summary":" Traffic prediction is a crucial topic because of its broad scope of\napplications in the transportation domain. Recently, various studies have\nachieved promising results. However, most studies assume the prediction\nlocations have complete or at least partial historical records and cannot be\nextended to non-historical recorded locations. In real-life scenarios, the\ndeployment of sensors could be limited due to budget limitations and\ninstallation availability, which makes most current models not applicable.\nThough few pieces of literature tried to impute traffic states at the missing\nlocations, these methods need the data simultaneously observed at the locations\nwith sensors, making them not applicable to prediction tasks. Another drawback\nis the lack of measurement of uncertainty in prediction, making prior works\nunsuitable for risk-sensitive tasks or involving decision-making. 
To fill the\ngap, inspired by the previous inductive graph neural network, this work\nproposed an uncertainty-aware framework with the ability to 1) extend\nprediction to missing locations with no historical records and significantly\nextend spatial coverage of prediction locations while reducing deployment of\nsensors and 2) generate probabilistic prediction with uncertainty\nquantification to help the management of risk and decision making in the\ndown-stream tasks. Through extensive experiments on real-life datasets, the\nresult shows our method achieved promising results on prediction tasks, and the\nuncertainty quantification gives consistent results which highly correlated\nwith the locations with and without historical data. We also show that our\nmodel could help support sensor deployment tasks in the transportation field to\nachieve higher accuracy with a limited sensor deployment budget.\n","authors":["Hao Mei","Junxian Li","Zhiming Liang","Guanjie Zheng","Bin Shi","Hua Wei"],"pdf_url":"https://arxiv.org/pdf/2309.06800v1.pdf","comment":"11 pages, 3 figures, Accepted as a short paper of IEEE International\n Conference on Data Mining (ICDM) 2023"},{"id":"http://arxiv.org/abs/2301.00545v3","updated":"2023-09-13T08:47:35Z","published":"2023-01-02T07:13:28Z","title":"Knockoffs-SPR: Clean Sample Selection in Learning with Noisy Labels","summary":" A noisy training set usually leads to the degradation of the generalization\nand robustness of neural networks. In this paper, we propose a novel\ntheoretically guaranteed clean sample selection framework for learning with\nnoisy labels. Specifically, we first present a Scalable Penalized Regression\n(SPR) method, to model the linear relation between network features and one-hot\nlabels. In SPR, the clean data are identified by the zero mean-shift parameters\nsolved in the regression model. We theoretically show that SPR can recover\nclean data under some conditions. Under general scenarios, the conditions may\nbe no longer satisfied; and some noisy data are falsely selected as clean data.\nTo solve this problem, we propose a data-adaptive method for Scalable Penalized\nRegression with Knockoff filters (Knockoffs-SPR), which is provable to control\nthe False-Selection-Rate (FSR) in the selected clean data. To improve the\nefficiency, we further present a split algorithm that divides the whole\ntraining set into small pieces that can be solved in parallel to make the\nframework scalable to large datasets. While Knockoffs-SPR can be regarded as a\nsample selection module for a standard supervised training pipeline, we further\ncombine it with a semi-supervised algorithm to exploit the support of noisy\ndata as unlabeled data. Experimental results on several benchmark datasets and\nreal-world noisy datasets show the effectiveness of our framework and validate\nthe theoretical results of Knockoffs-SPR. Our code and pre-trained models are\navailable at https://github.com/Yikai-Wang/Knockoffs-SPR.\n","authors":["Yikai Wang","Yanwei Fu","Xinwei Sun"],"pdf_url":"https://arxiv.org/pdf/2301.00545v3.pdf","comment":"update: refined theory and analysis, release code"},{"id":"http://arxiv.org/abs/2309.06794v1","updated":"2023-09-13T08:33:09Z","published":"2023-09-13T08:33:09Z","title":"Cognitive Mirage: A Review of Hallucinations in Large Language Models","summary":" As large language models continue to develop in the field of AI, text\ngeneration systems are susceptible to a worrisome phenomenon known as\nhallucination. 
In this study, we summarize recent compelling insights into\nhallucinations in LLMs. We present a novel taxonomy of hallucinations from\nvarious text generation tasks, thus provide theoretical insights, detection\nmethods and improvement approaches. Based on this, future research directions\nare proposed. Our contribution are threefold: (1) We provide a detailed and\ncomplete taxonomy for hallucinations appearing in text generation tasks; (2) We\nprovide theoretical analyses of hallucinations in LLMs and provide existing\ndetection and improvement methods; (3) We propose several research directions\nthat can be developed in the future. As hallucinations garner significant\nattention from the community, we will maintain updates on relevant research\nprogress.\n","authors":["Hongbin Ye","Tong Liu","Aijia Zhang","Wei Hua","Weiqiang Jia"],"pdf_url":"https://arxiv.org/pdf/2309.06794v1.pdf","comment":"work in progress; 21 pages"},{"id":"http://arxiv.org/abs/2309.06793v1","updated":"2023-09-13T08:28:16Z","published":"2023-09-13T08:28:16Z","title":"Electricity Demand Forecasting through Natural Language Processing with\n Long Short-Term Memory Networks","summary":" Electricity demand forecasting is a well established research field. Usually\nthis task is performed considering historical loads, weather forecasts,\ncalendar information and known major events. Recently attention has been given\non the possible use of new sources of information from textual news in order to\nimprove the performance of these predictions. This paper proposes a Long and\nShort-Term Memory (LSTM) network incorporating textual news features that\nsuccessfully predicts the deterministic and probabilistic tasks of the UK\nnational electricity demand. The study finds that public sentiment and word\nvector representations related to transport and geopolitics have\ntime-continuity effects on electricity demand. The experimental results show\nthat the LSTM with textual features improves by more than 3% compared to the\npure LSTM benchmark and by close to 10% over the official benchmark.\nFurthermore, the proposed model effectively reduces forecasting uncertainty by\nnarrowing the confidence interval and bringing the forecast distribution closer\nto the truth.\n","authors":["Yun Bai","Simon Camal","Andrea Michiorri"],"pdf_url":"https://arxiv.org/pdf/2309.06793v1.pdf","comment":"5 pages, 3 figures, 2023 IEEE PES Innovative Smart Grid Technologies\n Conference Europe (ISGT-Europe)"},{"id":"http://arxiv.org/abs/2309.06782v1","updated":"2023-09-13T08:16:15Z","published":"2023-09-13T08:16:15Z","title":"Scalable neural network models and terascale datasets for particle-flow\n reconstruction","summary":" We study scalable machine learning models for full event reconstruction in\nhigh-energy electron-positron collisions based on a highly granular detector\nsimulation. Particle-flow (PF) reconstruction can be formulated as a supervised\nlearning task using tracks and calorimeter clusters or hits. We compare a graph\nneural network and kernel-based transformer and demonstrate that both avoid\nquadratic memory allocation and computational cost while achieving realistic PF\nreconstruction. We show that hyperparameter tuning on a supercomputer\nsignificantly improves the physics performance of the models. We also\ndemonstrate that the resulting model is highly portable across hardware\nprocessors, supporting Nvidia, AMD, and Intel Habana cards. 
Finally, we\ndemonstrate that the model can be trained on highly granular inputs consisting\nof tracks and calorimeter hits, resulting in a competitive physics performance\nwith the baseline. Datasets and software to reproduce the studies are published\nfollowing the findable, accessible, interoperable, and reusable (FAIR)\nprinciples.\n","authors":["Joosep Pata","Eric Wulff","Farouk Mokhtar","David Southwick","Mengke Zhang","Maria Girone","Javier Duarte"],"pdf_url":"https://arxiv.org/pdf/2309.06782v1.pdf","comment":"19 pages, 7 figures"},{"id":"http://arxiv.org/abs/2211.08413v5","updated":"2023-09-13T08:12:26Z","published":"2022-11-15T18:51:20Z","title":"Decentralized Federated Learning: Fundamentals, State of the Art,\n Frameworks, Trends, and Challenges","summary":" In recent years, Federated Learning (FL) has gained relevance in training\ncollaborative models without sharing sensitive data. Since its birth,\nCentralized FL (CFL) has been the most common approach in the literature, where\na central entity creates a global model. However, a centralized approach leads\nto increased latency due to bottlenecks, heightened vulnerability to system\nfailures, and trustworthiness concerns affecting the entity responsible for the\nglobal model creation. Decentralized Federated Learning (DFL) emerged to\naddress these concerns by promoting decentralized model aggregation and\nminimizing reliance on centralized architectures. However, despite the work\ndone in DFL, the literature has not (i) studied the main aspects\ndifferentiating DFL and CFL; (ii) analyzed DFL frameworks to create and\nevaluate new solutions; and (iii) reviewed application scenarios using DFL.\nThus, this article identifies and analyzes the main fundamentals of DFL in\nterms of federation architectures, topologies, communication mechanisms,\nsecurity approaches, and key performance indicators. Additionally, the paper at\nhand explores existing mechanisms to optimize critical DFL fundamentals. Then,\nthe most relevant features of the current DFL frameworks are reviewed and\ncompared. After that, it analyzes the most used DFL application scenarios,\nidentifying solutions based on the fundamentals and frameworks previously\ndefined. Finally, the evolution of existing DFL solutions is studied to provide\na list of trends, lessons learned, and open challenges.\n","authors":["Enrique Tomás Martínez Beltrán","Mario Quiles Pérez","Pedro Miguel Sánchez Sánchez","Sergio López Bernal","Gérôme Bovet","Manuel Gil Pérez","Gregorio Martínez Pérez","Alberto Huertas Celdrán"],"pdf_url":"https://arxiv.org/pdf/2211.08413v5.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.06774v1","updated":"2023-09-13T07:49:28Z","published":"2023-09-13T07:49:28Z","title":"Fundamental Limits of Deep Learning-Based Binary Classifiers Trained\n with Hinge Loss","summary":" Although deep learning (DL) has led to several breakthroughs in many\ndisciplines as diverse as chemistry, computer science, electrical engineering,\nmathematics, medicine, neuroscience, and physics, a comprehensive understanding\nof why and how DL is empirically successful remains fundamentally elusive. To\nattack this fundamental problem and unravel the mysteries behind DL's empirical\nsuccesses, significant innovations toward a unified theory of DL have been\nmade. These innovations encompass nearly fundamental advances in optimization,\ngeneralization, and approximation. 
Despite these advances, however, no work to\ndate has offered a way to quantify the testing performance of a DL-based\nalgorithm employed to solve a pattern classification problem. To overcome this\nfundamental challenge in part, this paper exposes the fundamental testing\nperformance limits of DL-based binary classifiers trained with hinge loss. For\nbinary classifiers that are based on deep rectified linear unit (ReLU)\nfeedforward neural networks (FNNs) and ones that are based on deep FNNs with\nReLU and Tanh activation, we derive their respective novel asymptotic testing\nperformance limits. The derived testing performance limits are validated by\nextensive computer experiments.\n","authors":["Tilahun M. Getu","Georges Kaddoum"],"pdf_url":"https://arxiv.org/pdf/2309.06774v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.04434v2","updated":"2023-09-13T07:32:35Z","published":"2023-09-08T16:55:39Z","title":"Physics-Informed Neural Networks for an optimal counterdiabatic quantum\n computation","summary":" We introduce a novel methodology that leverages the strength of\nPhysics-Informed Neural Networks (PINNs) to address the counterdiabatic (CD)\nprotocol in the optimization of quantum circuits comprised of systems with\n$N_{Q}$ qubits. The primary objective is to utilize physics-inspired deep\nlearning techniques to accurately solve the time evolution of the different\nphysical observables within the quantum system. To accomplish this objective,\nwe embed the necessary physical information into an underlying neural network\nto effectively tackle the problem. In particular, we impose the hermiticity\ncondition on all physical observables and make use of the principle of least\naction, guaranteeing the acquisition of the most appropriate counterdiabatic\nterms based on the underlying physics. The proposed approach offers a\ndependable alternative to address the CD driving problem, free from the\nconstraints typically encountered in previous methodologies relying on\nclassical numerical approximations. Our method provides a general framework to\nobtain optimal results from the physical observables relevant to the problem,\nincluding the external parameterization in time known as scheduling function,\nthe gauge potential or operator involving the non-adiabatic terms, as well as\nthe temporal evolution of the energy levels of the system, among others. The\nmain applications of this methodology have been the $\\mathrm{H_{2}}$ and\n$\\mathrm{LiH}$ molecules, represented by a 2-qubit and 4-qubit systems\nemploying the STO-3G basis. The presented results demonstrate the successful\nderivation of a desirable decomposition for the non-adiabatic terms, achieved\nthrough a linear combination utilizing Pauli operators. This attribute confers\nsignificant advantages to its practical implementation within quantum computing\nalgorithms.\n","authors":["Antonio Ferrer-Sánchez","Carlos Flores-Garrigos","Carlos Hernani-Morales","José J. Orquín-Marqués","Narendra N. Hegade","Alejandro Gomez Cadavid","Iraitz Montalban","Enrique Solano","Yolanda Vives-Gilabert","José D. 
Martín-Guerrero"],"pdf_url":"https://arxiv.org/pdf/2309.04434v2.pdf","comment":"28 pages, 10 figures, 1 algorithm, 1 table"},{"id":"http://arxiv.org/abs/2309.05575v2","updated":"2023-09-13T06:53:15Z","published":"2023-09-11T16:03:00Z","title":"Anisotropic Diffusion Stencils: From Simple Derivations over Stability\n Estimates to ResNet Implementations","summary":" Anisotropic diffusion processes with a diffusion tensor are important in\nimage analysis, physics, and engineering. However, their numerical\napproximation has a strong impact on dissipative artefacts and deviations from\nrotation invariance. In this work, we study a large family of finite difference\ndiscretisations on a 3 x 3 stencil. We derive it by splitting 2-D anisotropic\ndiffusion into four 1-D diffusions. The resulting stencil class involves one\nfree parameter and covers a wide range of existing discretisations. It\ncomprises the full stencil family of Weickert et al. (2013) and shows that\ntheir two parameters contain redundancy. Furthermore, we establish a bound on\nthe spectral norm of the matrix corresponding to the stencil. This gives time\nstep size limits that guarantee stability of an explicit scheme in the\nEuclidean norm. Our directional splitting also allows a very natural\ntranslation of the explicit scheme into ResNet blocks. Employing neural network\nlibraries enables simple and highly efficient parallel implementations on GPUs.\n","authors":["Karl Schrader","Joachim Weickert","Michael Krause"],"pdf_url":"https://arxiv.org/pdf/2309.05575v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.06742v1","updated":"2023-09-13T06:23:58Z","published":"2023-09-13T06:23:58Z","title":"MTD: Multi-Timestep Detector for Delayed Streaming Perception","summary":" Autonomous driving systems require real-time environmental perception to\nensure user safety and experience. Streaming perception is a task of reporting\nthe current state of the world, which is used to evaluate the delay and\naccuracy of autonomous driving systems. In real-world applications, factors\nsuch as hardware limitations and high temperatures inevitably cause delays in\nautonomous driving systems, resulting in the offset between the model output\nand the world state. In order to solve this problem, this paper propose the\nMulti- Timestep Detector (MTD), an end-to-end detector which uses dynamic\nrouting for multi-branch future prediction, giving model the ability to resist\ndelay fluctuations. A Delay Analysis Module (DAM) is proposed to optimize the\nexisting delay sensing method, continuously monitoring the model inference\nstack and calculating the delay trend. Moreover, a novel Timestep Branch Module\n(TBM) is constructed, which includes static flow and adaptive flow to\nadaptively predict specific timesteps according to the delay trend. 
The\nproposed method has been evaluated on the Argoverse-HD dataset, and the\nexperimental results show that it has achieved state-of-the-art performance\nacross various delay settings.\n","authors":["Yihui Huang","Ningjiang Chen"],"pdf_url":"https://arxiv.org/pdf/2309.06742v1.pdf","comment":"12 pages, accepted by PRCV 2023 (The 6th Chinese Conference on\n Pattern Recognition and Computer Vision)"},{"id":"http://arxiv.org/abs/2304.04602v2","updated":"2023-09-13T06:19:35Z","published":"2023-04-10T14:17:33Z","title":"Learning a Universal Human Prior for Dexterous Manipulation from Human\n Preference","summary":" Generating human-like behavior on robots is a great challenge, especially in\ndexterous manipulation tasks with robotic hands. Scripting policies from\nscratch is intractable due to the high-dimensional control space, and training\npolicies with reinforcement learning (RL) and manual reward engineering can\nalso be hard and lead to unnatural motions. Leveraging the recent progress on\nRL from Human Feedback, we propose a framework that learns a universal human\nprior using direct human preference feedback over videos, for efficiently\ntuning the RL policies on 20 dual-hand robot manipulation tasks in simulation,\nwithout a single human demonstration. A task-agnostic reward model is trained\nthrough iteratively generating diverse policies and collecting human preference\nover the trajectories; it is then applied for regularizing the behavior of\npolicies in the fine-tuning stage. Our method empirically demonstrates more\nhuman-like behaviors on robot hands in diverse tasks including even unseen\ntasks, indicating its generalization capability.\n","authors":["Zihan Ding","Yuanpei Chen","Allen Z. Ren","Shixiang Shane Gu","Qianxu Wang","Hao Dong","Chi Jin"],"pdf_url":"https://arxiv.org/pdf/2304.04602v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.06739v1","updated":"2023-09-13T06:15:37Z","published":"2023-09-13T06:15:37Z","title":"MCNS: Mining Causal Natural Structures Inside Time Series via A Novel\n Internal Causality Scheme","summary":" Causal inference permits us to discover covert relationships of various\nvariables in time series. However, in most existing works, the variables\nmentioned above are the dimensions. The causality between dimensions could be\ncursory, which hinders the comprehension of the internal relationship and the\nbenefit of the causal graph to the neural networks (NNs). In this paper, we\nfind that causality exists not only outside but also inside the time series\nbecause it reflects a succession of events in the real world. It inspires us to\nseek the relationship between internal subsequences. However, the challenges\nare the hardship of discovering causality from subsequences and utilizing the\ncausal natural structures to improve NNs. To address these challenges, we\npropose a novel framework called Mining Causal Natural Structure (MCNS), which\nis automatic and domain-agnostic and helps to find the causal natural\nstructures inside time series via the internal causality scheme. We evaluate\nthe MCNS framework and NNs impregnated with MCNS on time series classification\ntasks. Experimental results illustrate that our impregnation, by refining\nattention, shape selection classification, and pruning datasets, drives the NN,\nand even the data itself, toward preferable accuracy and interpretability.
Besides, MCNS\nprovides an in-depth, solid summary of the time series and datasets.\n","authors":["Yuanhao Liu","Dehui Du","Zihan Jiang","Anyan Huang","Yiyang Li"],"pdf_url":"https://arxiv.org/pdf/2309.06739v1.pdf","comment":"9 pages, 6 figures"},{"id":"http://arxiv.org/abs/2301.10908v4","updated":"2023-09-13T06:11:12Z","published":"2023-01-26T02:38:37Z","title":"Distilling Cognitive Backdoor Patterns within an Image","summary":" This paper proposes a simple method to distill and detect backdoor patterns\nwithin an image: \\emph{Cognitive Distillation} (CD). The idea is to extract the\n\"minimal essence\" from an input image responsible for the model's prediction.\nCD optimizes an input mask to extract a small pattern from the input image that\ncan lead to the same model output (i.e., logits or deep features). The\nextracted pattern can help understand the cognitive mechanism of a model on\nclean vs. backdoor images and is thus called a \\emph{Cognitive Pattern} (CP).\nUsing CD and the distilled CPs, we uncover an interesting phenomenon of\nbackdoor attacks: despite the various forms and sizes of trigger patterns used\nby different attacks, the CPs of backdoor samples are all surprisingly and\nsuspiciously small. One thus can leverage the learned mask to detect and remove\nbackdoor examples from poisoned training datasets. We conduct extensive\nexperiments to show that CD can robustly detect a wide range of advanced\nbackdoor attacks. We also show that CD can potentially be applied to help\ndetect potential biases from face datasets. Code is available at\n\\url{https://github.com/HanxunH/CognitiveDistillation}.\n","authors":["Hanxun Huang","Xingjun Ma","Sarah Erfani","James Bailey"],"pdf_url":"https://arxiv.org/pdf/2301.10908v4.pdf","comment":"ICLR2023"},{"id":"http://arxiv.org/abs/2111.12550v3","updated":"2023-09-13T05:56:46Z","published":"2021-11-19T05:32:59Z","title":"A Worker-Task Specialization Model for Crowdsourcing: Efficient\n Inference and Fundamental Limits","summary":" Crowdsourcing system has emerged as an effective platform for labeling data\nwith relatively low cost by using non-expert workers. Inferring correct labels\nfrom multiple noisy answers on data, however, has been a challenging problem,\nsince the quality of the answers varies widely across tasks and workers. Many\nexisting works have assumed that there is a fixed ordering of workers in terms\nof their skill levels, and focused on estimating worker skills to aggregate the\nanswers from workers with different weights. In practice, however, the worker\nskill changes widely across tasks, especially when the tasks are heterogeneous.\nIn this paper, we consider a new model, called $d$-type specialization model,\nin which each task and worker has its own (unknown) type and the reliability of\neach worker can vary in the type of a given task and that of a worker. We allow\nthat the number $d$ of types can scale in the number of tasks. In this model,\nwe characterize the optimal sample complexity to correctly infer the labels\nwithin any given accuracy, and propose label inference algorithms achieving the\norder-wise optimal limit even when the types of tasks or those of workers are\nunknown. 
We conduct experiments on both synthetic and real datasets, and show\nthat our algorithm outperforms the existing algorithms developed based on more\nstrict model assumptions.\n","authors":["Doyeon Kim","Jeonghwan Lee","Hye Won Chung"],"pdf_url":"https://arxiv.org/pdf/2111.12550v3.pdf","comment":"To appear at IEEE Transactions on Information Theory"},{"id":"http://arxiv.org/abs/2307.02799v2","updated":"2023-09-13T05:46:25Z","published":"2023-07-06T06:17:57Z","title":"Few-shot Personalized Saliency Prediction Based on Inter-personnel Gaze\n Patterns","summary":" This paper presents few-shot personalized saliency prediction based on\ninter-personnel gaze patterns. In contrast to a general saliency map, a\npersonalized saliency map (PSM) has great potential since its map\nindicates the person-specific visual attention that is useful for obtaining\nindividual visual preferences from heterogeneity of gazed areas. The PSM\nprediction is needed for acquiring the PSM for the unseen image, but its\nprediction is still a challenging task due to the complexity of individual gaze\npatterns. For modeling individual gaze patterns for various images, although\nthe eye-tracking data obtained from each person is necessary to construct PSMs,\nit is difficult to acquire the massive amounts of such data. Here, one solution\nfor efficient PSM prediction from the limited amount of data can be the\neffective use of eye-tracking data obtained from other persons. In this paper,\nto effectively treat the PSMs of other persons, we focus on the effective\nselection of images to acquire eye-tracking data and the preservation of\nstructural information of PSMs of other persons. In the experimental results,\nwe confirm that the above two focuses are effective for the PSM prediction with\nthe limited amount of eye-tracking data.\n","authors":["Yuya Moroto","Keisuke Maeda","Takahiro Ogawa","Miki Haseyama"],"pdf_url":"https://arxiv.org/pdf/2307.02799v2.pdf","comment":"5 pages, 3 figures"},{"id":"http://arxiv.org/abs/2309.06724v1","updated":"2023-09-13T04:57:12Z","published":"2023-09-13T04:57:12Z","title":"Deep Nonparametric Convexified Filtering for Computational Photography,\n Image Synthesis and Adversarial Defense","summary":" We aim to provide a general framework for computational photography that\nrecovers the real scene from imperfect images, via Deep Nonparametric\nConvexified Filtering (DNCF). It consists of a nonparametric deep network to\nresemble the physical equations behind the image formation, such as denoising,\nsuper-resolution, inpainting, and flash. DNCF has no parameterization dependent\non training data, and therefore has strong generalization and robustness to\nadversarial image manipulation. During inference, we also encourage the network\nparameters to be nonnegative and create a bi-convex function on the input and\nparameters, and this adapts to second-order optimization algorithms with\ninsufficient running time, having 10X acceleration over Deep Image Prior.
With\nthese tools, we empirically verify its capability to defend image\nclassification deep networks against adversary attack algorithms in real-time.\n","authors":["Jianqiao Wangni"],"pdf_url":"https://arxiv.org/pdf/2309.06724v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.06717v1","updated":"2023-09-13T04:40:08Z","published":"2023-09-13T04:40:08Z","title":"Bias Amplification Enhances Minority Group Performance","summary":" Neural networks produced by standard training are known to suffer from poor\naccuracy on rare subgroups despite achieving high accuracy on average, due to\nthe correlations between certain spurious features and labels. Previous\napproaches based on worst-group loss minimization (e.g. Group-DRO) are\neffective in improving worse-group accuracy but require expensive group\nannotations for all the training samples. In this paper, we focus on the more\nchallenging and realistic setting where group annotations are only available on\na small validation set or are not available at all. We propose BAM, a novel\ntwo-stage training algorithm: in the first stage, the model is trained using a\nbias amplification scheme via introducing a learnable auxiliary variable for\neach training sample; in the second stage, we upweight the samples that the\nbias-amplified model misclassifies, and then continue training the same model\non the reweighted dataset. Empirically, BAM achieves competitive performance\ncompared with existing methods evaluated on spurious correlation benchmarks in\ncomputer vision and natural language processing. Moreover, we find a simple\nstopping criterion based on minimum class accuracy difference that can remove\nthe need for group annotations, with little or no loss in worst-group accuracy.\nWe perform extensive analyses and ablations to verify the effectiveness and\nrobustness of our algorithm in varying class and group imbalance ratios.\n","authors":["Gaotang Li","Jiarui Liu","Wei Hu"],"pdf_url":"https://arxiv.org/pdf/2309.06717v1.pdf","comment":"21 pages, 14 figures"},{"id":"http://arxiv.org/abs/2309.06710v1","updated":"2023-09-13T04:17:28Z","published":"2023-09-13T04:17:28Z","title":"Crystal structure prediction using neural network potential and\n age-fitness Pareto genetic algorithm","summary":" While crystal structure prediction (CSP) remains a longstanding challenge, we\nintroduce ParetoCSP, a novel algorithm for CSP, which combines a\nmulti-objective genetic algorithm (MOGA) with a neural network inter-atomic\npotential (IAP) model to find energetically optimal crystal structures given\nchemical compositions. We enhance the NSGA-III algorithm by incorporating the\ngenotypic age as an independent optimization criterion and employ the M3GNet\nuniversal IAP to guide the GA search. 
Compared to GN-OA, a state-of-the-art\nneural potential based CSP algorithm, ParetoCSP demonstrated significantly\nbetter predictive capabilities, outperforming by a factor of $2.562$ across\n$55$ diverse benchmark structures, as evaluated by seven performance metrics.\nTrajectory analysis of the traversed structures of all algorithms shows that\nParetoCSP generated more valid structures than other algorithms, which helped\nguide the GA to search more effectively for the optimal structures\n","authors":["Sadman Sadeed Omee","Lai Wei","Jianjun Hu"],"pdf_url":"https://arxiv.org/pdf/2309.06710v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.06708v1","updated":"2023-09-13T04:13:11Z","published":"2023-09-13T04:13:11Z","title":"Predicting Fatigue Crack Growth via Path Slicing and Re-Weighting","summary":" Predicting potential risks associated with the fatigue of key structural\ncomponents is crucial in engineering design. However, fatigue often involves\nentangled complexities of material microstructures and service conditions,\nmaking diagnosis and prognosis of fatigue damage challenging. We report a\nstatistical learning framework to predict the growth of fatigue cracks and the\nlife-to-failure of the components under loading conditions with uncertainties.\nDigital libraries of fatigue crack patterns and the remaining life are\nconstructed by high-fidelity physical simulations. Dimensionality reduction and\nneural network architectures are then used to learn the history dependence and\nnonlinearity of fatigue crack growth. Path-slicing and re-weighting techniques\nare introduced to handle the statistical noises and rare events. The predicted\nfatigue crack patterns are self-updated and self-corrected by the evolving\ncrack patterns. The end-to-end approach is validated by representative examples\nwith fatigue cracks in plates, which showcase the digital-twin scenario in\nreal-time structural health monitoring and fatigue life prediction for\nmaintenance management decision-making.\n","authors":["Yingjie Zhao","Yong Liu","Zhiping Xu"],"pdf_url":"https://arxiv.org/pdf/2309.06708v1.pdf","comment":null},{"id":"http://arxiv.org/abs/1911.10737v5","updated":"2023-09-13T04:03:20Z","published":"2019-11-25T07:31:54Z","title":"Nearest Neighbor Sampling of Point Sets using Rays","summary":" We propose a new framework for the sampling, compression, and analysis of\ndistributions of point sets and other geometric objects embedded in Euclidean\nspaces. Our approach involves constructing a tensor called the RaySense sketch,\nwhich captures nearest neighbors from the underlying geometry of points along a\nset of rays. We explore various operations that can be performed on the\nRaySense sketch, leading to different properties and potential applications.\nStatistical information about the data set can be extracted from the sketch,\nindependent of the ray set. Line integrals on point sets can be efficiently\ncomputed using the sketch. We also present several examples illustrating\napplications of the proposed strategy in practical scenarios.\n","authors":["Liangchen Liu","Louis Ly","Colin Macdonald","Yen-Hsi Richard Tsai"],"pdf_url":"https://arxiv.org/pdf/1911.10737v5.pdf","comment":"48 pages, 14 figures, accepted to Communication on Applied\n Mathematics and Computation (CAMC), Focused Issue in Honor of Prof. Stanley\n Osher on the Occasion of His 80th Birthday. 
Fixed typos and improved\n notations"},{"id":"http://arxiv.org/abs/2309.06703v1","updated":"2023-09-13T04:02:38Z","published":"2023-09-13T04:02:38Z","title":"VLSlice: Interactive Vision-and-Language Slice Discovery","summary":" Recent work in vision-and-language demonstrates that large-scale pretraining\ncan learn generalizable models that are efficiently transferable to downstream\ntasks. While this may improve dataset-scale aggregate metrics, analyzing\nperformance around hand-crafted subgroups targeting specific bias dimensions\nreveals systemic undesirable behaviors. However, this subgroup analysis is\nfrequently stalled by annotation efforts, which require extensive time and\nresources to collect the necessary data. Prior art attempts to automatically\ndiscover subgroups to circumvent these constraints but typically leverages\nmodel behavior on existing task-specific annotations and rapidly degrades on\nmore complex inputs beyond \"tabular\" data, none of which study\nvision-and-language models. This paper presents VLSlice, an interactive system\nenabling user-guided discovery of coherent representation-level subgroups with\nconsistent visiolinguistic behavior, denoted as vision-and-language slices,\nfrom unlabeled image sets. We show that VLSlice enables users to quickly\ngenerate diverse high-coherency slices in a user study (n=22) and release the\ntool publicly.\n","authors":["Eric Slyman","Minsuk Kahng","Stefan Lee"],"pdf_url":"https://arxiv.org/pdf/2309.06703v1.pdf","comment":"Conference paper at ICCV 2023. 17 pages, 11 figures.\n https://ericslyman.com/vlslice/"},{"id":"http://arxiv.org/abs/2309.06692v1","updated":"2023-09-13T03:27:21Z","published":"2023-09-13T03:27:21Z","title":"Tackling the Non-IID Issue in Heterogeneous Federated Learning by\n Gradient Harmonization","summary":" Federated learning (FL) is a privacy-preserving paradigm for collaboratively\ntraining a global model from decentralized clients. However, the performance of\nFL is hindered by non-independent and identically distributed (non-IID) data\nand device heterogeneity. In this work, we revisit this key challenge through\nthe lens of gradient conflicts on the server side. Specifically, we first\ninvestigate the gradient conflict phenomenon among multiple clients and reveal\nthat stronger heterogeneity leads to more severe gradient conflicts. To tackle\nthis issue, we propose FedGH, a simple yet effective method that mitigates\nlocal drifts through Gradient Harmonization. This technique projects one\ngradient vector onto the orthogonal plane of the other within conflicting\nclient pairs. Extensive experiments demonstrate that FedGH consistently\nenhances multiple state-of-the-art FL baselines across diverse benchmarks and\nnon-IID scenarios. Notably, FedGH yields more significant improvements in\nscenarios with stronger heterogeneity. As a plug-and-play module, FedGH can be\nseamlessly integrated into any FL framework without requiring hyperparameter\ntuning.\n","authors":["Xinyu Zhang","Weiyu Sun","Ying Chen"],"pdf_url":"https://arxiv.org/pdf/2309.06692v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.06684v1","updated":"2023-09-13T02:49:32Z","published":"2023-09-13T02:49:32Z","title":"Attention Loss Adjusted Prioritized Experience Replay","summary":" Prioritized Experience Replay (PER) is a technical means of deep\nreinforcement learning by selecting experience samples with more knowledge\nquantity to improve the training rate of neural network. 
However, the\nnon-uniform sampling used in PER inevitably shifts the state-action space\ndistribution and introduces estimation error in the Q-value function. In this\npaper, an Attention Loss Adjusted Prioritized (ALAP) Experience Replay\nalgorithm is proposed, which integrates the improved Self-Attention network\nwith a Double-Sampling mechanism to fit the hyperparameter that can regulate the\nimportance sampling weights to eliminate the estimation error caused by PER. In\norder to verify the effectiveness and generality of the algorithm, ALAP is\ntested with value-function-based, policy-gradient-based and multi-agent\nreinforcement learning algorithms in OpenAI Gym, and comparison studies verify\nthe advantage and efficiency of the proposed training framework.\n","authors":["Zhuoying Chen","Huiping Li","Rizhong Wang"],"pdf_url":"https://arxiv.org/pdf/2309.06684v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.06683v1","updated":"2023-09-13T02:44:01Z","published":"2023-09-13T02:44:01Z","title":"Federated PAC-Bayesian Learning on Non-IID data","summary":" Existing research has either adapted the Probably Approximately Correct (PAC)\nBayesian framework for federated learning (FL) or used information-theoretic\nPAC-Bayesian bounds while introducing their theorems, but few consider the\nnon-IID challenges in FL. Our work presents the first non-vacuous federated\nPAC-Bayesian bound tailored for non-IID local data. This bound assumes unique\nprior knowledge for each client and variable aggregation weights. We also\nintroduce an objective function and an innovative Gibbs-based algorithm for the\noptimization of the derived bound. The results are validated on real-world\ndatasets.\n","authors":["Zihao Zhao","Yang Liu","Wenbo Ding","Xiao-Ping Zhang"],"pdf_url":"https://arxiv.org/pdf/2309.06683v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.06679v1","updated":"2023-09-13T02:34:21Z","published":"2023-09-13T02:34:21Z","title":"Generalizable improvement of the Spalart-Allmaras model through\n assimilation of experimental data","summary":" This study focuses on the use of model and data fusion for improving the\nSpalart-Allmaras (SA) closure model for Reynolds-averaged Navier-Stokes\nsolutions of separated flows. In particular, our goal is to develop models\nthat not only assimilate sparse experimental data to improve performance in\ncomputational models, but also generalize to unseen cases by recovering\nclassical SA behavior. We achieve our goals using data assimilation, namely the\nEnsemble Kalman Filtering approach (EnKF), to calibrate the coefficients of the\nSA model for separated flows. A holistic calibration strategy is implemented\nvia a parameterization of the production, diffusion, and destruction terms.\nThis calibration relies on the assimilation of experimentally collected\nvelocity profiles, skin friction, and pressure coefficients for separated\nflows. Despite using observational data from a single flow condition around\na backward-facing step (BFS), the recalibrated SA model demonstrates\ngeneralization to other separated flows, including cases such as the 2D-bump\nand modified BFS. Significant improvement is observed in the quantities of\ninterest, i.e., skin friction coefficient ($C_f$) and pressure coefficient\n($C_p$) for each flow tested.
Finally, it is also demonstrated that the newly\nproposed model recovers SA proficiency for external, unseparated flows, such as\nflow around a NACA-0012 airfoil without any danger of extrapolation, and that\nthe individually calibrated terms in the SA model are targeted towards specific\nflow-physics wherein the calibrated production term improves the re-circulation\nzone while destruction improves the recovery zone.\n","authors":["Deepinder Jot Singh Aulakh","Romit Maulik"],"pdf_url":"https://arxiv.org/pdf/2309.06679v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.06661v1","updated":"2023-09-13T01:32:46Z","published":"2023-09-13T01:32:46Z","title":"Sound field decomposition based on two-stage neural networks","summary":" A method for sound field decomposition based on neural networks is proposed.\nThe method comprises two stages: a sound field separation stage and a\nsingle-source localization stage. In the first stage, the sound pressure at\nmicrophones synthesized by multiple sources is separated into one excited by\neach sound source. In the second stage, the source location is obtained as a\nregression from the sound pressure at microphones consisting of a single sound\nsource. The estimated location is not affected by discretization because the\nsecond stage is designed as a regression rather than a classification. Datasets\nare generated by simulation using Green's function, and the neural network is\ntrained for each frequency. Numerical experiments reveal that, compared with\nconventional methods, the proposed method can achieve higher\nsource-localization accuracy and higher sound-field-reconstruction accuracy.\n","authors":["Ryo Matsuda","Makoto Otani"],"pdf_url":"https://arxiv.org/pdf/2309.06661v1.pdf","comment":"31 pages, 16 figures"},{"id":"http://arxiv.org/abs/2309.06660v1","updated":"2023-09-13T01:22:16Z","published":"2023-09-13T01:22:16Z","title":"Generalizable Neural Fields as Partially Observed Neural Processes","summary":" Neural fields, which represent signals as a function parameterized by a\nneural network, are a promising alternative to traditional discrete vector or\ngrid-based representations. Compared to discrete representations, neural\nrepresentations both scale well with increasing resolution, are continuous, and\ncan be many-times differentiable. However, given a dataset of signals that we\nwould like to represent, having to optimize a separate neural field for each\nsignal is inefficient, and cannot capitalize on shared information or\nstructures among signals. Existing generalization methods view this as a\nmeta-learning problem and employ gradient-based meta-learning to learn an\ninitialization which is then fine-tuned with test-time optimization, or learn\nhypernetworks to produce the weights of a neural field. We instead propose a\nnew paradigm that views the large-scale training of neural representations as a\npart of a partially-observed neural process framework, and leverage neural\nprocess algorithms to solve this task. 
We demonstrate that this approach\noutperforms both state-of-the-art gradient-based meta-learning approaches and\nhypernetwork approaches.\n","authors":["Jeffrey Gu","Kuan-Chieh Wang","Serena Yeung"],"pdf_url":"https://arxiv.org/pdf/2309.06660v1.pdf","comment":"To appear ICCV 2023"},{"id":"http://arxiv.org/abs/2303.16203v3","updated":"2023-09-13T01:16:45Z","published":"2023-03-28T17:59:56Z","title":"Your Diffusion Model is Secretly a Zero-Shot Classifier","summary":" The recent wave of large-scale text-to-image diffusion models has\ndramatically increased our text-based image generation abilities. These models\ncan generate realistic images for a staggering variety of prompts and exhibit\nimpressive compositional generalization abilities. Almost all use cases thus\nfar have solely focused on sampling; however, diffusion models can also provide\nconditional density estimates, which are useful for tasks beyond image\ngeneration. In this paper, we show that the density estimates from large-scale\ntext-to-image diffusion models like Stable Diffusion can be leveraged to\nperform zero-shot classification without any additional training. Our\ngenerative approach to classification, which we call Diffusion Classifier,\nattains strong results on a variety of benchmarks and outperforms alternative\nmethods of extracting knowledge from diffusion models. Although a gap remains\nbetween generative and discriminative approaches on zero-shot recognition\ntasks, our diffusion-based approach has significantly stronger multimodal\ncompositional reasoning ability than competing discriminative approaches.\nFinally, we use Diffusion Classifier to extract standard classifiers from\nclass-conditional diffusion models trained on ImageNet. Our models achieve\nstrong classification performance using only weak augmentations and exhibit\nqualitatively better \"effective robustness\" to distribution shift. Overall, our\nresults are a step toward using generative over discriminative models for\ndownstream tasks. Results and visualizations at\nhttps://diffusion-classifier.github.io/\n","authors":["Alexander C. Li","Mihir Prabhudesai","Shivam Duggal","Ellis Brown","Deepak Pathak"],"pdf_url":"https://arxiv.org/pdf/2303.16203v3.pdf","comment":"In ICCV 2023. Website at https://diffusion-classifier.github.io/"},{"id":"http://arxiv.org/abs/2309.06658v1","updated":"2023-09-13T01:13:33Z","published":"2023-09-13T01:13:33Z","title":"Dissipative Imitation Learning for Discrete Dynamic Output Feedback\n Control with Sparse Data Sets","summary":" Imitation learning enables the synthesis of controllers for complex\nobjectives and highly uncertain plant models. However, methods to provide\nstability guarantees to imitation learned controllers often rely on large\namounts of data and/or known plant models. In this paper, we explore an\ninput-output (IO) stability approach to dissipative imitation learning, which\nachieves stability with sparse data sets and with little known about the plant\nmodel. A closed-loop stable dynamic output feedback controller is learned using\nexpert data, a coarse IO plant model, and a new constraint to enforce\ndissipativity on the learned controller. While the learning objective is\nnonconvex, iterative convex overbounding (ICO) and projected gradient descent\n(PGD) are explored as methods to successfully learn the controller. This new\nimitation learning method is applied to two unknown plants and compared to\ntraditionally learned dynamic output feedback controller and neural network\ncontroller. 
With little knowledge of the plant model and a small data set, the\ndissipativity constrained learned controller achieves closed loop stability and\nsuccessfully mimics the behavior of the expert controller, while other methods\noften fail to maintain stability and achieve good performance.\n","authors":["Amy K. Strong","Ethan J. LoCicero","Leila J. Bridgeman"],"pdf_url":"https://arxiv.org/pdf/2309.06658v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.06553v1","updated":"2023-09-13T01:12:52Z","published":"2023-09-13T01:12:52Z","title":"Offline Prompt Evaluation and Optimization with Inverse Reinforcement\n Learning","summary":" The recent advances in the development of Large Language Models (LLMs) like\nChatGPT have achieved remarkable performance by leveraging human expertise.\nYet, fully eliciting LLMs' potential for complex tasks requires navigating the\nvast search space of natural language prompts. While prompt engineering has\nshown promise, the requisite human-crafted prompts in trial-and-error attempts\nand the associated costs pose significant challenges. Crucially, the efficiency\nof prompt optimization hinges on the costly procedure of prompt evaluation.\nThis work introduces Prompt-OIRL, an approach rooted in offline inverse\nreinforcement learning that seeks to bridge the gap between effective prompt\nevaluation and affordability. Our method draws on offline datasets from expert\nevaluations, employing Inverse-RL to derive a reward model for offline,\nquery-dependent prompt evaluations. The advantages of Prompt-OIRL are manifold:\nit predicts prompt performance, is cost-efficient, produces human-readable\nresults, and efficiently navigates the prompt space. We validate our method\nacross four LLMs and three arithmetic datasets, highlighting its potential as a\nrobust and effective tool for offline prompt evaluation and optimization. Our\ncode as well as the offline datasets are released, and we highlight the\nPrompt-OIRL can be reproduced within a few hours using a single laptop using\nCPU\n","authors":["Hao Sun"],"pdf_url":"https://arxiv.org/pdf/2309.06553v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.06062v2","updated":"2023-09-13T01:03:30Z","published":"2023-09-12T09:00:17Z","title":"Selection of contributing factors for predicting landslide\n susceptibility using machine learning and deep learning models","summary":" Landslides are a common natural disaster that can cause casualties, property\nsafety threats and economic losses. Therefore, it is important to understand or\npredict the probability of landslide occurrence at potentially risky sites. A\ncommonly used means is to carry out a landslide susceptibility assessment based\non a landslide inventory and a set of landslide contributing factors. This can\nbe readily achieved using machine learning (ML) models such as logistic\nregression (LR), support vector machine (SVM), random forest (RF), extreme\ngradient boosting (Xgboost), or deep learning (DL) models such as convolutional\nneural network (CNN) and long short time memory (LSTM). As the input data for\nthese models, landslide contributing factors have varying influences on\nlandslide occurrence. Therefore, it is logically feasible to select more\nimportant contributing factors and eliminate less relevant ones, with the aim\nof increasing the prediction accuracy of these models. However, selecting more\nimportant factors is still a challenging task and there is no generally\naccepted method. 
Furthermore, the effects of factor selection using various\nmethods on the prediction accuracy of ML and DL models are unclear. In this\nstudy, the impact of the selection of contributing factors on the accuracy of\nlandslide susceptibility predictions using ML and DL models was investigated.\nFour methods for selecting contributing factors were considered for all the\naforementioned ML and DL models, which included Information Gain Ratio (IGR),\nRecursive Feature Elimination (RFE), Particle Swarm Optimization (PSO), Least\nAbsolute Shrinkage and Selection Operators (LASSO) and Harris Hawk Optimization\n(HHO). In addition, autoencoder-based factor selection methods for DL models\nwere also investigated. To assess their performances, an exhaustive approach\nwas adopted,...\n","authors":["Cheng Chen","Lei Fan"],"pdf_url":"https://arxiv.org/pdf/2309.06062v2.pdf","comment":"Stochastic Environmental Research and Risk Assessment"},{"id":"http://arxiv.org/abs/2309.06655v1","updated":"2023-09-13T01:02:42Z","published":"2023-09-13T01:02:42Z","title":"Out of Distribution Detection via Domain-Informed Gaussian Process State\n Space Models","summary":" In order for robots to safely navigate in unseen scenarios using\nlearning-based methods, it is important to accurately detect\nout-of-training-distribution (OoD) situations online. Recently, Gaussian\nprocess state-space models (GPSSMs) have proven useful to discriminate\nunexpected observations by comparing them against probabilistic predictions.\nHowever, the capability for the model to correctly distinguish between in- and\nout-of-training distribution observations hinges on the accuracy of these\npredictions, primarily affected by the class of functions the GPSSM kernel can\nrepresent. In this paper, we propose (i) a novel approach to embed existing\ndomain knowledge in the kernel and (ii) an OoD online runtime monitor, based on\nreceding-horizon predictions. Domain knowledge is assumed given as a dataset\ncollected either in simulation or using a nominal model. Numerical results show\nthat the informed kernel yields better regression quality with smaller\ndatasets, as compared to standard kernel choices. We demonstrate the\neffectiveness of the OoD monitor on a real quadruped navigating an indoor\nsetting, which reliably classifies previously unseen terrains.\n","authors":["Alonso Marco","Elias Morley","Claire J. Tomlin"],"pdf_url":"https://arxiv.org/pdf/2309.06655v1.pdf","comment":"7 pages, 4 figures"},{"id":"http://arxiv.org/abs/2309.06651v1","updated":"2023-09-13T00:30:32Z","published":"2023-09-13T00:30:32Z","title":"ConR: Contrastive Regularizer for Deep Imbalanced Regression","summary":" Imbalanced distributions are ubiquitous in real-world data. They create\nconstraints on Deep Neural Networks to represent the minority labels and avoid\nbias towards majority labels. The extensive body of imbalanced approaches\naddress categorical label spaces but fail to effectively extend to regression\nproblems where the label space is continuous. Conversely, local and global\ncorrelations among continuous labels provide valuable insights towards\neffectively modelling relationships in feature space. In this work, we propose\nConR, a contrastive regularizer that models global and local label similarities\nin feature space and prevents the features of minority samples from being\ncollapsed into their majority neighbours. 
Treating the similarities of the\npredictions as an indicator of feature similarities, ConR discerns the\ndisagreements between the label space and feature space and imposes a penalty\non these disagreements. ConR minds the continuous nature of label space with\ntwo main strategies in a contrastive manner: incorrect proximities are\npenalized in proportion to the label similarities and the correct ones are\nencouraged to model local similarities. ConR consolidates essential\nconsiderations into a generic, easy-to-integrate, and efficient method that\neffectively addresses deep imbalanced regression. Moreover, ConR is orthogonal\nto existing approaches and smoothly extends to uni- and multi-dimensional label\nspaces. Our comprehensive experiments show that ConR significantly boosts the\nperformance of all the state-of-the-art methods on three large-scale deep\nimbalanced regression benchmarks. Our code is publicly available at\nhttps://github.com/BorealisAI/ConR.\n","authors":["Mahsa Keramati","Lili Meng","R. David Evans"],"pdf_url":"https://arxiv.org/pdf/2309.06651v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2109.00644v3","updated":"2023-09-13T00:17:41Z","published":"2021-09-01T23:17:30Z","title":"RIFLE: Imputation and Robust Inference from Low Order Marginals","summary":" The ubiquity of missing values in real-world datasets poses a challenge for\nstatistical inference and can prevent similar datasets from being analyzed in\nthe same study, precluding many existing datasets from being used for new\nanalyses. While an extensive collection of packages and algorithms have been\ndeveloped for data imputation, the overwhelming majority perform poorly if\nthere are many missing values and low sample sizes, which are unfortunately\ncommon characteristics in empirical data. Such low-accuracy estimations\nadversely affect the performance of downstream statistical models. We develop a\nstatistical inference framework for regression and classification in the\npresence of missing data without imputation. Our framework, RIFLE (Robust\nInFerence via Low-order moment Estimations), estimates low-order moments of the\nunderlying data distribution with corresponding confidence intervals to learn a\ndistributionally robust model. We specialize our framework to linear regression\nand normal discriminant analysis, and we provide convergence and performance\nguarantees. This framework can also be adapted to impute missing data. In\nnumerical experiments, we compare RIFLE to several state-of-the-art approaches\n(including MICE, Amelia, MissForest, KNN-imputer, MIDA, and Mean Imputer) for\nimputation and inference in the presence of missing values. Our experiments\ndemonstrate that RIFLE outperforms other benchmark algorithms when the\npercentage of missing values is high and/or when the number of data points is\nrelatively small. RIFLE is publicly available at\nhttps://github.com/optimization-for-data-driven-science/RIFLE.\n","authors":["Sina Baharlouei","Kelechi Ogudu","Sze-chuan Suen","Meisam Razaviyayn"],"pdf_url":"https://arxiv.org/pdf/2109.00644v3.pdf","comment":"36 pages, 11 figures"},{"id":"http://arxiv.org/abs/2309.02769v2","updated":"2023-09-13T00:17:19Z","published":"2023-09-06T06:22:18Z","title":"Unifying over-smoothing and over-squashing in graph neural networks: A\n physics informed approach and beyond","summary":" Graph Neural Networks (GNNs) have emerged as one of the leading approaches\nfor machine learning on graph-structured data.
Despite their great success,\ncritical computational challenges such as over-smoothing, over-squashing, and\nlimited expressive power continue to impact the performance of GNNs. In this\nstudy, inspired by the time-reversal principle commonly utilized in classical\nand quantum physics, we reverse the time direction of the graph heat equation.\nThe resulting reversed process yields a class of high-pass filtering functions\nthat enhance the sharpness of graph node features. Leveraging this concept, we\nintroduce the Multi-Scaled Heat Kernel based GNN (MHKG) by amalgamating diverse\nfiltering functions' effects on node features. To explore more flexible\nfiltering conditions, we further generalize MHKG into a model termed G-MHKG and\nthoroughly show the roles of each element in controlling over-smoothing,\nover-squashing and expressive power. Notably, we illustrate that all\naforementioned issues can be characterized and analyzed via the properties of\nthe filtering functions, and uncover a trade-off between over-smoothing and\nover-squashing: enhancing node feature sharpness will make the model suffer more\nfrom over-squashing, and vice versa. Furthermore, we manipulate the time again\nto show how G-MHKG can handle both issues under mild conditions. Our\nconclusive experiments highlight the effectiveness of the proposed models, which\nsurpass several GNN baseline models in performance across graph datasets\ncharacterized by both homophily and heterophily.\n","authors":["Zhiqi Shao","Dai Shi","Andi Han","Yi Guo","Qibin Zhao","Junbin Gao"],"pdf_url":"https://arxiv.org/pdf/2309.02769v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2305.15021v2","updated":"2023-09-13T23:46:22Z","published":"2023-05-24T11:04:30Z","title":"EmbodiedGPT: Vision-Language Pre-Training via Embodied Chain of Thought","summary":" Embodied AI is a crucial frontier in robotics, capable of planning and\nexecuting action sequences for robots to accomplish long-horizon tasks in\nphysical environments. In this work, we introduce EmbodiedGPT, an end-to-end\nmulti-modal foundation model for embodied AI, empowering embodied agents with\nmulti-modal understanding and execution capabilities. To achieve this, we have\nmade the following efforts: (i) We craft a large-scale embodied planning\ndataset, termed EgoCOT. The dataset consists of carefully selected videos from\nthe Ego4D dataset, along with corresponding high-quality language instructions.\nSpecifically, we generate a sequence of sub-goals with the \"Chain of Thoughts\"\nmode for effective embodied planning. (ii) We introduce an efficient training\napproach to EmbodiedGPT for high-quality plan generation, by adapting a 7B\nlarge language model (LLM) to the EgoCOT dataset via prefix tuning. (iii) We\nintroduce a paradigm for extracting task-related features from LLM-generated\nplanning queries to form a closed loop between high-level planning and\nlow-level control. Extensive experiments show the effectiveness of EmbodiedGPT\non embodied tasks, including embodied planning, embodied control, visual\ncaptioning, and visual question answering. Notably, EmbodiedGPT significantly\nenhances the success rate of the embodied control task by extracting more\neffective features.
It has achieved a remarkable 1.6 times increase in success\nrate on the Franka Kitchen benchmark and a 1.3 times increase on the Meta-World\nbenchmark, compared to the BLIP-2 baseline fine-tuned with the Ego4D dataset.\n","authors":["Yao Mu","Qinglong Zhang","Mengkang Hu","Wenhai Wang","Mingyu Ding","Jun Jin","Bin Wang","Jifeng Dai","Yu Qiao","Ping Luo"],"pdf_url":"https://arxiv.org/pdf/2305.15021v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.07352v1","updated":"2023-09-13T23:27:45Z","published":"2023-09-13T23:27:45Z","title":"Tackling the dimensions in imaging genetics with CLUB-PLS","summary":" A major challenge in imaging genetics and similar fields is to link\nhigh-dimensional data in one domain, e.g., genetic data, to high dimensional\ndata in a second domain, e.g., brain imaging data. The standard approach in the\narea are mass univariate analyses across genetic factors and imaging\nphenotypes. That entails executing one genome-wide association study (GWAS) for\neach pre-defined imaging measure. Although this approach has been tremendously\nsuccessful, one shortcoming is that phenotypes must be pre-defined.\nConsequently, effects that are not confined to pre-selected regions of interest\nor that reflect larger brain-wide patterns can easily be missed. In this work\nwe introduce a Partial Least Squares (PLS)-based framework, which we term\nCluster-Bootstrap PLS (CLUB-PLS), that can work with large input dimensions in\nboth domains as well as with large sample sizes. One key factor of the\nframework is to use cluster bootstrap to provide robust statistics for single\ninput features in both domains. We applied CLUB-PLS to investigating the\ngenetic basis of surface area and cortical thickness in a sample of 33,000\nsubjects from the UK Biobank. We found 107 genome-wide significant\nlocus-phenotype pairs that are linked to 386 different genes. We found that a\nvast majority of these loci could be technically validated at a high rate:\nusing classic GWAS or Genome-Wide Inferred Statistics (GWIS) we found that 85\nlocus-phenotype pairs exceeded the genome-wide suggestive (P<1e-05) threshold.\n","authors":["Andre Altmann","Ana C Lawry Aquila","Neda Jahanshad","Paul M Thompson","Marco Lorenzi"],"pdf_url":"https://arxiv.org/pdf/2309.07352v1.pdf","comment":"12 pages, 4 Figures, 2 Tables"},{"id":"http://arxiv.org/abs/2303.12814v3","updated":"2023-09-13T23:26:15Z","published":"2023-03-22T04:25:28Z","title":"Nowhere coexpanding functions","summary":" We define a family of $C^1$ functions which we call \"nowhere coexpanding\nfunctions\" that is closed under composition and includes all $C^3$ functions\nwith non-positive Schwarzian derivative. We establish results on the number and\nnature of the fixed points of these functions, including a generalisation of a\nclassic result of Singer.\n","authors":["Andrew Cook","Andy Hammerlindl","Warwick Tucker"],"pdf_url":"https://arxiv.org/pdf/2303.12814v3.pdf","comment":"9 pages, 3 figures"},{"id":"http://arxiv.org/abs/2309.07344v1","updated":"2023-09-13T22:48:30Z","published":"2023-09-13T22:48:30Z","title":"Efficient Learning of PDEs via Taylor Expansion and Sparse Decomposition\n into Value and Fourier Domains","summary":" Accelerating the learning of Partial Differential Equations (PDEs) from\nexperimental data will speed up the pace of scientific discovery. Previous\nrandomized algorithms exploit sparsity in PDE updates for acceleration. 
However,\nsuch methods are applicable to a limited class of decomposable PDEs, which have\nsparse features in the value domain. We propose Reel, which accelerates the\nlearning of PDEs via random projection and has much broader applicability. Reel\nexploits the sparsity by decomposing dense updates into sparse ones in both the\nvalue and frequency domains. This decomposition enables efficient learning when\nthe source of the updates consists of gradually changing terms across large\nareas (sparse in the frequency domain) in addition to a few rapid updates\nconcentrated in a small set of \"interfacial\" regions (sparse in the value\ndomain). Random projection is then applied to compress the sparse signals for\nlearning. To expand the model applicability, Taylor series expansion is used in\nReel to approximate the nonlinear PDE updates with polynomials in the\ndecomposable form. Theoretically, we derive a constant factor approximation\nbetween the projected loss function and the original one with a poly-logarithmic\nnumber of projected dimensions. Experimentally, we provide empirical evidence\nthat our proposed Reel can lead to faster learning of PDE models (70-98%\nreduction in training time when the data is compressed to 1% of its original\nsize) with quality comparable to the non-compressed models.\n","authors":["Md Nasim","Yexiang Xue"],"pdf_url":"https://arxiv.org/pdf/2309.07344v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.07339v1","updated":"2023-09-13T22:18:38Z","published":"2023-09-13T22:18:38Z","title":"Efficient quantum recurrent reinforcement learning via quantum reservoir\n computing","summary":" Quantum reinforcement learning (QRL) has emerged as a framework to solve\nsequential decision-making tasks, showcasing empirical quantum advantages. A\nnotable development is through quantum recurrent neural networks (QRNNs) for\nmemory-intensive tasks such as partially observable environments. However, QRL\nmodels incorporating QRNN encounter challenges such as inefficient training of\nQRL with QRNN, given that the computation of gradients in QRNN is both\ncomputationally expensive and time-consuming. This work presents a novel\napproach to address this challenge by constructing QRL agents utilizing\nQRNN-based reservoirs, specifically employing quantum long short-term memory\n(QLSTM). QLSTM parameters are randomly initialized and fixed without training.\nThe model is trained using the asynchronous advantage actor-critic (A3C)\nalgorithm. Through numerical simulations, we validate the efficacy of our\nQLSTM-Reservoir RL framework. Its performance is assessed on standard\nbenchmarks, demonstrating comparable results to a fully trained QLSTM RL model\nwith identical architecture and training settings.\n","authors":["Samuel Yen-Chi Chen"],"pdf_url":"https://arxiv.org/pdf/2309.07339v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.07332v1","updated":"2023-09-13T22:04:50Z","published":"2023-09-13T22:04:50Z","title":"Reliability-based cleaning of noisy training labels with inductive\n conformal prediction in multi-modal biomedical data mining","summary":" Accurately labeling biomedical data presents a challenge. Traditional\nsemi-supervised learning methods often under-utilize available unlabeled data.\nTo address this, we propose a novel reliability-based training data cleaning\nmethod employing inductive conformal prediction (ICP).
This method capitalizes\non a small set of accurately labeled training data and leverages ICP-calculated\nreliability metrics to rectify mislabeled data and outliers within vast\nquantities of noisy training data. The efficacy of the method is validated\nacross three classification tasks within distinct modalities: filtering\ndrug-induced-liver-injury (DILI) literature with title and abstract, predicting\nICU admission of COVID-19 patients through CT radiomics and electronic health\nrecords, and subtyping breast cancer using RNA-sequencing data. Varying levels\nof noise to the training labels were introduced through label permutation.\nResults show significant enhancements in classification performance: accuracy\nenhancement in 86 out of 96 DILI experiments (up to 11.4%), AUROC and AUPRC\nenhancements in all 48 COVID-19 experiments (up to 23.8% and 69.8%), and\naccuracy and macro-average F1 score improvements in 47 out of 48 RNA-sequencing\nexperiments (up to 74.6% and 89.0%). Our method offers the potential to\nsubstantially boost classification performance in multi-modal biomedical\nmachine learning tasks. Importantly, it accomplishes this without necessitating\nan excessive volume of meticulously curated training data.\n","authors":["Xianghao Zhan","Qinmei Xu","Yuanning Zheng","Guangming Lu","Olivier Gevaert"],"pdf_url":"https://arxiv.org/pdf/2309.07332v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.07315v1","updated":"2023-09-13T21:01:03Z","published":"2023-09-13T21:01:03Z","title":"Traveling Words: A Geometric Interpretation of Transformers","summary":" Transformers have significantly advanced the field of natural language\nprocessing, but comprehending their internal mechanisms remains a challenge. In\nthis paper, we introduce a novel geometric perspective that elucidates the\ninner mechanisms of transformer operations. Our primary contribution is\nillustrating how layer normalization confines the latent features to a\nhyper-sphere, subsequently enabling attention to mold the semantic\nrepresentation of words on this surface. This geometric viewpoint seamlessly\nconnects established properties such as iterative refinement and contextual\nembeddings. We validate our insights by probing a pre-trained 124M parameter\nGPT-2 model. Our findings reveal clear query-key attention patterns in early\nlayers and build upon prior observations regarding the subject-specific nature\nof attention heads at deeper layers. Harnessing these geometric insights, we\npresent an intuitive understanding of transformers, depicting them as processes\nthat model the trajectory of word particles along the hyper-sphere.\n","authors":["Raul Molina"],"pdf_url":"https://arxiv.org/pdf/2309.07315v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.04100v2","updated":"2023-09-13T20:52:56Z","published":"2023-09-08T03:41:54Z","title":"Preserved Edge Convolutional Neural Network for Sensitivity Enhancement\n of Deuterium Metabolic Imaging (DMI)","summary":" Purpose: Common to most MRSI techniques, the spatial resolution and the\nminimal scan duration of Deuterium Metabolic Imaging (DMI) are limited by the\nachievable SNR. This work presents a deep learning method for sensitivity\nenhancement of DMI.\n Methods: A convolutional neural network (CNN) was designed to estimate the\n2H-labeled metabolite concentrations from low SNR and distorted DMI FIDs. The\nCNN was trained with synthetic data that represent a range of SNR levels\ntypically encountered in vivo. 
The estimation precision was further improved by\nfine-tuning the CNN with MRI-based edge-preserving regularization for each DMI\ndataset. The proposed processing method, PReserved Edge ConvolutIonal neural\nnetwork for Sensitivity Enhanced DMI (PRECISE-DMI), was applied to simulation\nstudies and in vivo experiments to evaluate the anticipated improvements in SNR\nand investigate the potential for inaccuracies.\n Results: PRECISE-DMI visually improved the metabolic maps of low SNR\ndatasets, and quantitatively provided higher precision than the standard\nFourier reconstruction. Processing of DMI data acquired in rat brain tumor\nmodels resulted in more precise determination of 2H-labeled lactate and\nglutamate + glutamine levels, at increased spatial resolution (from >8 to 2\n$\\mu$L) or shortened scan time (from 32 to 4 min) compared to standard\nacquisitions. However, rigorous SD-bias analyses showed that overuse of the\nedge-preserving regularization can compromise the accuracy of the results.\n Conclusion: PRECISE-DMI allows a flexible trade-off between enhancing the\nsensitivity of DMI and minimizing the inaccuracies. With typical settings, the\nDMI sensitivity can be improved by 3-fold while retaining the capability to\ndetect local signal variations.\n","authors":["Siyuan Dong","Henk M. De Feyter","Monique A. Thomas","Robin A. de Graaf","James S. Duncan"],"pdf_url":"https://arxiv.org/pdf/2309.04100v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2211.02048v4","updated":"2023-09-13T20:32:50Z","published":"2022-11-03T17:59:55Z","title":"Efficient Spatially Sparse Inference for Conditional GANs and Diffusion\n Models","summary":" During image editing, existing deep generative models tend to re-synthesize\nthe entire output from scratch, including the unedited regions. This leads to a\nsignificant waste of computation, especially for minor editing operations. In\nthis work, we present Spatially Sparse Inference (SSI), a general-purpose\ntechnique that selectively performs computation for edited regions and\naccelerates various generative models, including both conditional GANs and\ndiffusion models. Our key observation is that users tend to gradually edit the\ninput image. This motivates us to cache and reuse the feature maps of the\noriginal image. Given an edited image, we sparsely apply the convolutional\nfilters to the edited regions while reusing the cached features for the\nunedited areas. Based on our algorithm, we further propose Sparse Incremental\nGenerative Engine (SIGE) to convert the computation reduction to latency\nreduction on off-the-shelf hardware. With about $1\\%$-area edits, SIGE\naccelerates DDPM by $3.0\\times$ on NVIDIA RTX 3090 and $4.6\\times$ on Apple M1\nPro GPU, Stable Diffusion by $7.2\\times$ on 3090, and GauGAN by $5.6\\times$ on\n3090 and $5.2\\times$ on M1 Pro GPU. 
Compared to our conference version, we\nextend SIGE to accommodate attention layers and apply it to Stable Diffusion.\nAdditionally, we offer support for Apple M1 Pro GPU and include more results\nwith large and sequential edits.\n","authors":["Muyang Li","Ji Lin","Chenlin Meng","Stefano Ermon","Song Han","Jun-Yan Zhu"],"pdf_url":"https://arxiv.org/pdf/2211.02048v4.pdf","comment":"NeurIPS 2022 T-PAMI 2023 Website: https://www.cs.cmu.edu/~sige/ Code:\n https://github.com/lmxyy/sige"},{"id":"http://arxiv.org/abs/2202.10629v3","updated":"2023-09-13T20:20:44Z","published":"2022-02-22T02:33:54Z","title":"Model Reprogramming: Resource-Efficient Cross-Domain Machine Learning","summary":" In data-rich domains such as vision, language, and speech, deep learning\nprevails to deliver high-performance task-specific models and can even learn\ngeneral task-agnostic representations for efficient finetuning to downstream\ntasks. However, deep learning in resource-limited domains still faces multiple\nchallenges including (i) limited data, (ii) constrained model development cost,\nand (iii) lack of adequate pre-trained models for effective finetuning. This\npaper provides an overview of model reprogramming to bridge this gap. Model\nreprogramming enables resource-efficient cross-domain machine learning by\nrepurposing and reusing a well-developed pre-trained model from a source domain\nto solve tasks in a target domain without model finetuning, where the source\nand target domains can be vastly different. In many applications, model\nreprogramming outperforms transfer learning and training from scratch. This\npaper elucidates the methodology of model reprogramming, summarizes existing\nuse cases, provides a theoretical explanation of the success of model\nreprogramming, and concludes with a discussion on open-ended research questions\nand opportunities. A list of model reprogramming studies is actively maintained\nand updated at https://github.com/IBM/model-reprogramming.\n","authors":["Pin-Yu Chen"],"pdf_url":"https://arxiv.org/pdf/2202.10629v3.pdf","comment":"Survey paper on model reprogramming; Project repository:\n https://github.com/IBM/model-reprogramming"},{"id":"http://arxiv.org/abs/2309.07289v1","updated":"2023-09-13T20:15:25Z","published":"2023-09-13T20:15:25Z","title":"User Training with Error Augmentation for Electromyogram-based Gesture\n Classification","summary":" We designed and tested a system for real-time control of a user interface by\nextracting surface electromyographic (sEMG) activity from eight electrodes in a\nwrist-band configuration. sEMG data were streamed into a machine-learning\nalgorithm that classified hand gestures in real-time. After an initial model\ncalibration, participants were presented with one of three types of feedback\nduring a human-learning stage: veridical feedback, in which predicted\nprobabilities from the gesture classification algorithm were displayed without\nalteration, modified feedback, in which we applied a hidden augmentation of\nerror to these probabilities, and no feedback. User performance was then\nevaluated in a series of minigames, in which subjects were required to use\neight gestures to manipulate their game avatar to complete a task. 
Experimental\nresults indicated that, relative to baseline, the modified feedback condition\nled to significantly improved accuracy and improved gesture class separation.\nThese findings suggest that real-time feedback in a gamified user interface\nwith manipulation of feedback may enable intuitive, rapid, and accurate task\nacquisition for sEMG-based gesture recognition applications.\n","authors":["Yunus Bicer","Niklas Smedemark-Margulies","Basak Celik","Elifnur Sunger","Ryan Orendorff","Stephanie Naufel","Tales Imbiriba","Deniz Erdo{ğ}mu{ş}","Eugene Tunik","Mathew Yarossi"],"pdf_url":"https://arxiv.org/pdf/2309.07289v1.pdf","comment":"10 pages, 10 figures"},{"id":"http://arxiv.org/abs/2105.00495v2","updated":"2023-09-13T19:34:50Z","published":"2021-05-02T15:24:33Z","title":"BAARD: Blocking Adversarial Examples by Testing for Applicability,\n Reliability and Decidability","summary":" Adversarial defenses protect machine learning models from adversarial\nattacks, but are often tailored to one type of model or attack. The lack of\ninformation on unknown potential attacks makes detecting adversarial examples\nchallenging. Additionally, attackers do not need to follow the rules made by\nthe defender. To address this problem, we take inspiration from the concept of\nApplicability Domain in cheminformatics. Cheminformatics models struggle to\nmake accurate predictions because only a limited number of compounds are known\nand available for training. Applicability Domain defines a domain based on the\nknown compounds and rejects any unknown compound that falls outside the domain.\nSimilarly, adversarial examples start as harmless inputs, but can be\nmanipulated to evade reliable classification by moving outside the domain of\nthe classifier. We are the first to identify the similarity between\nApplicability Domain and adversarial detection. Instead of focusing on unknown\nattacks, we focus on what is known, the training data. We propose a simple yet\nrobust triple-stage data-driven framework that checks the input globally and\nlocally, and confirms that they are coherent with the model's output. This\nframework can be applied to any classification model and is not limited to\nspecific attacks. We demonstrate these three stages work as one unit,\neffectively detecting various attacks, even for a white-box scenario.\n","authors":["Xinglong Chang","Katharina Dost","Kaiqi Zhao","Ambra Demontis","Fabio Roli","Gill Dobbie","Jörg Wicker"],"pdf_url":"https://arxiv.org/pdf/2105.00495v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.07277v1","updated":"2023-09-13T19:33:26Z","published":"2023-09-13T19:33:26Z","title":"Unbiased Face Synthesis With Diffusion Models: Are We There Yet?","summary":" Text-to-image diffusion models have achieved widespread popularity due to\ntheir unprecedented image generation capability. In particular, their ability\nto synthesize and modify human faces has spurred research into using generated\nface images in both training data augmentation and model performance\nassessments. In this paper, we study the efficacy and shortcomings of\ngenerative models in the context of face generation. Utilizing a combination of\nqualitative and quantitative measures, including embedding-based metrics and\nuser studies, we present a framework to audit the characteristics of generated\nfaces conditioned on a set of social attributes. We applied our framework on\nfaces generated through state-of-the-art text-to-image diffusion models. 
We\nidentify several limitations of face image generation that include faithfulness\nto the text prompt, demographic disparities, and distributional shifts.\nFurthermore, we present an analytical model that provides insights into how\ntraining data selection contributes to the performance of generative models.\n","authors":["Harrison Rosenberg","Shimaa Ahmed","Guruprasad V Ramesh","Ramya Korlakai Vinayak","Kassem Fawaz"],"pdf_url":"https://arxiv.org/pdf/2309.07277v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2302.01952v3","updated":"2023-09-13T19:26:40Z","published":"2023-02-03T19:03:10Z","title":"On a continuous time model of gradient descent dynamics and instability\n in deep learning","summary":" The recipe behind the success of deep learning has been the combination of\nneural networks and gradient-based optimization. Understanding the behavior of\ngradient descent however, and particularly its instability, has lagged behind\nits empirical success. To add to the theoretical tools available to study\ngradient descent we propose the principal flow (PF), a continuous time flow\nthat approximates gradient descent dynamics. To our knowledge, the PF is the\nonly continuous flow that captures the divergent and oscillatory behaviors of\ngradient descent, including escaping local minima and saddle points. Through\nits dependence on the eigendecomposition of the Hessian the PF sheds light on\nthe recently observed edge of stability phenomena in deep learning. Using our\nnew understanding of instability we propose a learning rate adaptation method\nwhich enables us to control the trade-off between training stability and test\nset evaluation performance.\n","authors":["Mihaela Rosca","Yan Wu","Chongli Qin","Benoit Dherin"],"pdf_url":"https://arxiv.org/pdf/2302.01952v3.pdf","comment":"Transactions of Machine Learning Research, 2023"},{"id":"http://arxiv.org/abs/2309.00964v2","updated":"2023-09-13T19:17:37Z","published":"2023-09-02T15:16:35Z","title":"eDKM: An Efficient and Accurate Train-time Weight Clustering for Large\n Language Models","summary":" Since Large Language Models or LLMs have demonstrated high-quality\nperformance on many complex language tasks, there is a great interest in\nbringing these LLMs to mobile devices for faster responses and better privacy\nprotection. However, the size of LLMs (i.e., billions of parameters) requires\nhighly effective compression to fit into storage-limited devices. Among many\ncompression techniques, weight-clustering, a form of non-linear quantization,\nis one of the leading candidates for LLM compression, and supported by modern\nsmartphones. Yet, its training overhead is prohibitively significant for LLM\nfine-tuning. Especially, Differentiable KMeans Clustering, or DKM, has shown\nthe state-of-the-art trade-off between compression ratio and accuracy\nregression, but its large memory complexity makes it nearly impossible to apply\nto train-time LLM compression. In this paper, we propose a memory-efficient DKM\nimplementation, eDKM powered by novel techniques to reduce the memory footprint\nof DKM by orders of magnitudes. For a given tensor to be saved on CPU for the\nbackward pass of DKM, we compressed the tensor by applying uniquification and\nsharding after checking if there is no duplicated tensor previously copied to\nCPU. 
Our experimental results demonstrate that \\prjname can fine-tune and\ncompress a pretrained LLaMA 7B model from 12.6 GB to 2.5 GB (3bit/weight) with\nthe Alpaca dataset by reducing the train-time memory footprint of a decoder\nlayer by 130$\\times$, while delivering good accuracy on broader LLM benchmarks\n(i.e., 77.7% for PIQA, 66.1% for Winograde, and so on).\n","authors":["Minsik Cho","Keivan A. Vahid","Qichen Fu","Saurabh Adya","Carlo C Del Mundo","Mohammad Rastegari","Devang Naik","Peter Zatloukal"],"pdf_url":"https://arxiv.org/pdf/2309.00964v2.pdf","comment":"preprint"},{"id":"http://arxiv.org/abs/2304.14541v2","updated":"2023-09-13T19:10:18Z","published":"2023-04-27T21:45:21Z","title":"Deep Spatiotemporal Clustering: A Temporal Clustering Approach for\n Multi-dimensional Climate Data","summary":" Clustering high-dimensional spatiotemporal data using an unsupervised\napproach is a challenging problem for many data-driven applications. Existing\nstate-of-the-art methods for unsupervised clustering use different similarity\nand distance functions but focus on either spatial or temporal features of the\ndata. Concentrating on joint deep representation learning of spatial and\ntemporal features, we propose Deep Spatiotemporal Clustering (DSC), a novel\nalgorithm for the temporal clustering of high-dimensional spatiotemporal data\nusing an unsupervised deep learning method. Inspired by the U-net architecture,\nDSC utilizes an autoencoder integrating CNN-RNN layers to learn latent\nrepresentations of the spatiotemporal data. DSC also includes a unique layer\nfor cluster assignment on latent representations that uses the Student's\nt-distribution. By optimizing the clustering loss and data reconstruction loss\nsimultaneously, the algorithm gradually improves clustering assignments and the\nnonlinear mapping between low-dimensional latent feature space and\nhigh-dimensional original data space. A multivariate spatiotemporal climate\ndataset is used to evaluate the efficacy of the proposed method. Our extensive\nexperiments show our approach outperforms both conventional and deep\nlearning-based unsupervised clustering algorithms. Additionally, we compared\nthe proposed model with its various variants (CNN encoder, CNN autoencoder,\nCNN-RNN encoder, CNN-RNN autoencoder, etc.) to get insight into using both the\nCNN and RNN layers in the autoencoder, and our proposed technique outperforms\nthese variants in terms of clustering results.\n","authors":["Omar Faruque","Francis Ndikum Nji","Mostafa Cham","Rohan Mandar Salvi","Xue Zheng","Jianwu Wang"],"pdf_url":"https://arxiv.org/pdf/2304.14541v2.pdf","comment":"Accepted by the European Conference on Machine Learning and\n Principles and Practice of Knowledge Discovery in Databases (ECML PKDD 2023)"},{"id":"http://arxiv.org/abs/2309.07265v1","updated":"2023-09-13T18:58:34Z","published":"2023-09-13T18:58:34Z","title":"Safe and Accelerated Deep Reinforcement Learning-based O-RAN Slicing: A\n Hybrid Transfer Learning Approach","summary":" The open radio access network (O-RAN) architecture supports intelligent\nnetwork control algorithms as one of its core capabilities. Data-driven\napplications incorporate such algorithms to optimize radio access network (RAN)\nfunctions via RAN intelligent controllers (RICs). Deep reinforcement learning\n(DRL) algorithms are among the main approaches adopted in the O-RAN literature\nto solve dynamic radio resource management problems. 
However, despite the\nbenefits introduced by the O-RAN RICs, the practical adoption of DRL algorithms\nin real network deployments falls behind. This is primarily due to the slow\nconvergence and unstable performance exhibited by DRL agents upon deployment\nand when facing previously unseen network conditions. In this paper, we address\nthese challenges by proposing transfer learning (TL) as a core component of the\ntraining and deployment workflows for the DRL-based closed-loop control of\nO-RAN functionalities. To this end, we propose and design a hybrid TL-aided\napproach that leverages the advantages of both policy reuse and distillation TL\nmethods to provide safe and accelerated convergence in DRL-based O-RAN slicing.\nWe conduct a thorough experiment that accommodates multiple services, including\nreal VR gaming traffic to reflect practical scenarios of O-RAN slicing. We also\npropose and implement policy reuse and distillation-aided DRL and non-TL-aided\nDRL as three separate baselines. The proposed hybrid approach shows at least:\n7.7% and 20.7% improvements in the average initial reward value and the\npercentage of converged scenarios, and a 64.6% decrease in reward variance\nwhile maintaining fast convergence and enhancing the generalizability compared\nwith the baselines.\n","authors":["Ahmad M. Nagib","Hatem Abou-Zeid","Hossam S. Hassanein"],"pdf_url":"https://arxiv.org/pdf/2309.07265v1.pdf","comment":"This paper has been accepted for publication in a future issue of\n IEEE Journal on Selected Areas in Communications (JSAC)"},{"id":"http://arxiv.org/abs/2309.07261v1","updated":"2023-09-13T18:53:11Z","published":"2023-09-13T18:53:11Z","title":"Simultaneous inference for generalized linear models with unmeasured\n confounders","summary":" Tens of thousands of simultaneous hypothesis tests are routinely performed in\ngenomic studies to identify differentially expressed genes. However, due to\nunmeasured confounders, many standard statistical approaches may be\nsubstantially biased. This paper investigates the large-scale hypothesis\ntesting problem for multivariate generalized linear models in the presence of\nconfounding effects. Under arbitrary confounding mechanisms, we propose a\nunified statistical estimation and inference framework that harnesses\northogonal structures and integrates linear projections into three key stages.\nIt first leverages multivariate responses to separate marginal and uncorrelated\nconfounding effects, recovering the confounding coefficients' column space.\nSubsequently, latent factors and primary effects are jointly estimated,\nutilizing $\\ell_1$-regularization for sparsity while imposing orthogonality\nonto confounding coefficients. Finally, we incorporate projected and weighted\nbias-correction steps for hypothesis testing. Theoretically, we establish\nvarious effects' identification conditions and non-asymptotic error bounds. We\nshow effective Type-I error control of asymptotic $z$-tests as sample and\nresponse sizes approach infinity. Numerical experiments demonstrate that the\nproposed method controls the false discovery rate by the Benjamini-Hochberg\nprocedure and is more powerful than alternative methods. 
By comparing\nsingle-cell RNA-seq counts from two groups of samples, we demonstrate the\nsuitability of adjusting confounding effects when significant covariates are\nabsent from the model.\n","authors":["Jin-Hong Du","Larry Wasserman","Kathryn Roeder"],"pdf_url":"https://arxiv.org/pdf/2309.07261v1.pdf","comment":"61 pages, 8 figures"},{"id":"http://arxiv.org/abs/2304.09960v3","updated":"2023-09-13T18:52:02Z","published":"2023-04-19T20:45:01Z","title":"A Latent Space Theory for Emergent Abilities in Large Language Models","summary":" Languages are not created randomly but rather to communicate information.\nThere is a strong association between languages and their underlying meanings,\nresulting in a sparse joint distribution that is heavily peaked according to\ntheir correlations. Moreover, these peak values happen to match with the\nmarginal distribution of languages due to the sparsity. With the advent of LLMs\ntrained on big data and large models, we can now precisely assess the marginal\ndistribution of languages, providing a convenient means of exploring the sparse\nstructures in the joint distribution for effective inferences. In this paper,\nwe categorize languages as either unambiguous or {\\epsilon}-ambiguous and\npresent quantitative results to demonstrate that the emergent abilities of\nLLMs, such as language understanding, in-context learning, chain-of-thought\nprompting, and effective instruction fine-tuning, can all be attributed to\nBayesian inference on the sparse joint distribution of languages.\n","authors":["Hui Jiang"],"pdf_url":"https://arxiv.org/pdf/2304.09960v3.pdf","comment":"17 pages, 3 figures"},{"id":"http://arxiv.org/abs/2309.07250v1","updated":"2023-09-13T18:38:41Z","published":"2023-09-13T18:38:41Z","title":"All you need is spin: SU(2) equivariant variational quantum circuits\n based on spin networks","summary":" Variational algorithms require architectures that naturally constrain the\noptimisation space to run efficiently. In geometric quantum machine learning,\none achieves this by encoding group structure into parameterised quantum\ncircuits to include the symmetries of a problem as an inductive bias. However,\nconstructing such circuits is challenging as a concrete guiding principle has\nyet to emerge. In this paper, we propose the use of spin networks, a form of\ndirected tensor network invariant under a group transformation, to devise SU(2)\nequivariant quantum circuit ans\\\"atze -- circuits possessing spin rotation\nsymmetry. By changing to the basis that block diagonalises SU(2) group action,\nthese networks provide a natural building block for constructing parameterised\nequivariant quantum circuits. We prove that our construction is mathematically\nequivalent to other known constructions, such as those based on twirling and\ngeneralised permutations, but more direct to implement on quantum hardware. The\nefficacy of our constructed circuits is tested by solving the ground state\nproblem of SU(2) symmetric Heisenberg models on the one-dimensional triangular\nlattice and on the Kagome lattice. Our results highlight that our equivariant\ncircuits boost the performance of quantum variational algorithms, indicating\nbroader applicability to other real-world problems.\n","authors":["Richard D. P. 
East","Guillermo Alonso-Linaje","Chae-Yeun Park"],"pdf_url":"https://arxiv.org/pdf/2309.07250v1.pdf","comment":"36+14 pages"},{"id":"http://arxiv.org/abs/2308.11721v2","updated":"2023-09-13T18:26:11Z","published":"2023-08-22T18:16:40Z","title":"When Are Two Lists Better than One?: Benefits and Harms in Joint\n Decision-making","summary":" Historically, much of machine learning research has focused on the\nperformance of the algorithm alone, but recently more attention has been\nfocused on optimizing joint human-algorithm performance. Here, we analyze a\nspecific type of human-algorithm collaboration where the algorithm has access\nto a set of $n$ items, and presents a subset of size $k$ to the human, who\nselects a final item from among those $k$. This scenario could model content\nrecommendation, route planning, or any type of labeling task. Because both the\nhuman and algorithm have imperfect, noisy information about the true ordering\nof items, the key question is: which value of $k$ maximizes the probability\nthat the best item will be ultimately selected? For $k=1$, performance is\noptimized by the algorithm acting alone, and for $k=n$ it is optimized by the\nhuman acting alone. Surprisingly, we show that for multiple of noise models, it\nis optimal to set $k \\in [2, n-1]$ - that is, there are strict benefits to\ncollaborating, even when the human and algorithm have equal accuracy\nseparately. We demonstrate this theoretically for the Mallows model and\nexperimentally for the Random Utilities models of noisy permutations. However,\nwe show this pattern is reversed when the human is anchored on the algorithm's\npresented ordering - the joint system always has strictly worse performance. We\nextend these results to the case where the human and algorithm differ in their\naccuracy levels, showing that there always exist regimes where a more accurate\nagent would strictly benefit from collaborating with a less accurate one, but\nthese regimes are asymmetric between the human and the algorithm's accuracy.\n","authors":["Kate Donahue","Sreenivas Gollapudi","Kostas Kollias"],"pdf_url":"https://arxiv.org/pdf/2308.11721v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.07235v1","updated":"2023-09-13T18:15:58Z","published":"2023-09-13T18:15:58Z","title":"Autotuning Apache TVM-based Scientific Applications Using Bayesian\n Optimization","summary":" Apache TVM (Tensor Virtual Machine), an open source machine learning compiler\nframework designed to optimize computations across various hardware platforms,\nprovides an opportunity to improve the performance of dense matrix\nfactorizations such as LU (Lower Upper) decomposition and Cholesky\ndecomposition on GPUs and AI (Artificial Intelligence) accelerators. In this\npaper, we propose a new TVM autotuning framework using Bayesian Optimization\nand use the TVM tensor expression language to implement linear algebra kernels\nsuch as LU, Cholesky, and 3mm. We use these scientific computation kernels to\nevaluate the effectiveness of our methods on a GPU cluster, called Swing, at\nArgonne National Laboratory. 
We compare the proposed autotuning framework with\nthe TVM autotuning framework AutoTVM with four tuners and find that our\nframework outperforms AutoTVM in most cases.\n","authors":["Xingfu Wu","Praveen Paramasivam","Valerie Taylor"],"pdf_url":"https://arxiv.org/pdf/2309.07235v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.07207v1","updated":"2023-09-13T18:00:00Z","published":"2023-09-13T18:00:00Z","title":"EarthPT: a foundation model for Earth Observation","summary":" We introduce EarthPT -- an Earth Observation (EO) pretrained transformer.\nEarthPT is a 700 million parameter decoding transformer foundation model\ntrained in an autoregressive self-supervised manner and developed specifically\nwith EO use-cases in mind. We demonstrate that EarthPT is an effective\nforecaster that can accurately predict future pixel-level surface reflectances\nacross the 400-2300 nm range well into the future. For example, forecasts of\nthe evolution of the Normalised Difference Vegetation Index (NDVI) have a\ntypical error of approximately 0.05 (over a natural range of -1 -> 1) at the\npixel level over a five month test set horizon, out-performing simple\nphase-folded models based on historical averaging. We also demonstrate that\nembeddings learnt by EarthPT hold semantically meaningful information and could\nbe exploited for downstream tasks such as highly granular, dynamic land use\nclassification. Excitingly, we note that the abundance of EO data provides us\nwith -- in theory -- quadrillions of training tokens. Therefore, if we assume\nthat EarthPT follows neural scaling laws akin to those derived for Large\nLanguage Models (LLMs), there is currently no data-imposed limit to scaling\nEarthPT and other similar `Large Observation Models.'\n","authors":["Michael J. Smith","Luke Fleming","James E. Geach"],"pdf_url":"https://arxiv.org/pdf/2309.07207v1.pdf","comment":"7 pages, 4 figures, submitted to NeurIPS CCAI workshop"},{"id":"http://arxiv.org/abs/2309.07200v1","updated":"2023-09-13T15:59:14Z","published":"2023-09-13T15:59:14Z","title":"Latent Representation and Simulation of Markov Processes via Time-Lagged\n Information Bottleneck","summary":" Markov processes are widely used mathematical models for describing dynamic\nsystems in various fields. However, accurately simulating large-scale systems\nat long time scales is computationally expensive due to the short time steps\nrequired for accurate integration. In this paper, we introduce an inference\nprocess that maps complex systems into a simplified representational space and\nmodels large jumps in time. To achieve this, we propose Time-lagged Information\nBottleneck (T-IB), a principled objective rooted in information theory, which\naims to capture relevant temporal features while discarding high-frequency\ninformation to simplify the simulation task and minimize the inference error.\nOur experiments demonstrate that T-IB learns information-optimal\nrepresentations for accurately modeling the statistical properties and dynamics\nof the original process at a selected time lag, outperforming existing\ntime-lagged dimensionality reduction methods.\n","authors":["Marco Federici","Patrick Forré","Ryota Tomioka","Bastiaan S. 
Veeling"],"pdf_url":"https://arxiv.org/pdf/2309.07200v1.pdf","comment":"10 pages, 14 figures"}],"Multimedia":[{"id":"http://arxiv.org/abs/2309.07115v1","updated":"2023-09-13T17:45:41Z","published":"2023-09-13T17:45:41Z","title":"Weakly-Supervised Multi-Task Learning for Audio-Visual Speaker\n Verification","summary":" In this paper, we present a methodology for achieving robust multimodal\nperson representations optimized for open-set audio-visual speaker\nverification. Distance Metric Learning (DML) approaches have typically\ndominated this problem space, owing to strong performance on new and unseen\nclasses. In our work, we explored multitask learning techniques to further\nboost performance of the DML approach and show that an auxiliary task with weak\nlabels can increase the compactness of the learned speaker representation. We\nalso extend the Generalized end-to-end loss (GE2E) to multimodal inputs and\ndemonstrate that it can achieve competitive performance in an audio-visual\nspace. Finally, we introduce a non-synchronous audio-visual sampling random\nstrategy during training time that has shown to improve generalization. Our\nnetwork achieves state of the art performance for speaker verification,\nreporting 0.244%, 0.252%, 0.441% Equal Error Rate (EER) on the three official\ntrial lists of VoxCeleb1-O/E/H, which is to our knowledge, the best published\nresults on VoxCeleb1-E and VoxCeleb1-H.\n","authors":["Anith Selvakumar","Homa Fashandi"],"pdf_url":"https://arxiv.org/pdf/2309.07115v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.07051v1","updated":"2023-09-13T16:07:25Z","published":"2023-09-13T16:07:25Z","title":"UnifiedGesture: A Unified Gesture Synthesis Model for Multiple Skeletons","summary":" The automatic co-speech gesture generation draws much attention in computer\nanimation. Previous works designed network structures on individual datasets,\nwhich resulted in a lack of data volume and generalizability across different\nmotion capture standards. In addition, it is a challenging task due to the weak\ncorrelation between speech and gestures. To address these problems, we present\nUnifiedGesture, a novel diffusion model-based speech-driven gesture synthesis\napproach, trained on multiple gesture datasets with different skeletons.\nSpecifically, we first present a retargeting network to learn latent\nhomeomorphic graphs for different motion capture standards, unifying the\nrepresentations of various gestures while extending the dataset. We then\ncapture the correlation between speech and gestures based on a diffusion model\narchitecture using cross-local attention and self-attention to generate better\nspeech-matched and realistic gestures. To further align speech and gesture and\nincrease diversity, we incorporate reinforcement learning on the discrete\ngesture units with a learned reward function. Extensive experiments show that\nUnifiedGesture outperforms recent approaches on speech-driven gesture\ngeneration in terms of CCA, FGD, and human-likeness. 
All code, pre-trained\nmodels, databases, and demos are available to the public at\nhttps://github.com/YoungSeng/UnifiedGesture.\n","authors":["Sicheng Yang","Zilin Wang","Zhiyong Wu","Minglei Li","Zhensong Zhang","Qiaochu Huang","Lei Hao","Songcen Xu","Xiaofei Wu","changpeng yang","Zonghong Dai"],"pdf_url":"https://arxiv.org/pdf/2309.07051v1.pdf","comment":"16 pages, 11 figures, ACM MM 2023"},{"id":"http://arxiv.org/abs/2309.06978v1","updated":"2023-09-13T14:13:08Z","published":"2023-09-13T14:13:08Z","title":"Differentiable JPEG: The Devil is in the Details","summary":" JPEG remains one of the most widespread lossy image coding methods. However,\nthe non-differentiable nature of JPEG restricts the application in deep\nlearning pipelines. Several differentiable approximations of JPEG have recently\nbeen proposed to address this issue. This paper conducts a comprehensive review\nof existing diff. JPEG approaches and identifies critical details that have\nbeen missed by previous methods. To this end, we propose a novel diff. JPEG\napproach, overcoming previous limitations. Our approach is differentiable\nw.r.t. the input image, the JPEG quality, the quantization tables, and the\ncolor conversion parameters. We evaluate the forward and backward performance\nof our diff. JPEG approach against existing methods. Additionally, extensive\nablations are performed to evaluate crucial design choices. Our proposed diff.\nJPEG resembles the (non-diff.) reference implementation best, significantly\nsurpassing the recent-best diff. approach by $3.47$dB (PSNR) on average. For\nstrong compression rates, we can even improve PSNR by $9.51$dB. Strong\nadversarial attack results are yielded by our diff. JPEG, demonstrating the\neffective gradient approximation. Our code is available at\nhttps://github.com/necla-ml/Diff-JPEG.\n","authors":["Christoph Reich","Biplob Debnath","Deep Patel","Srimat Chakradhar"],"pdf_url":"https://arxiv.org/pdf/2309.06978v1.pdf","comment":"Accepted at WACV 2024. Project page:\n https://christophreich1996.github.io/differentiable_jpeg/"},{"id":"http://arxiv.org/abs/2309.06877v1","updated":"2023-09-13T10:53:12Z","published":"2023-09-13T10:53:12Z","title":"Video Infringement Detection via Feature Disentanglement and Mutual\n Information Maximization","summary":" The self-media era provides us tremendous high quality videos. Unfortunately,\nfrequent video copyright infringements are now seriously damaging the interests\nand enthusiasm of video creators. Identifying infringing videos is therefore a\ncompelling task. Current state-of-the-art methods tend to simply feed\nhigh-dimensional mixed video features into deep neural networks and count on\nthe networks to extract useful representations. Despite its simplicity, this\nparadigm heavily relies on the original entangled features and lacks\nconstraints guaranteeing that useful task-relevant semantics are extracted from\nthe features.\n In this paper, we seek to tackle the above challenges from two aspects: (1)\nWe propose to disentangle an original high-dimensional feature into multiple\nsub-features, explicitly disentangling the feature into exclusive\nlower-dimensional components. We expect the sub-features to encode\nnon-overlapping semantics of the original feature and remove redundant\ninformation.\n (2) On top of the disentangled sub-features, we further learn an auxiliary\nfeature to enhance the sub-features. 
We theoretically analyzed the mutual\ninformation between the label and the disentangled features, arriving at a loss\nthat maximizes the extraction of task-relevant information from the original\nfeature.\n Extensive experiments on two large-scale benchmark datasets (i.e., SVD and\nVCSL) demonstrate that our method achieves 90.1% TOP-100 mAP on the large-scale\nSVD dataset and also sets the new state-of-the-art on the VCSL benchmark\ndataset. Our code and model have been released at\nhttps://github.com/yyyooooo/DMI/, hoping to contribute to the community.\n","authors":["Zhenguang Liu","Xinyang Yu","Ruili Wang","Shuai Ye","Zhe Ma","Jianfeng Dong","Sifeng He","Feng Qian","Xiaobo Zhang","Roger Zimmermann","Lei Yang"],"pdf_url":"https://arxiv.org/pdf/2309.06877v1.pdf","comment":"This paper is accepted by ACM MM 2023"},{"id":"http://arxiv.org/abs/2305.06908v2","updated":"2023-09-13T10:50:02Z","published":"2023-05-11T15:51:46Z","title":"CoMoSpeech: One-Step Speech and Singing Voice Synthesis via Consistency\n Model","summary":" Denoising diffusion probabilistic models (DDPMs) have shown promising\nperformance for speech synthesis. However, a large number of iterative steps\nare required to achieve high sample quality, which restricts the inference\nspeed. Maintaining sample quality while increasing sampling speed has become a\nchallenging task. In this paper, we propose a \"Co\"nsistency \"Mo\"del-based\n\"Speech\" synthesis method, CoMoSpeech, which achieve speech synthesis through a\nsingle diffusion sampling step while achieving high audio quality. The\nconsistency constraint is applied to distill a consistency model from a\nwell-designed diffusion-based teacher model, which ultimately yields superior\nperformances in the distilled CoMoSpeech. Our experiments show that by\ngenerating audio recordings by a single sampling step, the CoMoSpeech achieves\nan inference speed more than 150 times faster than real-time on a single NVIDIA\nA100 GPU, which is comparable to FastSpeech2, making diffusion-sampling based\nspeech synthesis truly practical. Meanwhile, objective and subjective\nevaluations on text-to-speech and singing voice synthesis show that the\nproposed teacher models yield the best audio quality, and the one-step sampling\nbased CoMoSpeech achieves the best inference speed with better or comparable\naudio quality to other conventional multi-step diffusion model baselines. Audio\nsamples are available at https://comospeech.github.io/.\n","authors":["Zhen Ye","Wei Xue","Xu Tan","Jie Chen","Qifeng Liu","Yike Guo"],"pdf_url":"https://arxiv.org/pdf/2305.06908v2.pdf","comment":"Accepted to ACM MM 2023"},{"id":"http://arxiv.org/abs/2309.06844v1","updated":"2023-09-13T09:49:20Z","published":"2023-09-13T09:49:20Z","title":"Gpachov at CheckThat! 2023: A Diverse Multi-Approach Ensemble for\n Subjectivity Detection in News Articles","summary":" The wide-spread use of social networks has given rise to subjective,\nmisleading, and even false information on the Internet. Thus, subjectivity\ndetection can play an important role in ensuring the objectiveness and the\nquality of a piece of information. This paper presents the solution built by\nthe Gpachov team for the CLEF-2023 CheckThat! lab Task~2 on subjectivity\ndetection. Three different research directions are explored. The first one is\nbased on fine-tuning a sentence embeddings encoder model and dimensionality\nreduction. 
The second one explores a sample-efficient few-shot learning model.\nThe third one evaluates fine-tuning a multilingual transformer on an altered\ndataset, using data from multiple languages. Finally, the three approaches are\ncombined in a simple majority voting ensemble, resulting in 0.77 macro F1 on\nthe test set and achieving 2nd place on the English subtask.\n","authors":["Georgi Pachov","Dimitar Dimitrov","Ivan Koychev","Preslav Nakov"],"pdf_url":"https://arxiv.org/pdf/2309.06844v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.06745v1","updated":"2023-09-13T06:31:35Z","published":"2023-09-13T06:31:35Z","title":"VEATIC: Video-based Emotion and Affect Tracking in Context Dataset","summary":" Human affect recognition has been a significant topic in psychophysics and\ncomputer vision. However, the currently published datasets have many\nlimitations. For example, most datasets contain frames that contain only\ninformation about facial expressions. Due to the limitations of previous\ndatasets, it is very hard to either understand the mechanisms for affect\nrecognition of humans or generalize well on common cases for computer vision\nmodels trained on those datasets. In this work, we introduce a brand new large\ndataset, the Video-based Emotion and Affect Tracking in Context Dataset\n(VEATIC), that can conquer the limitations of the previous datasets. VEATIC has\n124 video clips from Hollywood movies, documentaries, and home videos with\ncontinuous valence and arousal ratings of each frame via real-time annotation.\nAlong with the dataset, we propose a new computer vision task to infer the\naffect of the selected character via both context and character information in\neach video frame. Additionally, we propose a simple model to benchmark this new\ncomputer vision task. We also compare the performance of the pretrained model\nusing our dataset with other similar datasets. Experiments show the competing\nresults of our pretrained model via VEATIC, indicating the generalizability of\nVEATIC. Our dataset is available at https://veatic.github.io.\n","authors":["Zhihang Ren","Jefferson Ortega","Yifan Wang","Zhimin Chen","David Whitney","Yunhui Guo","Stella X. Yu"],"pdf_url":"https://arxiv.org/pdf/2309.06745v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.06728v1","updated":"2023-09-13T05:05:47Z","published":"2023-09-13T05:05:47Z","title":"Leveraging Foundation models for Unsupervised Audio-Visual Segmentation","summary":" Audio-Visual Segmentation (AVS) aims to precisely outline audible objects in\na visual scene at the pixel level. Existing AVS methods require fine-grained\nannotations of audio-mask pairs in supervised learning fashion. This limits\ntheir scalability since it is time consuming and tedious to acquire such\ncross-modality pixel level labels. To overcome this obstacle, in this work we\nintroduce unsupervised audio-visual segmentation with no need for task-specific\ndata annotations and model training. For tackling this newly proposed problem,\nwe formulate a novel Cross-Modality Semantic Filtering (CMSF) approach to\naccurately associate the underlying audio-mask pairs by leveraging the\noff-the-shelf multi-modal foundation models (e.g., detection [1], open-world\nsegmentation [2] and multi-modal alignment [3]). Guiding the proposal\ngeneration by either audio or visual cues, we design two training-free\nvariants: AT-GDINO-SAM and OWOD-BIND. 
Extensive experiments on the AVS-Bench\ndataset show that our unsupervised approach can perform well in comparison to\nprior-art supervised counterparts across complex scenarios with multiple\nauditory objects. Particularly, in situations where existing supervised AVS\nmethods struggle with overlapping foreground objects, our models still excel in\naccurately segmenting overlapped auditory objects. Our code will be publicly\nreleased.\n","authors":["Swapnil Bhosale","Haosen Yang","Diptesh Kanojia","Xiatian Zhu"],"pdf_url":"https://arxiv.org/pdf/2309.06728v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.06723v1","updated":"2023-09-13T04:54:44Z","published":"2023-09-13T04:54:44Z","title":"PIAVE: A Pose-Invariant Audio-Visual Speaker Extraction Network","summary":" It is common in everyday spoken communication that we look at the turning\nhead of a talker to listen to his/her voice. Humans see the talker to listen\nbetter, and so do machines. However, previous studies on audio-visual speaker\nextraction have not effectively handled the varying talking face. This paper\nstudies how to take full advantage of the varying talking face. We propose a\nPose-Invariant Audio-Visual Speaker Extraction Network (PIAVE) that\nincorporates an additional pose-invariant view to improve audio-visual speaker\nextraction. Specifically, we generate the pose-invariant view from each\noriginal pose orientation, which enables the model to receive a consistent\nfrontal view of the talker regardless of his/her head pose, therefore, forming\na multi-view visual input for the speaker. Experiments on the multi-view MEAD\nand in-the-wild LRS3 dataset demonstrate that PIAVE outperforms the\nstate-of-the-art and is more robust to pose variations.\n","authors":["Qinghua Liu","Meng Ge","Zhizheng Wu","Haizhou Li"],"pdf_url":"https://arxiv.org/pdf/2309.06723v1.pdf","comment":"Interspeech 2023"},{"id":"http://arxiv.org/abs/2308.04025v2","updated":"2023-09-13T04:52:00Z","published":"2023-08-08T03:43:24Z","title":"MSAC: Multiple Speech Attribute Control Method for Reliable Speech\n Emotion Recognition","summary":" Despite significant progress, speech emotion recognition (SER) remains\nchallenging due to the inherent complexity and ambiguity of the emotion attribute,\nparticularly in the wild. Whereas current studies primarily focus on\nrecognition and generalization abilities, this work pioneers an investigation\ninto the reliability of SER methods and explores the modeling of speech emotion\nbased on data distribution across various speech attributes. Specifically, a\nnovel CNN-based SER model that adopts additive margin softmax loss is first\ndesigned. Second, a novel multiple speech attribute control method MSAC is\nproposed to explicitly control speech attributes, enabling the model to be less\naffected by emotion-agnostic features and extract fine-grained emotion-related\nrepresentations. Third, we make a first attempt to examine the reliability of\nour proposed unified SER workflow using the out-of-distribution detection\nmethod. Experiments on both single and cross-corpus SER scenarios show that our\nproposed unified SER workflow consistently outperforms the baseline in all\naspects. 
Remarkably, in single-corpus SER, the proposed SER workflow achieves\nsuperior recognition results with a WAR of 72.97% and a UAR of 71.76% on the\nIEMOCAP corpus.\n","authors":["Yu Pan","Yuguang Yang","Yuheng Huang","Jingjing Yin","Yanni Hu","Heng Lu","Lei Ma","Jianjun Zhao"],"pdf_url":"https://arxiv.org/pdf/2308.04025v2.pdf","comment":"5 pages"},{"id":"http://arxiv.org/abs/2306.07848v8","updated":"2023-09-13T04:48:23Z","published":"2023-06-13T15:28:10Z","title":"GEmo-CLAP: Gender-Attribute-Enhanced Contrastive Language-Audio\n Pretraining for Accurate Speech Emotion Recognition","summary":" Contrastive cross-modality pretraining has recently exhibited impressive\nsuccess in diverse fields, whereas there is limited research on their merits in\nspeech emotion recognition (SER). In this paper, we propose GEmo-CLAP, a kind\nof gender-attribute-enhanced contrastive language-audio pretraining (CLAP)\nmethod for SER. Specifically, we first construct an effective emotion CLAP\n(Emo-CLAP) for SER, using pre-trained text and audio encoders. Second, given\nthe significance of gender information in SER, two novel multi-task learning\nbased GEmo-CLAP (ML-GEmo-CLAP) and soft label based GEmo-CLAP (SL-GEmo-CLAP)\nmodels are further proposed to incorporate gender information of speech\nsignals, forming more reasonable objectives. Experiments on IEMOCAP indicate\nthat our proposed two GEmo-CLAPs consistently outperform Emo-CLAP with\ndifferent pre-trained models. Remarkably, the proposed WavLM-based SL-GEmo-CLAP\nobtains the best UAR of 81.43% and WAR of 83.16%, which performs better than\nstate-of-the-art SER methods by at least 3%. Our system is open-sourced on\nGithub.\n","authors":["Yu Pan","Yanni Hu","Yuguang Yang","Wen Fei","Jixun Yao","Heng Lu","Lei Ma","Jianjun Zhao"],"pdf_url":"https://arxiv.org/pdf/2306.07848v8.pdf","comment":"5 pages"},{"id":"http://arxiv.org/abs/2111.12663v3","updated":"2023-09-13T21:54:35Z","published":"2021-11-24T17:51:16Z","title":"PointPCA: Point Cloud Objective Quality Assessment Using PCA-Based\n Descriptors","summary":" Point clouds denote a prominent solution for the representation of 3D\nphoto-realistic content in immersive applications. Similarly to other imaging\nmodalities, quality predictions for point cloud contents are vital for a wide\nrange of applications, enabling trade-off optimizations between data quality\nand data size in every processing step from acquisition to rendering. In this\nwork, we focus on use cases that consider human end-users consuming point cloud\ncontents and, hence, we concentrate on visual quality metrics. In particular,\nwe propose a set of perceptually relevant descriptors based on Principal\nComponent Analysis (PCA) decomposition, which is applied to both geometry and\ntexture data for full-reference point cloud quality assessment. Statistical\nfeatures are derived from these descriptors to characterize local shape and\nappearance properties for both a reference and a distorted point cloud. The\nextracted statistical features are subsequently compared to provide\ncorresponding predictions of visual quality for the distorted point cloud. As\npart of our method, a learning-based approach is proposed to fuse these\nindividual predictors to a unified perceptual score. We validate the accuracy\nof the individual predictors, as well as the unified quality scores obtained\nafter regression against subjectively annotated datasets, showing that our\nmetric outperforms state-of-the-art solutions. 
Insights regarding design\ndecisions are provided through exploratory studies, evaluating the performance\nof our metric under different parameter configurations, attribute domains,\ncolor spaces, and regression models. A software implementation of the proposed\nmetric is made available at the following link:\nhttps://github.com/cwi-dis/pointpca_suite.\n","authors":["Evangelos Alexiou","Xuemei Zhou","Irene Viola","Pablo Cesar"],"pdf_url":"https://arxiv.org/pdf/2111.12663v3.pdf","comment":"14 pages, 7 figures, 6 tables"},{"id":"http://arxiv.org/abs/2309.07314v1","updated":"2023-09-13T21:00:09Z","published":"2023-09-13T21:00:09Z","title":"AudioSR: Versatile Audio Super-resolution at Scale","summary":" Audio super-resolution is a fundamental task that predicts high-frequency\ncomponents for low-resolution audio, enhancing audio quality in digital\napplications. Previous methods have limitations such as the limited scope of\naudio types (e.g., music, speech) and specific bandwidth settings they can\nhandle (e.g., 4kHz to 8kHz). In this paper, we introduce a diffusion-based\ngenerative model, AudioSR, that is capable of performing robust audio\nsuper-resolution on versatile audio types, including sound effects, music, and\nspeech. Specifically, AudioSR can upsample any input audio signal within the\nbandwidth range of 2kHz to 16kHz to a high-resolution audio signal at 24kHz\nbandwidth with a sampling rate of 48kHz. Extensive objective evaluation on\nvarious audio super-resolution benchmarks demonstrates the strong results\nachieved by the proposed model. In addition, our subjective evaluation shows\nthat AudioSR can act as a plug-and-play module to enhance the generation\nquality of a wide range of audio generative models, including AudioLDM,\nFastspeech2, and MusicGen. Our code and demo are available at\nhttps://audioldm.github.io/audiosr.\n","authors":["Haohe Liu","Ke Chen","Qiao Tian","Wenwu Wang","Mark D. Plumbley"],"pdf_url":"https://arxiv.org/pdf/2309.07314v1.pdf","comment":"Under review. Demo and code: https://audioldm.github.io/audiosr"}]},"2023-09-14T00:00:00Z":{"Computation and Language":[{"id":"http://arxiv.org/abs/2309.07915v1","updated":"2023-09-14T17:59:17Z","published":"2023-09-14T17:59:17Z","title":"MMICL: Empowering Vision-language Model with Multi-Modal In-Context\n Learning","summary":" Starting from the resurgence of deep learning, vision-language models (VLMs)\nbenefiting from large language models (LLMs) have never been so popular.\nHowever, while LLMs can utilize extensive background knowledge and task\ninformation with in-context learning, most VLMs still struggle with\nunderstanding complex multi-modal prompts with multiple images. The issue can be\ntraced back to the architectural design of VLMs or pre-training data.\nSpecifically, the current VLMs primarily emphasize utilizing multi-modal data\nwith a single image, rather than multi-modal prompts with interleaved\nmultiple images and text. Even though some newly proposed VLMs could handle\nuser prompts with multiple images, pre-training data does not provide more\nsophisticated multi-modal prompts than interleaved image and text crawled from\nthe web. We propose MMICL to address the issue by considering both the model\nand data perspectives. 
We introduce a well-designed architecture capable of\nseamlessly integrating visual and textual context in an interleaved manner, and the\nMIC dataset to reduce the gap between the training data and the complex user\nprompts in real-world applications, including: 1) multi-modal context with\ninterleaved images and text, 2) textual references for each image, and 3)\nmulti-image data with spatial, logical, or temporal relationships. Our\nexperiments confirm that MMICL achieves new state-of-the-art zero-shot and\nfew-shot performance on a wide range of general vision-language tasks,\nespecially for complex reasoning benchmarks including MME and MMBench. Our\nanalysis demonstrates that MMICL effectively deals with the challenge of\ncomplex multi-modal prompt understanding. The experiments on ScienceQA-IMG also\nshow that MMICL successfully alleviates the issue of language bias in VLMs,\nwhich we believe is the reason behind the advanced performance of MMICL.\n","authors":["Haozhe Zhao","Zefan Cai","Shuzheng Si","Xiaojian Ma","Kaikai An","Liang Chen","Zixuan Liu","Sheng Wang","Wenjuan Han","Baobao Chang"],"pdf_url":"https://arxiv.org/pdf/2309.07915v1.pdf","comment":"Code, dataset, checkpoints, and demos are available at\n \\href{https://github.com/HaozheZhao/MIC}{https://github.com/HaozheZhao/MIC}"},{"id":"http://arxiv.org/abs/2309.07900v1","updated":"2023-09-14T17:48:34Z","published":"2023-09-14T17:48:34Z","title":"Ambiguity-Aware In-Context Learning with Large Language Models","summary":" In-context learning (ICL), i.e., showing LLMs only a few task-specific\ndemonstrations, has led to downstream gains with no task-specific fine-tuning\nrequired. However, LLMs are sensitive to the choice of prompts, and therefore a\ncrucial research question is how to select good demonstrations for ICL. One\neffective strategy is leveraging semantic similarity between the ICL\ndemonstrations and test inputs by using a text retriever, which however is\nsub-optimal as that does not consider the LLM's existing knowledge about that\ntask. From prior work (Min et al., 2022), we already know that labels paired\nwith the demonstrations bias the model predictions. This leads us to our\nhypothesis of whether considering the LLM's existing knowledge about the task,\nespecially with respect to the output label space, can help in a better\ndemonstration selection strategy. Through extensive experimentation on three\ntext classification tasks, we find that it is beneficial to not only choose\nsemantically similar ICL demonstrations but also to choose those demonstrations\nthat help resolve the inherent label ambiguity surrounding the test example.\nInterestingly, we find that including demonstrations that the LLM previously\nmis-classified and also fall on the test example's decision boundary brings\nthe most performance gain.\n","authors":["Lingyu Gao","Aditi Chaudhary","Krishna Srinivasan","Kazuma Hashimoto","Karthik Raman","Michael Bendersky"],"pdf_url":"https://arxiv.org/pdf/2309.07900v1.pdf","comment":"13 pages in total"},{"id":"http://arxiv.org/abs/2309.07875v1","updated":"2023-09-14T17:23:37Z","published":"2023-09-14T17:23:37Z","title":"Safety-Tuned LLaMAs: Lessons From Improving the Safety of Large Language\n Models that Follow Instructions","summary":" Training large language models to follow instructions makes them perform\nbetter on a wide range of tasks, generally becoming more helpful. However, a\nperfectly helpful model will follow even the most malicious instructions and\nreadily generate harmful content. 
In this paper, we raise concerns over the\nsafety of models that only emphasize helpfulness, not safety, in their\ninstruction-tuning. We show that several popular instruction-tuned models are\nhighly unsafe. Moreover, we show that adding just 3% safety examples (a few\nhundred demonstrations) in the training set when fine-tuning a model like LLaMA\ncan substantially improve their safety. Our safety-tuning does not make models\nsignificantly less capable or helpful as measured by standard benchmarks.\nHowever, we do find a behavior of exaggerated safety, where too much\nsafety-tuning makes models refuse to respond to reasonable prompts that\nsuperficially resemble unsafe ones. Our study sheds light on trade-offs in\ntraining LLMs to follow instructions and exhibit safe behavior.\n","authors":["Federico Bianchi","Mirac Suzgun","Giuseppe Attanasio","Paul Röttger","Dan Jurafsky","Tatsunori Hashimoto","James Zou"],"pdf_url":"https://arxiv.org/pdf/2309.07875v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.07870v1","updated":"2023-09-14T17:18:25Z","published":"2023-09-14T17:18:25Z","title":"Agents: An Open-source Framework for Autonomous Language Agents","summary":" Recent advances on large language models (LLMs) enable researchers and\ndevelopers to build autonomous language agents that can automatically solve\nvarious tasks and interact with environments, humans, and other agents using\nnatural language interfaces. We consider language agents as a promising\ndirection towards artificial general intelligence and release Agents, an\nopen-source library with the goal of opening up these advances to a wider\nnon-specialist audience. Agents is carefully engineered to support important\nfeatures including planning, memory, tool usage, multi-agent communication, and\nfine-grained symbolic control. Agents is user-friendly as it enables\nnon-specialists to build, customize, test, tune, and deploy state-of-the-art\nautonomous language agents without much coding. The library is also\nresearch-friendly as its modularized design makes it easily extensible for\nresearchers. Agents is available at https://github.com/aiwaves-cn/agents.\n","authors":["Wangchunshu Zhou","Yuchen Eleanor Jiang","Long Li","Jialong Wu","Tiannan Wang","Shi Qiu","Jintian Zhang","Jing Chen","Ruipu Wu","Shuai Wang","Shiding Zhu","Jiyu Chen","Wentao Zhang","Ningyu Zhang","Huajun Chen","Peng Cui","Mrinmaya Sachan"],"pdf_url":"https://arxiv.org/pdf/2309.07870v1.pdf","comment":"Code available at https://github.com/aiwaves-cn/agents"},{"id":"http://arxiv.org/abs/2309.07864v1","updated":"2023-09-14T17:12:03Z","published":"2023-09-14T17:12:03Z","title":"The Rise and Potential of Large Language Model Based Agents: A Survey","summary":" For a long time, humanity has pursued artificial intelligence (AI) equivalent\nto or surpassing the human level, with AI agents considered a promising vehicle\nfor this pursuit. AI agents are artificial entities that sense their\nenvironment, make decisions, and take actions. Many efforts have been made to\ndevelop intelligent AI agents since the mid-20th century. However, these\nefforts have mainly focused on advancement in algorithms or training strategies\nto enhance specific capabilities or performance on particular tasks. Actually,\nwhat the community lacks is a sufficiently general and powerful model to serve\nas a starting point for designing AI agents that can adapt to diverse\nscenarios. 
Due to the versatile and remarkable capabilities they demonstrate,\nlarge language models (LLMs) are regarded as potential sparks for Artificial\nGeneral Intelligence (AGI), offering hope for building general AI agents. Many\nresearch efforts have leveraged LLMs as the foundation to build AI agents and\nhave achieved significant progress. We start by tracing the concept of agents\nfrom its philosophical origins to its development in AI, and explain why LLMs\nare suitable foundations for AI agents. Building upon this, we present a\nconceptual framework for LLM-based agents, comprising three main components:\nbrain, perception, and action, and the framework can be tailored to suit\ndifferent applications. Subsequently, we explore the extensive applications of\nLLM-based agents in three aspects: single-agent scenarios, multi-agent\nscenarios, and human-agent cooperation. Following this, we delve into agent\nsocieties, exploring the behavior and personality of LLM-based agents, the\nsocial phenomena that emerge when they form societies, and the insights they\noffer for human society. Finally, we discuss a range of key topics and open\nproblems within the field.\n","authors":["Zhiheng Xi","Wenxiang Chen","Xin Guo","Wei He","Yiwen Ding","Boyang Hong","Ming Zhang","Junzhe Wang","Senjie Jin","Enyu Zhou","Rui Zheng","Xiaoran Fan","Xiao Wang","Limao Xiong","Qin Liu","Yuhao Zhou","Weiran Wang","Changhao Jiang","Yicheng Zou","Xiangyang Liu","Zhangyue Yin","Shihan Dou","Rongxiang Weng","Wensen Cheng","Qi Zhang","Wenjuan Qin","Yongyan Zheng","Xipeng Qiu","Xuanjing Huan","Tao Gui"],"pdf_url":"https://arxiv.org/pdf/2309.07864v1.pdf","comment":"86 pages, 12 figures"},{"id":"http://arxiv.org/abs/2309.07861v1","updated":"2023-09-14T17:10:39Z","published":"2023-09-14T17:10:39Z","title":"CiwaGAN: Articulatory information exchange","summary":" Humans encode information into sounds by controlling articulators and decode\ninformation from sounds using the auditory apparatus. This paper introduces\nCiwaGAN, a model of human spoken language acquisition that combines\nunsupervised articulatory modeling with an unsupervised model of information\nexchange through the auditory modality. While prior research includes\nunsupervised articulatory modeling and information exchange separately, our\nmodel is the first to combine the two components. The paper also proposes an\nimproved articulatory model with more interpretable internal representations.\nThe proposed CiwaGAN model is the most realistic approximation of human spoken\nlanguage acquisition using deep learning. As such, it is useful for cognitively\nplausible simulations of the human speech act.\n","authors":["Gašper Beguš","Thomas Lu","Alan Zhou","Peter Wu","Gopala K. Anumanchipalli"],"pdf_url":"https://arxiv.org/pdf/2309.07861v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.12966v2","updated":"2023-09-14T17:08:39Z","published":"2023-08-24T17:59:17Z","title":"Qwen-VL: A Versatile Vision-Language Model for Understanding,\n Localization, Text Reading, and Beyond","summary":" We introduce the Qwen-VL series, a set of large-scale vision-language models\n(LVLMs) designed to perceive and understand both text and images. Comprising\nQwen-VL and Qwen-VL-Chat, these models exhibit remarkable performance in tasks\nlike image captioning, question answering, visual localization, and flexible\ninteraction. The evaluation covers a wide range of tasks including zero-shot\ncaptioning, visual or document visual question answering, and grounding. 
We\ndemonstrate the Qwen-VL outperforms existing LVLMs. We present their\narchitecture, training, capabilities, and performance, highlighting their\ncontributions to advancing multimodal artificial intelligence. Code, demo and\nmodels are available at https://github.com/QwenLM/Qwen-VL.\n","authors":["Jinze Bai","Shuai Bai","Shusheng Yang","Shijie Wang","Sinan Tan","Peng Wang","Junyang Lin","Chang Zhou","Jingren Zhou"],"pdf_url":"https://arxiv.org/pdf/2308.12966v2.pdf","comment":"Code, demo and models are available at\n https://github.com/QwenLM/Qwen-VL"},{"id":"http://arxiv.org/abs/2309.07852v1","updated":"2023-09-14T16:54:34Z","published":"2023-09-14T16:54:34Z","title":"ExpertQA: Expert-Curated Questions and Attributed Answers","summary":" As language models are adapted by a more sophisticated and diverse set of\nusers, the importance of guaranteeing that they provide factually correct\ninformation supported by verifiable sources is critical across fields of study\n& professions. This is especially the case for high-stakes fields, such as\nmedicine and law, where the risk of propagating false information is high and\ncan lead to undesirable societal consequences. Previous work studying\nfactuality and attribution has not focused on analyzing these characteristics\nof language model outputs in domain-specific scenarios. In this work, we\npresent an evaluation study analyzing various axes of factuality and\nattribution provided in responses from a few systems, by bringing domain\nexperts in the loop. Specifically, we first collect expert-curated questions\nfrom 484 participants across 32 fields of study, and then ask the same experts\nto evaluate generated responses to their own questions. We also ask experts to\nrevise answers produced by language models, which leads to ExpertQA, a\nhigh-quality long-form QA dataset with 2177 questions spanning 32 fields, along\nwith verified answers and attributions for claims in the answers.\n","authors":["Chaitanya Malaviya","Subin Lee","Sihao Chen","Elizabeth Sieber","Mark Yatskar","Dan Roth"],"pdf_url":"https://arxiv.org/pdf/2309.07852v1.pdf","comment":"Dataset & code is available at\n https://github.com/chaitanyamalaviya/expertqa"},{"id":"http://arxiv.org/abs/2309.07822v1","updated":"2023-09-14T16:16:40Z","published":"2023-09-14T16:16:40Z","title":"CATfOOD: Counterfactual Augmented Training for Improving Out-of-Domain\n Performance and Calibration","summary":" In recent years, large language models (LLMs) have shown remarkable\ncapabilities at scale, particularly at generating text conditioned on a prompt.\nIn our work, we investigate the use of LLMs to augment training data of small\nlanguage models~(SLMs) with automatically generated counterfactual~(CF)\ninstances -- i.e. minimally altered inputs -- in order to improve\nout-of-domain~(OOD) performance of SLMs in the extractive question\nanswering~(QA) setup. We show that, across various LLM generators, such data\naugmentation consistently enhances OOD performance and improves model\ncalibration for both confidence-based and rationale-augmented calibrator\nmodels. 
Furthermore, these performance improvements correlate with higher\ndiversity of CF instances in terms of their surface form and semantic content.\nFinally, we show that CF augmented models which are easier to calibrate also\nexhibit much lower entropy when assigning importance, indicating that\nrationale-augmented calibrators prefer concise explanations.\n","authors":["Rachneet Sachdeva","Martin Tutek","Iryna Gurevych"],"pdf_url":"https://arxiv.org/pdf/2309.07822v1.pdf","comment":"We make our code available at: https://github.com/UKPLab/CATfOOD"},{"id":"http://arxiv.org/abs/2309.07812v1","updated":"2023-09-14T15:59:16Z","published":"2023-09-14T15:59:16Z","title":"Text Classification of Cancer Clinical Trial Eligibility Criteria","summary":" Automatic identification of clinical trials for which a patient is eligible\nis complicated by the fact that trial eligibility is stated in natural\nlanguage. A potential solution to this problem is to employ text classification\nmethods for common types of eligibility criteria. In this study, we focus on\nseven common exclusion criteria in cancer trials: prior malignancy, human\nimmunodeficiency virus, hepatitis B, hepatitis C, psychiatric illness,\ndrug/substance abuse, and autoimmune illness. Our dataset consists of 764 phase\nIII cancer trials with these exclusions annotated at the trial level. We\nexperiment with common transformer models as well as a new pre-trained clinical\ntrial BERT model. Our results demonstrate the feasibility of automatically\nclassifying common exclusion criteria. Additionally, we demonstrate the value\nof a pre-trained language model specifically for clinical trials, which yields\nthe highest average performance across all criteria.\n","authors":["Yumeng Yang","Soumya Jayaraj","Ethan B Ludmir","Kirk Roberts"],"pdf_url":"https://arxiv.org/pdf/2309.07812v1.pdf","comment":"AMIA Annual Symposium Proceedings 2023"},{"id":"http://arxiv.org/abs/2309.07804v1","updated":"2023-09-14T15:46:41Z","published":"2023-09-14T15:46:41Z","title":"Pop Quiz! Do Pre-trained Code Models Possess Knowledge of Correct API\n Names?","summary":" Recent breakthroughs in pre-trained code models, such as CodeBERT and Codex,\nhave shown their superior performance in various downstream tasks. The\ncorrectness and unambiguity of API usage among these code models are crucial\nfor achieving desirable program functionalities, requiring them to learn\nvarious API fully qualified names structurally and semantically. Recent studies\nreveal that even state-of-the-art pre-trained code models struggle with\nsuggesting the correct APIs during code generation. However, the reasons for\nsuch poor API usage performance are barely investigated. To address this\nchallenge, we propose using knowledge probing as a means of interpreting code\nmodels, which uses cloze-style tests to measure the knowledge stored in models.\nOur comprehensive study examines a code model's capability of understanding API\nfully qualified names from two different perspectives: API call and API import.\nSpecifically, we reveal that current code models struggle with understanding\nAPI names, with pre-training strategies significantly affecting the quality of\nAPI name learning. We demonstrate that natural language context can assist code\nmodels in locating Python API names and generalize Python API name knowledge to\nunseen data. 
Our findings provide insights into the limitations and\ncapabilities of current pre-trained code models, and suggest that incorporating\nAPI structure into the pre-training process can improve automated API usage and\ncode representations. This work provides significance for advancing code\nintelligence practices and direction for future studies. All experiment\nresults, data and source code used in this work are available at\n\\url{https://doi.org/10.5281/zenodo.7902072}.\n","authors":["Terry Yue Zhuo","Xiaoning Du","Zhenchang Xing","Jiamou Sun","Haowei Quan","Li Li","Liming Zhu"],"pdf_url":"https://arxiv.org/pdf/2309.07804v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.07797v1","updated":"2023-09-14T15:36:10Z","published":"2023-09-14T15:36:10Z","title":"The Dynamical Principles of Storytelling","summary":" When considering the opening part of 1800 short stories, we find that the\nfirst dozen paragraphs of the average narrative follow an action principle as\ndefined in arXiv:2309.06600. When the order of the paragraphs is shuffled, the\naverage no longer exhibits this property. The findings show that there is a\npreferential direction we take in semantic space when starting a story,\npossibly related to a common Western storytelling tradition as implied by\nAristotle in Poetics.\n","authors":["Isidoros Doxas","James Meiss","Steven Bottone","Tom Strelich","Andrew Plummer","Adrienne Breland","Simon Dennis","Kathy Garvin-Doxas","Michael Klymkowsky"],"pdf_url":"https://arxiv.org/pdf/2309.07797v1.pdf","comment":"6 pages, 4 figures, 3 tables"},{"id":"http://arxiv.org/abs/2309.07794v1","updated":"2023-09-14T15:30:59Z","published":"2023-09-14T15:30:59Z","title":"Improving Multimodal Classification of Social Media Posts by Leveraging\n Image-Text Auxiliary tasks","summary":" Effectively leveraging multimodal information from social media posts is\nessential to various downstream tasks such as sentiment analysis, sarcasm\ndetection and hate speech classification. However, combining text and image\ninformation is challenging because of the idiosyncratic cross-modal semantics\nwith hidden or complementary information present in matching image-text pairs.\nIn this work, we aim to directly model this by proposing the use of two\nauxiliary losses jointly with the main task when fine-tuning any pre-trained\nmultimodal model. Image-Text Contrastive (ITC) brings image-text\nrepresentations of a post closer together and separates them from different\nposts, capturing underlying dependencies. Image-Text Matching (ITM) facilitates\nthe understanding of semantic correspondence between images and text by\npenalizing unrelated pairs. We combine these objectives with five multimodal\nmodels, demonstrating consistent improvements across four popular social media\ndatasets. Furthermore, through detailed analysis, we shed light on the specific\nscenarios and cases where each auxiliary task proves to be most effective.\n","authors":["Danae Sánchez Villegas","Daniel Preoţiuc-Pietro","Nikolaos Aletras"],"pdf_url":"https://arxiv.org/pdf/2309.07794v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.07773v1","updated":"2023-09-14T15:02:05Z","published":"2023-09-14T15:02:05Z","title":"Usability Evaluation of Spoken Humanoid Embodied Conversational Agents\n in Mobile Serious Games","summary":" This paper presents an empirical investigation of the extent to which spoken\nHumanoid Embodied Conversational Agents (HECAs) can foster usability in mobile\nserious game (MSG) applications. 
The aim of the research is to assess the\nimpact of multiple agents and illusion of humanness on the quality of the\ninteraction. The experiment investigates two styles of agent presentation: an\nagent of high human-likeness (HECA) and an agent of low human-likeness (text).\nThe purpose of the experiment is to assess whether and how agents of high\nhumanlikeness can evoke the illusion of humanness and affect usability. Agents\nof high human-likeness were designed by following the ECA design model that is\na proposed guide for ECA development. The results of the experiment with 90\nparticipants show that users prefer to interact with the HECAs. The difference\nbetween the two versions is statistically significant with a large effect size\n(d=1.01), with many of the participants justifying their choice by saying that\nthe human-like characteristics of the HECA made the version more appealing.\nThis research provides key information on the potential effect of HECAs on\nserious games, which can provide insight into the design of future mobile\nserious games.\n","authors":["Danai Korre","Judy Robertson"],"pdf_url":"https://arxiv.org/pdf/2309.07773v1.pdf","comment":"45 pages, 9 figures, 14 tables"},{"id":"http://arxiv.org/abs/2309.07765v1","updated":"2023-09-14T14:51:51Z","published":"2023-09-14T14:51:51Z","title":"Echotune: A Modular Extractor Leveraging the Variable-Length Nature of\n Speech in ASR Tasks","summary":" The Transformer architecture has proven to be highly effective for Automatic\nSpeech Recognition (ASR) tasks, becoming a foundational component for a\nplethora of research in the domain. Historically, many approaches have leaned\non fixed-length attention windows, which becomes problematic for varied speech\nsamples in duration and complexity, leading to data over-smoothing and neglect\nof essential long-term connectivity. Addressing this limitation, we introduce\nEcho-MSA, a nimble module equipped with a variable-length attention mechanism\nthat accommodates a range of speech sample complexities and durations. This\nmodule offers the flexibility to extract speech features across various\ngranularities, spanning from frames and phonemes to words and discourse. The\nproposed design captures the variable length feature of speech and addresses\nthe limitations of fixed-length attention. Our evaluation leverages a parallel\nattention architecture complemented by a dynamic gating mechanism that\namalgamates traditional attention with the Echo-MSA module output. Empirical\nevidence from our study reveals that integrating Echo-MSA into the primary\nmodel's training regime significantly enhances the word error rate (WER)\nperformance, all while preserving the intrinsic stability of the original\nmodel.\n","authors":["Sizhou Chen","Songyang Gao","Sen Fang"],"pdf_url":"https://arxiv.org/pdf/2309.07765v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.07759v1","updated":"2023-09-14T14:45:47Z","published":"2023-09-14T14:45:47Z","title":"PROGrasp: Pragmatic Human-Robot Communication for Object Grasping","summary":" Interactive Object Grasping (IOG) is the task of identifying and grasping the\ndesired object via human-robot natural language interaction. Current IOG\nsystems assume that a human user initially specifies the target object's\ncategory (e.g., bottle). 
Inspired by pragmatics, where humans often convey\ntheir intentions by relying on context to achieve goals, we introduce a new IOG\ntask, Pragmatic-IOG, and the corresponding dataset, Intention-oriented\nMulti-modal Dialogue (IM-Dial). In our proposed task scenario, an\nintention-oriented utterance (e.g., \"I am thirsty\") is initially given to the\nrobot. The robot should then identify the target object by interacting with a\nhuman user. Based on the task setup, we propose a new robotic system that can\ninterpret the user's intention and pick up the target object, Pragmatic Object\nGrasping (PROGrasp). PROGrasp performs Pragmatic-IOG by incorporating modules\nfor visual grounding, question asking, object grasping, and most importantly,\nanswer interpretation for pragmatic inference. Experimental results show that\nPROGrasp is effective in offline (i.e., target object discovery) and online\n(i.e., IOG with a physical robot arm) settings.\n","authors":["Gi-Cheon Kang","Junghyun Kim","Jaein Kim","Byoung-Tak Zhang"],"pdf_url":"https://arxiv.org/pdf/2309.07759v1.pdf","comment":"7 pages, 6 figures"},{"id":"http://arxiv.org/abs/2306.14565v2","updated":"2023-09-14T14:41:52Z","published":"2023-06-26T10:26:33Z","title":"Mitigating Hallucination in Large Multi-Modal Models via Robust\n Instruction Tuning","summary":" Despite the promising progress in multi-modal tasks, current large\nmulti-modal models (LMM) are prone to hallucinating inconsistent descriptions\nwith respect to the associated image and human instructions. This paper\naddresses this issue by introducing the first large and diverse visual\ninstruction tuning dataset, named Large-scale Robust Visual (LRV)-Instruction.\nOur dataset consists of 120k visual instructions generated by GPT4, covering 16\nvision-and-language tasks with open-ended instructions and answers. Unlike\nexisting studies that primarily focus on positive instruction samples, we\ndesign LRV-Instruction to include both positive and negative instructions for\nmore robust visual instruction tuning. Our negative instructions are designed\nat two semantic levels: (i) Nonexistent Element Manipulation and (ii) Existent\nElement Manipulation. To efficiently measure the hallucination generated by\nLMMs, we propose GPT4-Assisted Visual Instruction Evaluation (GAVIE), a novel\napproach to evaluate visual instruction tuning without the need for\nhuman-annotated groundtruth answers and can adapt to diverse instruction\nformats. We conduct comprehensive experiments to investigate the hallucination\nof LMMs. Our results demonstrate that existing LMMs exhibit significant\nhallucination when presented with our negative instructions, particularly with\nExistent Element Manipulation instructions. Moreover, by finetuning MiniGPT4 on\nLRV-Instruction, we successfully mitigate hallucination while improving\nperformance on public datasets using less training data compared to\nstate-of-the-art methods. Additionally, we observed that a balanced ratio of\npositive and negative instances in the training data leads to a more robust\nmodel. Updates of our project are available at\nhttps://fuxiaoliu.github.io/LRV/.\n","authors":["Fuxiao Liu","Kevin Lin","Linjie Li","Jianfeng Wang","Yaser Yacoob","Lijuan Wang"],"pdf_url":"https://arxiv.org/pdf/2306.14565v2.pdf","comment":"35 pages, 27 figures. 
Under Review"},{"id":"http://arxiv.org/abs/2309.07755v1","updated":"2023-09-14T14:41:46Z","published":"2023-09-14T14:41:46Z","title":"Generative AI Text Classification using Ensemble LLM Approaches","summary":" Large Language Models (LLMs) have shown impressive performance across a\nvariety of Artificial Intelligence (AI) and natural language processing tasks,\nsuch as content creation, report generation, etc. However, unregulated malign\napplication of these models can create undesirable consequences such as\ngeneration of fake news, plagiarism, etc. As a result, accurate detection of\nAI-generated language can be crucial in responsible usage of LLMs. In this\nwork, we explore 1) whether a certain body of text is AI generated or written\nby human, and 2) attribution of a specific language model in generating a body\nof text. Texts in both English and Spanish are considered. The datasets used in\nthis study are provided as part of the Automated Text Identification\n(AuTexTification) shared task. For each of the research objectives stated\nabove, we propose an ensemble neural model that generates probabilities from\ndifferent pre-trained LLMs which are used as features to a Traditional Machine\nLearning (TML) classifier following it. For the first task of distinguishing\nbetween AI and human generated text, our model ranked in fifth and thirteenth\nplace (with macro $F1$ scores of 0.733 and 0.649) for English and Spanish\ntexts, respectively. For the second task on model attribution, our model ranked\nin first place with macro $F1$ scores of 0.625 and 0.653 for English and\nSpanish texts, respectively.\n","authors":["Harika Abburi","Michael Suesserman","Nirmala Pudota","Balaji Veeramani","Edward Bowen","Sanmitra Bhattacharya"],"pdf_url":"https://arxiv.org/pdf/2309.07755v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.00917v2","updated":"2023-09-14T14:25:37Z","published":"2023-09-02T11:46:41Z","title":"Knowledge Graph Embeddings for Multi-Lingual Structured Representations\n of Radiology Reports","summary":" The way we analyse clinical texts has undergone major changes over the last\nyears. The introduction of language models such as BERT led to adaptations for\nthe (bio)medical domain like PubMedBERT and ClinicalBERT. These models rely on\nlarge databases of archived medical documents. While performing well in terms\nof accuracy, both the lack of interpretability and limitations to transfer\nacross languages limit their use in clinical setting. We introduce a novel\nlight-weight graph-based embedding method specifically catering radiology\nreports. It takes into account the structure and composition of the report,\nwhile also connecting medical terms in the report through the multi-lingual\nSNOMED Clinical Terms knowledge base. The resulting graph embedding uncovers\nthe underlying relationships among clinical terms, achieving a representation\nthat is better understandable for clinicians and clinically more accurate,\nwithout reliance on large pre-training datasets. We show the use of this\nembedding on two tasks namely disease classification of X-ray reports and image\nclassification. For disease classification our model is competitive with its\nBERT-based counterparts, while being magnitudes smaller in size and training\ndata requirements. 
For image classification, we show the effectiveness of the\ngraph embedding in leveraging cross-modal knowledge transfer and show how this\nmethod is usable across different languages.\n","authors":["Tom van Sonsbeek","Xiantong Zhen","Marcel Worring"],"pdf_url":"https://arxiv.org/pdf/2309.00917v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.07739v1","updated":"2023-09-14T14:18:07Z","published":"2023-09-14T14:18:07Z","title":"The complementary roles of non-verbal cues for Robust Pronunciation\n Assessment","summary":" Research on pronunciation assessment systems focuses on utilizing phonetic\nand phonological aspects of non-native (L2) speech, often neglecting the rich\nlayer of information hidden within the non-verbal cues. In this study, we\npropose a novel pronunciation assessment framework, IntraVerbalPA. The\nframework innovatively incorporates both fine-grained frame- and abstract\nutterance-level non-verbal cues, alongside the conventional speech and phoneme\nrepresentations. Additionally, we introduce the ''Goodness of phonemic-duration''\nmetric to effectively model duration distribution within the framework. Our\nresults validate the effectiveness of the proposed IntraVerbalPA framework and\nits individual components, yielding performance that either matches or\noutperforms existing research works.\n","authors":["Yassine El Kheir","Shammur Absar Chowdhury","Ahmed Ali"],"pdf_url":"https://arxiv.org/pdf/2309.07739v1.pdf","comment":"5 pages, submitted to ICASSP 2024"},{"id":"http://arxiv.org/abs/2309.07733v1","updated":"2023-09-14T14:12:34Z","published":"2023-09-14T14:12:34Z","title":"Explaining Speech Classification Models via Word-Level Audio Segments\n and Paralinguistic Features","summary":" Recent advances in eXplainable AI (XAI) have provided new insights into how\nmodels for vision, language, and tabular data operate. However, few approaches\nexist for understanding speech models. Existing work focuses on a few spoken\nlanguage understanding (SLU) tasks, and explanations are difficult to interpret\nfor most users. We introduce a new approach to explain speech classification\nmodels. We generate easy-to-interpret explanations via input perturbation on\ntwo information levels. 1) Word-level explanations reveal how each word-related\naudio segment impacts the outcome. 2) Paralinguistic features (e.g., prosody\nand background noise) answer the counterfactual: ``What would the model\nprediction be if we edited the audio signal in this way?'' We validate our\napproach by explaining two state-of-the-art SLU models on two speech\nclassification tasks in English and Italian. Our findings demonstrate that the\nexplanations are faithful to the model's inner workings and plausible to\nhumans. Our method and findings pave the way for future research on\ninterpreting speech models.\n","authors":["Eliana Pastor","Alkis Koudounas","Giuseppe Attanasio","Dirk Hovy","Elena Baralis"],"pdf_url":"https://arxiv.org/pdf/2309.07733v1.pdf","comment":"8 pages"},{"id":"http://arxiv.org/abs/2309.07727v1","updated":"2023-09-14T14:03:48Z","published":"2023-09-14T14:03:48Z","title":"PerPLM: Personalized Fine-tuning of Pretrained Language Models via\n Writer-specific Intermediate Learning and Prompts","summary":" The meanings of words and phrases depend not only on where they are used\n(contexts) but also on who uses them (writers). Pretrained language models\n(PLMs) are powerful tools for capturing context, but they are typically\npretrained and fine-tuned for universal use across different writers. 
This\nstudy aims to improve the accuracy of text understanding tasks by personalizing\nthe fine-tuning of PLMs for specific writers. We focus on a general setting\nwhere only the plain text from target writers are available for\npersonalization. To avoid the cost of fine-tuning and storing multiple copies\nof PLMs for different users, we exhaustively explore using writer-specific\nprompts to personalize a unified PLM. Since the design and evaluation of these\nprompts is an underdeveloped area, we introduce and compare different types of\nprompts that are possible in our setting. To maximize the potential of\nprompt-based personalized fine-tuning, we propose a personalized intermediate\nlearning based on masked language modeling to extract task-independent traits\nof writers' text. Our experiments, using multiple tasks, datasets, and PLMs,\nreveal the nature of different prompts and the effectiveness of our\nintermediate learning approach.\n","authors":["Daisuke Oba","Naoki Yoshinaga","Masashi Toyoda"],"pdf_url":"https://arxiv.org/pdf/2309.07727v1.pdf","comment":"11 pages"},{"id":"http://arxiv.org/abs/2305.14171v2","updated":"2023-09-14T13:58:01Z","published":"2023-05-23T15:43:04Z","title":"Probing in Context: Toward Building Robust Classifiers via Probing Large\n Language Models","summary":" Large language models are able to learn new tasks in context, where they are\nprovided with instructions and a few annotated examples. However, the\neffectiveness of in-context learning is dependent on the provided context, and\nthe performance on a downstream task can vary considerably, depending on the\ninstruction. Importantly, such dependency on the context can surface in\nunpredictable ways, e.g., a seemingly more informative instruction might lead\nto a worse performance. In this paper, we propose an alternative approach,\nwhich we term in-context probing. Similar to in-context learning, we\ncontextualize the representation of the input with an instruction, but instead\nof decoding the output prediction, we probe the contextualized representation\nto predict the label. Through a series of experiments on a diverse set of\nclassification tasks, we show that in-context probing is significantly more\nrobust to changes in instructions. We further show that probing performs\ncompetitive or superior to finetuning and can be particularly helpful to build\nclassifiers on top of smaller models, and with only a hundred training\nexamples.\n","authors":["Afra Amini","Massimiliano Ciaramita"],"pdf_url":"https://arxiv.org/pdf/2305.14171v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.07719v1","updated":"2023-09-14T13:53:17Z","published":"2023-09-14T13:53:17Z","title":"L1-aware Multilingual Mispronunciation Detection Framework","summary":" The phonological discrepancies between a speaker's native (L1) and the\nnon-native language (L2) serves as a major factor for mispronunciation. This\npaper introduces a novel multilingual MDD architecture, L1-MultiMDD, enriched\nwith L1-aware speech representation. An end-to-end speech encoder is trained on\nthe input signal and its corresponding reference phoneme sequence. First, an\nattention mechanism is deployed to align the input audio with the reference\nphoneme sequence. Afterwards, the L1-L2-speech embedding are extracted from an\nauxiliary model, pretrained in a multi-task setup identifying L1 and L2\nlanguage, and are infused with the primary network. 
Finally, L1-MultiMDD is\noptimized for a unified multilingual phoneme recognition task using\nconnectionist temporal classification (CTC) loss for the target languages:\nEnglish, Arabic, and Mandarin. Our experiments demonstrate the effectiveness of\nthe proposed L1-MultiMDD framework on both seen -- L2-ARTIC, LATIC, and\nAraVoiceL2v2; and unseen -- EpaDB and Speechocean762 datasets. The consistent\ngains in PER and false rejection rate (FRR) across all target languages\nconfirm our approach's robustness, efficacy, and generalizability.\n","authors":["Yassine El Kheir","Shammur Absar Chowdhury","Ahmed Ali"],"pdf_url":"https://arxiv.org/pdf/2309.07719v1.pdf","comment":"5 pages, submitted to ICASSP 2024"},{"id":"http://arxiv.org/abs/2309.07707v1","updated":"2023-09-14T13:38:02Z","published":"2023-09-14T13:38:02Z","title":"CoLLD: Contrastive Layer-to-layer Distillation for Compressing\n Multilingual Pre-trained Speech Encoders","summary":" Large-scale self-supervised pre-trained speech encoders outperform\nconventional approaches in speech recognition and translation tasks. Due to the\nhigh cost of developing these large models, building new encoders for new tasks\nand deploying them to on-device applications are infeasible. Prior studies\npropose model compression methods to address this issue, but those works focus\non smaller models and less realistic tasks. Thus, we propose Contrastive\nLayer-to-layer Distillation (CoLLD), a novel knowledge distillation method to\ncompress pre-trained speech encoders by leveraging masked prediction and\ncontrastive learning to train student models to copy the behavior of a large\nteacher model. CoLLD outperforms prior methods and closes the gap between small\nand large models on multilingual speech-to-text translation and recognition\nbenchmarks.\n","authors":["Heng-Jui Chang","Ning Dong","Ruslan Mavlyutov","Sravya Popuri","Yu-An Chung"],"pdf_url":"https://arxiv.org/pdf/2309.07707v1.pdf","comment":"Submitted to ICASSP 2024"},{"id":"http://arxiv.org/abs/2309.07694v1","updated":"2023-09-14T13:14:51Z","published":"2023-09-14T13:14:51Z","title":"Tree of Uncertain Thoughts Reasoning for Large Language Models","summary":" While the recently introduced Tree of Thoughts (ToT) has heralded\nadvancements in allowing Large Language Models (LLMs) to reason through\nforesight and backtracking for global decision-making, it has overlooked the\ninherent local uncertainties in intermediate decision points or \"thoughts\".\nThese local uncertainties, intrinsic to LLMs given their potential for diverse\nresponses, remain a significant concern in the reasoning process. Addressing\nthis pivotal gap, we introduce the Tree of Uncertain Thoughts (TouT) - a\nreasoning framework tailored for LLMs. Our TouT effectively leverages Monte\nCarlo Dropout to quantify uncertainty scores associated with LLMs' diverse\nlocal responses at these intermediate steps. By marrying this local uncertainty\nquantification with global search algorithms, TouT enhances the model's\nprecision in response generation. 
We substantiate our approach with rigorous\nexperiments on two demanding planning tasks: Game of 24 and Mini Crosswords.\nThe empirical evidence underscores TouT's superiority over both ToT and\nchain-of-thought prompting methods.\n","authors":["Shentong Mo","Miao Xin"],"pdf_url":"https://arxiv.org/pdf/2309.07694v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.07689v1","updated":"2023-09-14T13:05:20Z","published":"2023-09-14T13:05:20Z","title":"Detecting ChatGPT: A Survey of the State of Detecting ChatGPT-Generated\n Text","summary":" While recent advancements in the capabilities and widespread accessibility of\ngenerative language models, such as ChatGPT (OpenAI, 2022), have brought about\nvarious benefits by generating fluent human-like text, the task of\ndistinguishing between human- and large language model (LLM) generated text has\nemerged as a crucial problem. These models can potentially deceive by\ngenerating artificial text that appears to be human-generated. This issue is\nparticularly significant in domains such as law, education, and science, where\nensuring the integrity of text is of the utmost importance. This survey\nprovides an overview of the current approaches employed to differentiate\nbetween texts generated by humans and ChatGPT. We present an account of the\ndifferent datasets constructed for detecting ChatGPT-generated text, the\nvarious methods utilized, what qualitative analyses into the characteristics of\nhuman versus ChatGPT-generated text have been performed, and finally, summarize\nour findings into general insights.\n","authors":["Mahdi Dhaini","Wessel Poelman","Ege Erdogan"],"pdf_url":"https://arxiv.org/pdf/2309.07689v1.pdf","comment":"Published in the Proceedings of the Student Research Workshop\n associated with RANLP-2023"},{"id":"http://arxiv.org/abs/2309.05918v3","updated":"2023-09-14T12:58:39Z","published":"2023-09-12T02:14:05Z","title":"Stochastic LLMs do not Understand Language: Towards Symbolic,\n Explainable and Ontologically Based LLMs","summary":" In our opinion the exuberance surrounding the relative success of data-driven\nlarge language models (LLMs) is slightly misguided and for several reasons: (i)\nLLMs cannot be relied upon for factual information since for LLMs all ingested\ntext (factual or non-factual) was created equal; (ii) due to their subsymbolic\nnature, whatever 'knowledge' these models acquire about language will always\nbe buried in billions of microfeatures (weights), none of which is meaningful\non its own; and (iii) LLMs will often fail to make the correct inferences in\nseveral linguistic contexts (e.g., nominal compounds, copredication, quantifier\nscope ambiguities, intensional contexts). Since we believe the relative success\nof data-driven large language models (LLMs) is not a reflection on the symbolic\nvs. subsymbolic debate but a reflection on applying the successful strategy of\na bottom-up reverse engineering of language at scale, we suggest in this paper\napplying the effective bottom-up strategy in a symbolic setting resulting in\nsymbolic, explainable, and ontologically grounded language models.\n","authors":["Walid S. 
Saba"],"pdf_url":"https://arxiv.org/pdf/2309.05918v3.pdf","comment":"17 pages"},{"id":"http://arxiv.org/abs/2309.07683v1","updated":"2023-09-14T12:58:30Z","published":"2023-09-14T12:58:30Z","title":"Assessing the nature of large language models: A caution against\n anthropocentrism","summary":" Generative AI models garnered a large amount of public attention and\nspeculation with the release of OpenAIs chatbot, ChatGPT. At least two opinion\ncamps exist: one excited about possibilities these models offer for fundamental\nchanges to human tasks, and another highly concerned about power these models\nseem to have. To address these concerns, we assessed GPT3.5 using standard,\nnormed, and validated cognitive and personality measures. For this seedling\nproject, we developed a battery of tests that allowed us to estimate the\nboundaries of some of these models capabilities, how stable those capabilities\nare over a short period of time, and how they compare to humans.\n Our results indicate that GPT 3.5 is unlikely to have developed sentience,\nalthough its ability to respond to personality inventories is interesting. It\ndid display large variability in both cognitive and personality measures over\nrepeated observations, which is not expected if it had a human-like\npersonality. Variability notwithstanding, GPT3.5 displays what in a human would\nbe considered poor mental health, including low self-esteem and marked\ndissociation from reality despite upbeat and helpful responses.\n","authors":["Ann Speed"],"pdf_url":"https://arxiv.org/pdf/2309.07683v1.pdf","comment":"30 pages, 6 figures"},{"id":"http://arxiv.org/abs/2309.07682v1","updated":"2023-09-14T12:55:23Z","published":"2023-09-14T12:55:23Z","title":"A Conversation is Worth A Thousand Recommendations: A Survey of Holistic\n Conversational Recommender Systems","summary":" Conversational recommender systems (CRS) generate recommendations through an\ninteractive process. However, not all CRS approaches use human conversations as\ntheir source of interaction data; the majority of prior CRS work simulates\ninteractions by exchanging entity-level information. As a result, claims of\nprior CRS work do not generalise to real-world settings where conversations\ntake unexpected turns, or where conversational and intent understanding is not\nperfect. To tackle this challenge, the research community has started to\nexamine holistic CRS, which are trained using conversational data collected\nfrom real-world scenarios. Despite their emergence, such holistic approaches\nare under-explored.\n We present a comprehensive survey of holistic CRS methods by summarizing the\nliterature in a structured manner. Our survey recognises holistic CRS\napproaches as having three components: 1) a backbone language model, the\noptional use of 2) external knowledge, and/or 3) external guidance. We also\ngive a detailed analysis of CRS datasets and evaluation methods in real\napplication scenarios. 
We offer our insight as to the current challenges of\nholistic CRS and possible future trends.\n","authors":["Chuang Li","Hengchang Hu","Yan Zhang","Min-Yen Kan","Haizhou Li"],"pdf_url":"https://arxiv.org/pdf/2309.07682v1.pdf","comment":"Accepted by 5th KaRS Workshop @ ACM RecSys 2023, 8 pages"},{"id":"http://arxiv.org/abs/2309.07677v1","updated":"2023-09-14T12:43:26Z","published":"2023-09-14T12:43:26Z","title":"Aligning Speakers: Evaluating and Visualizing Text-based Diarization\n Using Efficient Multiple Sequence Alignment (Extended Version)","summary":" This paper presents a novel evaluation approach to text-based speaker\ndiarization (SD), tackling the limitations of traditional metrics that do not\naccount for any contextual information in text. Two new metrics are proposed,\nText-based Diarization Error Rate and Diarization F1, which perform utterance-\nand word-level evaluations by aligning tokens in reference and hypothesis\ntranscripts. Our metrics encompass more types of errors compared to existing\nones, allowing us to make a more comprehensive analysis in SD. To align tokens,\na multiple sequence alignment algorithm is introduced that supports multiple\nsequences in the reference while handling high-dimensional alignment to the\nhypothesis using dynamic programming. Our work is packaged into two tools,\nalign4d providing an API for our alignment algorithm and TranscribeView for\nvisualizing and evaluating SD errors, which can greatly aid in the creation of\nhigh-quality data, fostering the advancement of dialogue systems.\n","authors":["Chen Gong","Peilin Wu","Jinho D. Choi"],"pdf_url":"https://arxiv.org/pdf/2309.07677v1.pdf","comment":"Accepted to the 35th IEEE International Conference on Tools with\n Artificial Intelligence (ICTAI) 2023"},{"id":"http://arxiv.org/abs/2309.07650v1","updated":"2023-09-14T12:16:21Z","published":"2023-09-14T12:16:21Z","title":"Automatic Data Visualization Generation from Chinese Natural Language\n Questions","summary":" Data visualization has emerged as an effective tool for getting insights from\nmassive datasets. Due to the hardness of manipulating the programming languages\nof data visualization, automatic data visualization generation from natural\nlanguages (Text-to-Vis) is becoming increasingly popular. Despite the plethora\nof research effort on the English Text-to-Vis, studies have yet to be conducted\non data visualization generation from questions in Chinese. Motivated by this,\nwe propose a Chinese Text-to-Vis dataset in the paper and demonstrate our first\nattempt to tackle this problem. Our model integrates multilingual BERT as the\nencoder, boosts the cross-lingual ability, and infuses the $n$-gram information\ninto our word representation learning. Our experimental results show that our\ndataset is challenging and deserves further research.\n","authors":["Yan Ge","Victor Junqiu Wei","Yuanfeng Song","Jason Chen Zhang","Raymond Chi-Wing Wong"],"pdf_url":"https://arxiv.org/pdf/2309.07650v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.07648v1","updated":"2023-09-14T12:14:49Z","published":"2023-09-14T12:14:49Z","title":"Incorporating Class-based Language Model for Named Entity Recognition in\n Factorized Neural Transducer","summary":" In spite of the excellent strides made by end-to-end (E2E) models in speech\nrecognition in recent years, named entity recognition is still challenging but\ncritical for semantic understanding. 
In order to enhance the ability to\nrecognize named entities in E2E models, previous studies mainly focus on\nvarious rule-based or attention-based contextual biasing algorithms. However,\ntheir performance might be sensitive to the biasing weight or degraded by\nexcessive attention to the named entity list, along with a risk of false\ntriggering. Inspired by the success of the class-based language model (LM) in\nnamed entity recognition in conventional hybrid systems and the effective\ndecoupling of acoustic and linguistic information in the factorized neural\nTransducer (FNT), we propose a novel E2E model to incorporate class-based LMs\ninto FNT, which is referred as C-FNT. In C-FNT, the language model score of\nnamed entities can be associated with the name class instead of its surface\nform. The experimental results show that our proposed C-FNT presents\nsignificant error reduction in named entities without hurting performance in\ngeneral word recognition.\n","authors":["Peng Wang","Yifan Yang","Zheng Liang","Tian Tan","Shiliang Zhang","Xie Chen"],"pdf_url":"https://arxiv.org/pdf/2309.07648v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2306.16143v3","updated":"2023-09-14T11:41:40Z","published":"2023-06-28T12:17:45Z","title":"Generative User-Experience Research for Developing Domain-specific\n Natural Language Processing Applications","summary":" User experience (UX) is a part of human-computer interaction (HCI) research\nand focuses on increasing intuitiveness, transparency, simplicity, and trust\nfor system users. Most of the UX research for machine learning (ML) or natural\nlanguage processing (NLP) focuses on a data-driven methodology, i.e., it fails\nto focus on users' requirements, and engages domain users mainly for usability\nevaluation. Moreover, more typical UX methods tailor the systems towards user\nusability, unlike learning about the user needs first. The paper proposes a\nmethodology for integrating generative UX research into developing domain NLP\napplications. Generative UX research employs domain users at the initial stages\nof prototype development, i.e., ideation and concept evaluation, and the last\nstage for evaluating the change in user value. In the case study, we report the\nfull-cycle prototype development of a domain-specific semantic search for daily\noperations in the process industry. Our case study shows that involving domain\nexperts increases their interest and trust in the final NLP application.\nMoreover, we show that synergetic UX+NLP research efficiently considers data-\nand user-driven opportunities and constraints, which can be crucial for NLP\napplications in narrow domains\n","authors":["Anastasia Zhukova","Lukas von Sperl","Christian E. Matt","Bela Gipp"],"pdf_url":"https://arxiv.org/pdf/2306.16143v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.07624v1","updated":"2023-09-14T11:40:30Z","published":"2023-09-14T11:40:30Z","title":"Dynamic MOdularized Reasoning for Compositional Structured Explanation\n Generation","summary":" Despite the success of neural models in solving reasoning tasks, their\ncompositional generalization capabilities remain unclear. In this work, we\npropose a new setting of the structured explanation generation task to\nfacilitate compositional reasoning research. Previous works found that symbolic\nmethods achieve superior compositionality by using pre-defined inference rules\nfor iterative reasoning. But these approaches rely on brittle symbolic\ntransfers and are restricted to well-defined tasks. 
Hence, we propose a dynamic\nmodularized reasoning model, MORSE, to improve the compositional generalization\nof neural models. MORSE factorizes the inference process into a combination of\nmodules, where each module represents a functional unit. Specifically, we adopt\nmodularized self-attention to dynamically select and route inputs to dedicated\nheads, which specializes them to specific functions. We conduct experiments for\nincreasing lengths and shapes of reasoning trees on two benchmarks to test\nMORSE's compositional generalization abilities, and find it outperforms\ncompetitive baselines. Model ablation and deeper analyses show the\neffectiveness of dynamic reasoning modules and their generalization abilities.\n","authors":["Xiyan Fu","Anette Frank"],"pdf_url":"https://arxiv.org/pdf/2309.07624v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.07606v1","updated":"2023-09-14T11:13:36Z","published":"2023-09-14T11:13:36Z","title":"Zero-shot Audio Topic Reranking using Large Language Models","summary":" The Multimodal Video Search by Examples (MVSE) project investigates using\nvideo clips as the query term for information retrieval, rather than the more\ntraditional text query. This enables far richer search modalities such as\nimages, speaker, content, topic, and emotion. A key element for this process is\nhighly rapid, flexible, search to support large archives, which in MVSE is\nfacilitated by representing video attributes by embeddings. This work aims to\nmitigate any performance loss from this rapid archive search by examining\nreranking approaches. In particular, zero-shot reranking methods using large\nlanguage models are investigated as these are applicable to any video archive\naudio content. Performance is evaluated for topic-based retrieval on a publicly\navailable video archive, the BBC Rewind corpus. Results demonstrate that\nreranking can achieve improved retrieval ranking without the need for any\ntask-specific training data.\n","authors":["Mengjie Qian","Rao Ma","Adian Liusie","Erfan Loweimi","Kate M. Knill","Mark J. F. Gales"],"pdf_url":"https://arxiv.org/pdf/2309.07606v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.07601v1","updated":"2023-09-14T11:06:51Z","published":"2023-09-14T11:06:51Z","title":"Detecting Misinformation with LLM-Predicted Credibility Signals and Weak\n Supervision","summary":" Credibility signals represent a wide range of heuristics that are typically\nused by journalists and fact-checkers to assess the veracity of online content.\nAutomating the task of credibility signal extraction, however, is very\nchallenging as it requires high-accuracy signal-specific extractors to be\ntrained, while there are currently no sufficiently large datasets annotated\nwith all credibility signals. This paper investigates whether large language\nmodels (LLMs) can be prompted effectively with a set of 18 credibility signals\nto produce weak labels for each signal. We then aggregate these potentially\nnoisy labels using weak supervision in order to predict content veracity. We\ndemonstrate that our approach, which combines zero-shot LLM credibility signal\nlabeling and weak supervision, outperforms state-of-the-art classifiers on two\nmisinformation datasets without using any ground-truth labels for training. We\nalso analyse the contribution of the individual credibility signals towards\npredicting content veracity, which provides new valuable insights into their\nrole in misinformation detection.\n","authors":["João A. 
Leite","Olesya Razuvayevskaya","Kalina Bontcheva","Carolina Scarton"],"pdf_url":"https://arxiv.org/pdf/2309.07601v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.07597v1","updated":"2023-09-14T10:57:50Z","published":"2023-09-14T10:57:50Z","title":"C-Pack: Packaged Resources To Advance General Chinese Embedding","summary":" We introduce C-Pack, a package of resources that significantly advance the\nfield of general Chinese embeddings. C-Pack includes three critical resources.\n1) C-MTEB is a comprehensive benchmark for Chinese text embeddings covering 6\ntasks and 35 datasets. 2) C-MTP is a massive text embedding dataset curated\nfrom labeled and unlabeled Chinese corpora for training embedding models. 3)\nC-TEM is a family of embedding models covering multiple sizes. Our models\noutperform all prior Chinese text embeddings on C-MTEB by up to +10% upon the\ntime of the release. We also integrate and optimize the entire suite of\ntraining methods for C-TEM. Along with our resources on general Chinese\nembedding, we release our data and models for English text embeddings. The\nEnglish models achieve state-of-the-art performance on MTEB benchmark;\nmeanwhile, our released English data is 2 times larger than the Chinese data.\nAll these resources are made publicly available at\nhttps://github.com/FlagOpen/FlagEmbedding.\n","authors":["Shitao Xiao","Zheng Liu","Peitian Zhang","Niklas Muennighof"],"pdf_url":"https://arxiv.org/pdf/2309.07597v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.07590v1","updated":"2023-09-14T10:49:16Z","published":"2023-09-14T10:49:16Z","title":"Revisiting Supertagging for HPSG","summary":" We present new supertaggers trained on HPSG-based treebanks. These treebanks\nfeature high-quality annotation based on a well-developed linguistic theory and\ninclude diverse and challenging test datasets, beyond the usual WSJ section 23\nand Wikipedia data. HPSG supertagging has previously relied on MaxEnt-based\nmodels. We use SVM and neural CRF- and BERT-based methods and show that both\nSVM and neural supertaggers achieve considerably higher accuracy compared to\nthe baseline. Our fine-tuned BERT-based tagger achieves 97.26% accuracy on 1000\nsentences from WSJ23 and 93.88% on the completely out-of-domain The Cathedral\nand the Bazaar (cb)). We conclude that it therefore makes sense to integrate\nthese new supertaggers into modern HPSG parsers, and we also hope that the\ndiverse and difficult datasets we used here will gain more popularity in the\nfield. We contribute the complete dataset reformatted for token classification.\n","authors":["Olga Zamaraeva","Carlos Gómez-Rodríguez"],"pdf_url":"https://arxiv.org/pdf/2309.07590v1.pdf","comment":"9 pages, 0 figures"},{"id":"http://arxiv.org/abs/2309.07561v1","updated":"2023-09-14T09:44:46Z","published":"2023-09-14T09:44:46Z","title":"Adaptive Prompt Learning with Distilled Connective Knowledge for\n Implicit Discourse Relation Recognition","summary":" Implicit discourse relation recognition (IDRR) aims at recognizing the\ndiscourse relation between two text segments without an explicit connective.\nRecently, the prompt learning has just been applied to the IDRR task with great\nperformance improvements over various neural network-based approaches. 
However,\nthe discrete nature of the state-of-the-art prompting approach requires manual\ndesign of templates and answers, a big hurdle for its practical applications.\nIn this paper, we propose a continuous version of prompt learning together with\nconnective knowledge distillation, called AdaptPrompt, to reduce manual design\nefforts via continuous prompting while further improving performance via\nknowledge transfer. In particular, we design and train a few virtual tokens to\nform continuous templates and automatically select the most suitable one by\ngradient search in the embedding space. We also design an answer-relation\nmapping rule to generate a few virtual answers as the answer space.\nFurthermore, we notice the importance of annotated connectives in the training\ndataset and design a teacher-student architecture for knowledge transfer.\nExperiments on the up-to-date PDTB Corpus V3.0 validate our design objectives\nin terms of better relation recognition performance over the\nstate-of-the-art competitors.\n","authors":["Bang Wang","Zhenglin Wang","Wei Xiang","Yijun Mo"],"pdf_url":"https://arxiv.org/pdf/2309.07561v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.07545v1","updated":"2023-09-14T09:15:36Z","published":"2023-09-14T09:15:36Z","title":"DBLPLink: An Entity Linker for the DBLP Scholarly Knowledge Graph","summary":" In this work, we present a web application named DBLPLink, which performs\nentity linking over the DBLP scholarly knowledge graph. DBLPLink uses\ntext-to-text pre-trained language models, such as T5, to produce entity label\nspans from an input text question. Entity candidates are fetched from a\ndatabase based on the labels, and an entity re-ranker sorts them based on\nentity embeddings, such as TransE, DistMult and ComplEx. The results are\ndisplayed so that users may compare and contrast the results between T5-small,\nT5-base and the different KG embeddings used. The demo can be accessed at\nhttps://ltdemos.informatik.uni-hamburg.de/dblplink/.\n","authors":["Debayan Banerjee"," Arefa","Ricardo Usbeck","Chris Biemann"],"pdf_url":"https://arxiv.org/pdf/2309.07545v1.pdf","comment":"Accepted at International Semantic Web Conference (ISWC) 2023 Posters\n & Demo Track"},{"id":"http://arxiv.org/abs/2204.12793v3","updated":"2023-09-14T08:50:25Z","published":"2022-04-27T09:26:59Z","title":"Modern Baselines for SPARQL Semantic Parsing","summary":" In this work, we focus on the task of generating SPARQL queries from natural\nlanguage questions, which can then be executed on Knowledge Graphs (KGs). We\nassume that gold entities and relations have been provided, and the remaining\ntask is to arrange them in the right order along with SPARQL vocabulary, and\ninput tokens to produce the correct SPARQL query. Pre-trained Language Models\n(PLMs) have not been explored in depth on this task so far, so we experiment\nwith BART, T5 and PGNs (Pointer Generator Networks) with BERT embeddings,\nlooking for new baselines in the PLM era for this task, on DBpedia and Wikidata\nKGs. We show that T5 requires special input tokenisation, but produces state-of-the-art\nperformance on LC-QuAD 1.0 and LC-QuAD 2.0 datasets, and outperforms\ntask-specific models from previous works. 
Moreover, the methods enable semantic\nparsing for questions where a part of the input needs to be copied to the\noutput query, thus enabling a new paradigm in KG semantic parsing.\n","authors":["Debayan Banerjee","Pranav Ajit Nair","Jivat Neet Kaur","Ricardo Usbeck","Chris Biemann"],"pdf_url":"https://arxiv.org/pdf/2204.12793v3.pdf","comment":"5 pages, short paper, SIGIR 2022"},{"id":"http://arxiv.org/abs/2307.04408v2","updated":"2023-09-14T07:58:00Z","published":"2023-07-10T08:15:40Z","title":"TIM: Teaching Large Language Models to Translate with Comparison","summary":" Open-sourced large language models (LLMs) have demonstrated remarkable\nefficacy in various tasks with instruction tuning. However, these models can\nsometimes struggle with tasks that require more specialized knowledge such as\ntranslation. One possible reason for such deficiency is that instruction tuning\naims to generate fluent and coherent text that continues from a given\ninstruction without being constrained by any task-specific requirements.\nMoreover, it can be more challenging for tuning smaller LLMs with lower-quality\ntraining data. To address this issue, we propose a novel framework using\nexamples in comparison to teach LLMs to learn translation. Our approach\ninvolves presenting the model with examples of correct and incorrect\ntranslations and using a preference loss to guide the model's learning. We\nevaluate our method on WMT2022 test sets and show that it outperforms existing\nmethods. Our findings offer a new perspective on fine-tuning LLMs for\ntranslation tasks and provide a promising solution for generating high-quality\ntranslations. Please refer to Github for more details:\nhttps://github.com/lemon0830/TIM.\n","authors":["Jiali Zeng","Fandong Meng","Yongjing Yin","Jie Zhou"],"pdf_url":"https://arxiv.org/pdf/2307.04408v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.07478v1","updated":"2023-09-14T07:35:14Z","published":"2023-09-14T07:35:14Z","title":"Direct Text to Speech Translation System using Acoustic Units","summary":" This paper proposes a direct text to speech translation system using discrete\nacoustic units. This framework employs text in different source languages as\ninput to generate speech in the target language without the need for text\ntranscriptions in this language. Motivated by the success of acoustic units in\nprevious works for direct speech to speech translation systems, we use the same\npipeline to extract the acoustic units using a speech encoder combined with a\nclustering algorithm. Once units are obtained, an encoder-decoder architecture\nis trained to predict them. Then a vocoder generates speech from units. Our\napproach for direct text to speech translation was tested on the new CVSS\ncorpus with two different text mBART models employed as initialisation. The\nsystems presented report competitive performance for most of the language pairs\nevaluated. 
Besides, results show a remarkable improvement when initialising our\nproposed architecture with a model pre-trained with more languages.\n","authors":["Victoria Mingote","Pablo Gimeno","Luis Vicente","Sameer Khurana","Antoine Laurent","Jarod Duret"],"pdf_url":"https://arxiv.org/pdf/2309.07478v1.pdf","comment":"5 pages, 4 figures"},{"id":"http://arxiv.org/abs/2308.09768v2","updated":"2023-09-14T07:31:14Z","published":"2023-08-18T18:46:47Z","title":"YORC: Yoruba Reading Comprehension dataset","summary":" In this paper, we create YORC: a new multi-choice Yoruba Reading\nComprehension dataset that is based on Yoruba high-school reading comprehension\nexamination. We provide baseline results by performing cross-lingual transfer\nusing existing English RACE dataset based on a pre-trained encoder-only model.\nAdditionally, we provide results by prompting large language models (LLMs) like\nGPT-4.\n","authors":["Anuoluwapo Aremu","Jesujoba O. Alabi","David Ifeoluwa Adelani"],"pdf_url":"https://arxiv.org/pdf/2308.09768v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2210.00305v3","updated":"2023-09-14T07:06:03Z","published":"2022-10-01T16:01:53Z","title":"LambdaKG: A Library for Pre-trained Language Model-Based Knowledge Graph\n Embeddings","summary":" Knowledge Graphs (KGs) often have two characteristics: heterogeneous graph\nstructure and text-rich entity/relation information. Text-based KG embeddings\ncan represent entities by encoding descriptions with pre-trained language\nmodels, but no open-sourced library is specifically designed for KGs with PLMs\nat present. In this paper, we present LambdaKG, a library for KGE that equips\nwith many pre-trained language models (e.g., BERT, BART, T5, GPT-3), and\nsupports various tasks (e.g., knowledge graph completion, question answering,\nrecommendation, and knowledge probing). LambdaKG is publicly open-sourced at\nhttps://github.com/zjunlp/PromptKG/tree/main/lambdaKG, with a demo video at\nhttp://deepke.zjukg.cn/lambdakg.mp4 and long-term maintenance.\n","authors":["Xin Xie","Zhoubo Li","Xiaohan Wang","Zekun Xi","Ningyu Zhang"],"pdf_url":"https://arxiv.org/pdf/2210.00305v3.pdf","comment":"AACL 2023 System Demonstrations, the project website is\n https://zjunlp.github.io/project/promptkg/"},{"id":"http://arxiv.org/abs/2309.07462v1","updated":"2023-09-14T06:41:58Z","published":"2023-09-14T06:41:58Z","title":"Are Large Language Model-based Evaluators the Solution to Scaling Up\n Multilingual Evaluation?","summary":" Large Language Models (LLMs) have demonstrated impressive performance on\nNatural Language Processing (NLP) tasks, such as Question Answering,\nSummarization, and Classification. The use of LLMs as evaluators, that can rank\nor score the output of other models (usually LLMs) has become increasingly\npopular, due to the limitations of current evaluation techniques including the\nlack of appropriate benchmarks, metrics, cost, and access to human annotators.\nWhile LLMs are capable of handling approximately 100 languages, the majority of\nlanguages beyond the top 20 lack systematic evaluation across various tasks,\nmetrics, and benchmarks. This creates an urgent need to scale up multilingual\nevaluation to ensure a precise understanding of LLM performance across diverse\nlanguages. LLM-based evaluators seem like the perfect solution to this problem,\nas they do not require human annotators, human-created references, or\nbenchmarks and can theoretically be used to evaluate any language covered by\nthe LLM. 
In this paper, we investigate whether LLM-based evaluators can help\nscale up multilingual evaluation. Specifically, we calibrate LLM-based\nevaluation against 20k human judgments of five metrics across three\ntext-generation tasks in eight languages. Our findings indicate that LLM-based\nevaluators may exhibit bias towards higher scores and should be used with\ncaution and should always be calibrated with a dataset of native speaker\njudgments, particularly in low-resource and non-Latin script languages.\n","authors":["Rishav Hada","Varun Gumma","Adrian de Wynter","Harshita Diddee","Mohamed Ahmed","Monojit Choudhury","Kalika Bali","Sunayana Sitaram"],"pdf_url":"https://arxiv.org/pdf/2309.07462v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.10932v2","updated":"2023-09-14T06:09:34Z","published":"2023-07-20T15:02:42Z","title":"Identical and Fraternal Twins: Fine-Grained Semantic Contrastive\n Learning of Sentence Representations","summary":" The enhancement of unsupervised learning of sentence representations has been\nsignificantly achieved by the utility of contrastive learning. This approach\nclusters the augmented positive instance with the anchor instance to create a\ndesired embedding space. However, relying solely on the contrastive objective\ncan result in sub-optimal outcomes due to its inability to differentiate subtle\nsemantic variations between positive pairs. Specifically, common data\naugmentation techniques frequently introduce semantic distortion, leading to a\nsemantic margin between the positive pair. While the InfoNCE loss function\noverlooks the semantic margin and prioritizes similarity maximization between\npositive pairs during training, leading to the insensitive semantic\ncomprehension ability of the trained model. In this paper, we introduce a novel\nIdentical and Fraternal Twins of Contrastive Learning (named IFTCL) framework,\ncapable of simultaneously adapting to various positive pairs generated by\ndifferent augmentation techniques. We propose a \\textit{Twins Loss} to preserve\nthe innate margin during training and promote the potential of data enhancement\nin order to overcome the sub-optimal issue. We also present proof-of-concept\nexperiments combined with the contrastive objective to prove the validity of\nthe proposed Twins Loss. Furthermore, we propose a hippocampus queue mechanism\nto restore and reuse the negative instances without additional calculation,\nwhich further enhances the efficiency and performance of the IFCL. We verify\nthe IFCL framework on nine semantic textual similarity tasks with both English\nand Chinese datasets, and the experimental results show that IFCL outperforms\nstate-of-the-art methods.\n","authors":["Qingfa Xiao","Shuangyin Li","Lei Chen"],"pdf_url":"https://arxiv.org/pdf/2307.10932v2.pdf","comment":"This article has been accepted for publication in European Conference\n on Artificial Intelligence (ECAI2023). 9 pages, 4 figures"},{"id":"http://arxiv.org/abs/2309.07445v1","updated":"2023-09-14T05:56:49Z","published":"2023-09-14T05:56:49Z","title":"SIB-200: A Simple, Inclusive, and Big Evaluation Dataset for Topic\n Classification in 200+ Languages and Dialects","summary":" Despite the progress we have recorded in the last few years in multilingual\nnatural language processing, evaluation is typically limited to a small set of\nlanguages with available datasets which excludes a large number of low-resource\nlanguages. 
In this paper, we created SIB-200 -- a large-scale open-sourced\nbenchmark dataset for topic classification in 200 languages and dialects to\naddress the lack of evaluation dataset for Natural Language Understanding\n(NLU). For many of the languages covered in SIB-200, this is the first publicly\navailable evaluation dataset for NLU. The dataset is based on Flores-200\nmachine translation corpus. We annotated the English portion of the dataset and\nextended the sentence-level annotation to the remaining 203 languages covered\nin the corpus. Despite the simplicity of this task, our evaluation in\nfull-supervised setting, cross-lingual transfer setting and prompting of large\nlanguage model setting show that there is still a large gap between the\nperformance of high-resource and low-resource languages when multilingual\nevaluation is scaled to numerous world languages. We found that languages\nunseen during the pre-training of multilingual language models,\nunder-represented language families (like Nilotic and Altantic-Congo), and\nlanguages from the regions of Africa, Americas, Oceania and South East Asia,\noften have the lowest performance on our topic classification dataset. We hope\nour dataset will encourage a more inclusive evaluation of multilingual language\nmodels on a more diverse set of languages. https://github.com/dadelani/sib-200\n","authors":["David Ifeoluwa Adelani","Hannah Liu","Xiaoyu Shen","Nikita Vassilyev","Jesujoba O. Alabi","Yanke Mao","Haonan Gao","Annie En-Shiun Lee"],"pdf_url":"https://arxiv.org/pdf/2309.07445v1.pdf","comment":"under submission"},{"id":"http://arxiv.org/abs/2309.03787v2","updated":"2023-09-14T05:53:45Z","published":"2023-09-07T15:35:00Z","title":"USA: Universal Sentiment Analysis Model & Construction of Japanese\n Sentiment Text Classification and Part of Speech Dataset","summary":" Sentiment analysis is a pivotal task in the domain of natural language\nprocessing. It encompasses both text-level sentiment polarity classification\nand word-level Part of Speech(POS) sentiment polarity determination. Such\nanalysis challenges models to understand text holistically while also\nextracting nuanced information. With the rise of Large Language Models(LLMs),\nnew avenues for sentiment analysis have opened. This paper proposes enhancing\nperformance by leveraging the Mutual Reinforcement Effect(MRE) between\nindividual words and the overall text. It delves into how word polarity\ninfluences the overarching sentiment of a passage. To support our research, we\nannotated four novel Sentiment Text Classification and Part of Speech(SCPOS)\ndatasets, building upon existing sentiment classification datasets.\nFurthermore, we developed a Universal Sentiment Analysis(USA) model, with a\n7-billion parameter size. Experimental results revealed that our model\nsurpassed the performance of gpt-3.5-turbo across all four datasets,\nunderscoring the significance of MRE in sentiment analysis.\n","authors":["Chengguang Gan","Qinghao Zhang","Tatsunori Mori"],"pdf_url":"https://arxiv.org/pdf/2309.03787v2.pdf","comment":"Model already Open Sourced, Dataset will release soon"},{"id":"http://arxiv.org/abs/2212.09597v7","updated":"2023-09-14T05:36:15Z","published":"2022-12-19T16:32:42Z","title":"Reasoning with Language Model Prompting: A Survey","summary":" Reasoning, as an essential ability for complex problem-solving, can provide\nback-end support for various real-world applications, such as medical\ndiagnosis, negotiation, etc. 
This paper provides a comprehensive survey of\ncutting-edge research on reasoning with language model prompting. We introduce\nresearch works with comparisons and summaries and provide systematic resources\nto help beginners. We also discuss the potential reasons for emerging such\nreasoning abilities and highlight future research directions. Resources are\navailable at https://github.com/zjunlp/Prompt4ReasoningPapers (updated\nperiodically).\n","authors":["Shuofei Qiao","Yixin Ou","Ningyu Zhang","Xiang Chen","Yunzhi Yao","Shumin Deng","Chuanqi Tan","Fei Huang","Huajun Chen"],"pdf_url":"https://arxiv.org/pdf/2212.09597v7.pdf","comment":"ACL 2023, 24 pages, add references of theoretical analysis"},{"id":"http://arxiv.org/abs/2309.07430v1","updated":"2023-09-14T05:15:01Z","published":"2023-09-14T05:15:01Z","title":"Clinical Text Summarization: Adapting Large Language Models Can\n Outperform Human Experts","summary":" Sifting through vast textual data and summarizing key information imposes a\nsubstantial burden on how clinicians allocate their time. Although large\nlanguage models (LLMs) have shown immense promise in natural language\nprocessing (NLP) tasks, their efficacy across diverse clinical summarization\ntasks has not yet been rigorously examined. In this work, we employ domain\nadaptation methods on eight LLMs, spanning six datasets and four distinct\nsummarization tasks: radiology reports, patient questions, progress notes, and\ndoctor-patient dialogue. Our thorough quantitative assessment reveals\ntrade-offs between models and adaptation methods in addition to instances where\nrecent advances in LLMs may not lead to improved results. Further, in a\nclinical reader study with six physicians, we depict that summaries from the\nbest adapted LLM are preferable to human summaries in terms of completeness and\ncorrectness. Our ensuing qualitative analysis delineates mutual challenges\nfaced by both LLMs and human experts. Lastly, we correlate traditional\nquantitative NLP metrics with reader study scores to enhance our understanding\nof how these metrics align with physician preferences. Our research marks the\nfirst evidence of LLMs outperforming human experts in clinical text\nsummarization across multiple tasks. This implies that integrating LLMs into\nclinical workflows could alleviate documentation burden, empowering clinicians\nto focus more on personalized patient care and other irreplaceable human\naspects of medicine.\n","authors":["Dave Van Veen","Cara Van Uden","Louis Blankemeier","Jean-Benoit Delbrouck","Asad Aali","Christian Bluethgen","Anuj Pareek","Malgorzata Polacin","William Collins","Neera Ahuja","Curtis P. Langlotz","Jason Hom","Sergios Gatidis","John Pauly","Akshay S. Chaudhari"],"pdf_url":"https://arxiv.org/pdf/2309.07430v1.pdf","comment":"23 pages, 22 figures"},{"id":"http://arxiv.org/abs/2309.07429v1","updated":"2023-09-14T05:03:09Z","published":"2023-09-14T05:03:09Z","title":"Semantic Parsing in Limited Resource Conditions","summary":" This thesis explores challenges in semantic parsing, specifically focusing on\nscenarios with limited data and computational resources. It offers solutions\nusing techniques like automatic data curation, knowledge transfer, active\nlearning, and continual learning.\n For tasks with no parallel training data, the thesis proposes generating\nsynthetic training examples from structured database schemas. 
When there is\nabundant data in a source domain but limited parallel data in a target domain,\nknowledge from the source is leveraged to improve parsing in the target domain.\n For multilingual situations with limited data in the target languages, the\nthesis introduces a method to adapt parsers using a limited human translation\nbudget. Active learning is applied to select source-language samples for manual\ntranslation, maximizing parser performance in the target language. In addition,\nan alternative method is also proposed to utilize machine translation services,\nsupplemented by human-translated data, to train a more effective parser.\n When computational resources are limited, a continual learning approach is\nintroduced to minimize training time and computational memory. This maintains\nthe parser's efficiency in previously learned tasks while adapting it to new\ntasks, mitigating the problem of catastrophic forgetting.\n Overall, the thesis provides a comprehensive set of methods to improve\nsemantic parsing in resource-constrained conditions.\n","authors":["Zhuang Li"],"pdf_url":"https://arxiv.org/pdf/2309.07429v1.pdf","comment":"PhD thesis, year of award 2023, 172 pages"},{"id":"http://arxiv.org/abs/2309.07423v1","updated":"2023-09-14T04:36:00Z","published":"2023-09-14T04:36:00Z","title":"ChatGPT MT: Competitive for High- (but not Low-) Resource Languages","summary":" Large language models (LLMs) implicitly learn to perform a range of language\ntasks, including machine translation (MT). Previous studies explore aspects of\nLLMs' MT capabilities. However, there exist a wide variety of languages for\nwhich recent LLM MT performance has never before been evaluated. Without\npublished experimental evidence on the matter, it is difficult for speakers of\nthe world's diverse languages to know how and whether they can use LLMs for\ntheir languages. We present the first experimental evidence for an expansive\nset of 204 languages, along with MT cost analysis, using the FLORES-200\nbenchmark. Trends reveal that GPT models approach or exceed traditional MT\nmodel performance for some high-resource languages (HRLs) but consistently lag\nfor low-resource languages (LRLs), under-performing traditional MT for 84.1% of\nlanguages we covered. Our analysis reveals that a language's resource level is\nthe most important feature in determining ChatGPT's relative ability to\ntranslate it, and suggests that ChatGPT is especially disadvantaged for LRLs\nand African languages.\n","authors":["Nathaniel R. Robinson","Perez Ogayo","David R. Mortensen","Graham Neubig"],"pdf_url":"https://arxiv.org/pdf/2309.07423v1.pdf","comment":"27 pages, 9 figures, 14 tables"},{"id":"http://arxiv.org/abs/2304.08981v2","updated":"2023-09-14T04:03:28Z","published":"2023-04-18T13:23:42Z","title":"MER 2023: Multi-label Learning, Modality Robustness, and Semi-Supervised\n Learning","summary":" The first Multimodal Emotion Recognition Challenge (MER 2023) was\nsuccessfully held at ACM Multimedia. The challenge focuses on system robustness\nand consists of three distinct tracks: (1) MER-MULTI, where participants are\nrequired to recognize both discrete and dimensional emotions; (2) MER-NOISE, in\nwhich noise is added to test videos for modality robustness evaluation; (3)\nMER-SEMI, which provides a large amount of unlabeled samples for\nsemi-supervised learning. In this paper, we introduce the motivation behind\nthis challenge, describe the benchmark dataset, and provide some statistics\nabout participants. 
To continue using this dataset after MER 2023, please sign\na new End User License Agreement and send it to our official email address\nmerchallenge.contact@gmail.com. We believe this high-quality dataset can become\na new benchmark in multimodal emotion recognition, especially for the Chinese\nresearch community.\n","authors":["Zheng Lian","Haiyang Sun","Licai Sun","Kang Chen","Mingyu Xu","Kexin Wang","Ke Xu","Yu He","Ying Li","Jinming Zhao","Ye Liu","Bin Liu","Jiangyan Yi","Meng Wang","Erik Cambria","Guoying Zhao","Björn W. Schuller","Jianhua Tao"],"pdf_url":"https://arxiv.org/pdf/2304.08981v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.07414v1","updated":"2023-09-14T03:43:07Z","published":"2023-09-14T03:43:07Z","title":"PromptASR for contextualized ASR with controllable style","summary":" Prompts are crucial to large language models as they provide context\ninformation such as topic or logical relationships. Inspired by this, we\npropose PromptASR, a framework that integrates prompts in end-to-end automatic\nspeech recognition (E2E ASR) systems to achieve contextualized ASR with\ncontrollable style of transcriptions. Specifically, a dedicated text encoder\nencodes the text prompts and the encodings are injected into the speech encoder\nby cross-attending the features from two modalities. When using the ground\ntruth text from preceding utterances as content prompt, the proposed system\nachieves 21.9% and 6.8% relative word error rate reductions on a book reading\ndataset and an in-house dataset compared to a baseline ASR system. The system\ncan also take word-level biasing lists as prompt to improve recognition\naccuracy on rare words. An additional style prompt can be given to the text\nencoder and guide the ASR system to output different styles of transcriptions.\nThe code is available at icefall.\n","authors":["Xiaoyu Yang","Wei Kang","Zengwei Yao","Yifan Yang","Liyong Guo","Fangjun Kuang","Long Lin","Daniel Povey"],"pdf_url":"https://arxiv.org/pdf/2309.07414v1.pdf","comment":"Submitted to ICASSP2024"},{"id":"http://arxiv.org/abs/2309.07413v1","updated":"2023-09-14T03:40:14Z","published":"2023-09-14T03:40:14Z","title":"CPPF: A contextual and post-processing-free model for automatic speech\n recognition","summary":" ASR systems have become increasingly widespread in recent years. However,\ntheir textual outputs often require post-processing tasks before they can be\npractically utilized. To address this issue, we draw inspiration from the\nmultifaceted capabilities of LLMs and Whisper, and focus on integrating\nmultiple ASR text processing tasks related to speech recognition into the ASR\nmodel. This integration not only shortens the multi-stage pipeline, but also\nprevents the propagation of cascading errors, resulting in direct generation of\npost-processed text. In this study, we focus on ASR-related processing tasks,\nincluding Contextual ASR and multiple ASR post processing tasks. To achieve\nthis objective, we introduce the CPPF model, which offers a versatile and\nhighly effective alternative to ASR processing. 
CPPF seamlessly integrates\nthese tasks without any significant loss in recognition performance.\n","authors":["Lei Zhang","Zhengkun Tian","Xiang Chen","Jiaming Sun","Hongyu Xiang","Ke Ding","Guanglu Wan"],"pdf_url":"https://arxiv.org/pdf/2309.07413v1.pdf","comment":"Submitted to ICASSP2024"},{"id":"http://arxiv.org/abs/2309.07412v1","updated":"2023-09-14T03:36:01Z","published":"2023-09-14T03:36:01Z","title":"Advancing Regular Language Reasoning in Linear Recurrent Neural Networks","summary":" In recent studies, linear recurrent neural networks (LRNNs) have achieved\nTransformer-level performance in natural language modeling and long-range\nmodeling while offering rapid parallel training and constant inference costs.\nWith the resurged interest in LRNNs, we study whether they can learn the hidden\nrules in training sequences, such as the grammatical structures of regular\nlanguage. We theoretically analyze some existing LRNNs and discover their\nlimitations on regular language. Motivated by the analysis, we propose a new\nLRNN equipped with a block-diagonal and input-dependent transition matrix.\nExperiments suggest that the proposed model is the only LRNN that can perform\nlength extrapolation on regular language tasks such as Sum, Even Pair, and\nModular Arithmetic.\n","authors":["Ting-Han Fan","Ta-Chung Chi","Alexander I. Rudnicky"],"pdf_url":"https://arxiv.org/pdf/2309.07412v1.pdf","comment":"The first two authors contributed equally to this work"},{"id":"http://arxiv.org/abs/2306.05659v3","updated":"2023-09-14T03:23:34Z","published":"2023-06-09T03:53:42Z","title":"COVER: A Heuristic Greedy Adversarial Attack on Prompt-based Learning in\n Language Models","summary":" Prompt-based learning has been proved to be an effective way in pre-trained\nlanguage models (PLMs), especially in low-resource scenarios like few-shot\nsettings. However, the trustworthiness of PLMs is of paramount significance and\npotential vulnerabilities have been shown in prompt-based templates that could\nmislead the predictions of language models, causing serious security concerns.\nIn this paper, we will shed light on some vulnerabilities of PLMs, by proposing\na prompt-based adversarial attack on manual templates in black box scenarios.\nFirst of all, we design character-level and word-level heuristic approaches to\nbreak manual templates separately. Then we present a greedy algorithm for the\nattack based on the above heuristic destructive approaches. Finally, we\nevaluate our approach with the classification tasks on three variants of BERT\nseries models and eight datasets. And comprehensive experimental results\njustify the effectiveness of our approach in terms of attack success rate and\nattack speed.\n","authors":["Zihao Tan","Qingliang Chen","Wenbin Zhu","Yongjian Huang"],"pdf_url":"https://arxiv.org/pdf/2306.05659v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.07396v1","updated":"2023-09-14T02:43:34Z","published":"2023-09-14T02:43:34Z","title":"DebCSE: Rethinking Unsupervised Contrastive Sentence Embedding Learning\n in the Debiasing Perspective","summary":" Several prior studies have suggested that word frequency biases can cause the\nBert model to learn indistinguishable sentence embeddings. Contrastive learning\nschemes such as SimCSE and ConSERT have already been adopted successfully in\nunsupervised sentence embedding to improve the quality of embeddings by\nreducing this bias. 
However, these methods still introduce new biases such as\nsentence length bias and false negative sample bias, which hinder the model's\nability to learn more fine-grained semantics. In this paper, we reexamine the\nchallenges of contrastive sentence embedding learning from a debiasing\nperspective and argue that effectively eliminating the influence of various\nbiases is crucial for learning high-quality sentence embeddings. We think all\nthose biases are introduced by simple rules for constructing training data in\ncontrastive learning, and the key to contrastive sentence embedding learning is\nto mimic the distribution of training data in supervised machine learning in an\nunsupervised way. We propose a novel contrastive framework for sentence\nembedding, termed DebCSE, which can eliminate the impact of these biases by an\ninverse propensity weighted sampling method to select high-quality positive and\nnegative pairs according to both the surface and semantic similarity between\nsentences. Extensive experiments on semantic textual similarity (STS)\nbenchmarks reveal that DebCSE significantly outperforms the latest\nstate-of-the-art models with an average Spearman's correlation coefficient of\n80.33% on BERTbase.\n","authors":["Pu Miao","Zeyao Du","Junlin Zhang"],"pdf_url":"https://arxiv.org/pdf/2309.07396v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.07387v1","updated":"2023-09-14T02:09:20Z","published":"2023-09-14T02:09:20Z","title":"VDialogUE: A Unified Evaluation Benchmark for Visually-grounded Dialogue","summary":" Visually-grounded dialog systems, which integrate multiple modes of\ncommunication such as text and visual inputs, have become an increasingly\npopular area of investigation. However, the absence of a standardized\nevaluation framework poses a challenge in assessing the development of this\nfield. To this end, we propose \\textbf{VDialogUE}, a \\textbf{V}isually-grounded\n\\textbf{Dialog}ue benchmark for \\textbf{U}nified \\textbf{E}valuation. It\ndefines five core multi-modal dialogue tasks and covers six datasets.\nFurthermore, in order to provide a comprehensive assessment of the model's\nperformance across all tasks, we developed a novel evaluation metric called\nVDscore, which is based on the Analytic Hierarchy Process~(AHP) method.\nAdditionally, we present a straightforward yet efficient baseline model, named\n\\textbf{VISIT}~(\\textbf{VIS}ually-grounded d\\textbf{I}alog\n\\textbf{T}ransformer), to promote the advancement of general multi-modal\ndialogue systems. It progressively builds its multi-modal foundation and\ndialogue capability via a two-stage pre-training strategy.\n We believe that the VDialogUE benchmark, along with the evaluation scripts\nand our baseline models, will accelerate the development of visually-grounded\ndialog systems and lead to the development of more sophisticated and effective\npre-trained models.\n","authors":["Yunshui Li","Binyuan Hui","Zhaochao Yin","Wanwei He","Run Luo","Yuxing Long","Min Yang","Fei Huang","Yongbin Li"],"pdf_url":"https://arxiv.org/pdf/2309.07387v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.07384v1","updated":"2023-09-14T02:03:45Z","published":"2023-09-14T02:03:45Z","title":"An Interactive Framework for Profiling News Media Sources","summary":" The recent rise of social media has led to the spread of large amounts of\nfake and biased news, content published with the intent to sway beliefs. 
While\ndetecting and profiling the sources that spread this news is important to\nmaintain a healthy society, it is challenging for automated systems.\n In this paper, we propose an interactive framework for news media profiling.\nIt combines the strengths of graph based news media profiling models,\nPre-trained Large Language Models, and human insight to characterize the social\ncontext on social media. Experimental results show that with as little as 5\nhuman interactions, our framework can rapidly detect fake and biased news\nmedia, even in the most challenging settings of emerging news events, where\ntest data is unseen.\n","authors":["Nikhil Mehta","Dan Goldwasser"],"pdf_url":"https://arxiv.org/pdf/2309.07384v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.07382v1","updated":"2023-09-14T01:59:15Z","published":"2023-09-14T01:59:15Z","title":"Less is More for Long Document Summary Evaluation by LLMs","summary":" Large Language Models (LLMs) have shown promising performance in summary\nevaluation tasks, yet they face challenges such as high computational costs and\nthe Lost-in-the-Middle problem where important information in the middle of\nlong documents is often overlooked. To address these issues, this paper\nintroduces a novel approach, Extract-then-Evaluate, which involves extracting\nkey sentences from a long source document and then evaluating the summary by\nprompting LLMs. The results reveal that the proposed method not only\nsignificantly reduces evaluation costs but also exhibits a higher correlation\nwith human evaluations. Furthermore, we provide practical recommendations for\noptimal document length and sentence extraction methods, contributing to the\ndevelopment of cost-effective yet more accurate methods for LLM-based text\ngeneration evaluation.\n","authors":["Yunshu Wu","Hayate Iso","Pouya Pezeshkpour","Nikita Bhutani","Estevam Hruschka"],"pdf_url":"https://arxiv.org/pdf/2309.07382v1.pdf","comment":"Work in progress"},{"id":"http://arxiv.org/abs/2306.12794v3","updated":"2023-09-14T01:33:36Z","published":"2023-06-22T10:50:23Z","title":"Overview of Robust and Multilingual Automatic Evaluation Metrics for\n Open-Domain Dialogue Systems at DSTC 11 Track 4","summary":" The advent and fast development of neural networks have revolutionized the\nresearch on dialogue systems and subsequently have triggered various challenges\nregarding their automatic evaluation. Automatic evaluation of open-domain\ndialogue systems as an open challenge has been the center of the attention of\nmany researchers. Despite the consistent efforts to improve automatic metrics'\ncorrelations with human evaluation, there have been very few attempts to assess\ntheir robustness over multiple domains and dimensions. Also, their focus is\nmainly on the English language. All of these challenges prompt the development\nof automatic evaluation metrics that are reliable in various domains,\ndimensions, and languages. This track in the 11th Dialogue System Technology\nChallenge (DSTC11) is part of the ongoing effort to promote robust and\nmultilingual automatic evaluation metrics. 
This article describes the datasets\nand baselines provided to participants and discusses the submission and result\ndetails of the two proposed subtasks.\n","authors":["Mario Rodríguez-Cantelar","Chen Zhang","Chengguang Tang","Ke Shi","Sarik Ghazarian","João Sedoc","Luis Fernando D'Haro","Alexander Rudnicky"],"pdf_url":"https://arxiv.org/pdf/2306.12794v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.07369v1","updated":"2023-09-14T01:07:36Z","published":"2023-09-14T01:07:36Z","title":"Hybrid Attention-based Encoder-decoder Model for Efficient Language\n Model Adaptation","summary":" Attention-based encoder-decoder (AED) speech recognition model has been\nwidely successful in recent years. However, the joint optimization of acoustic\nmodel and language model in end-to-end manner has created challenges for text\nadaptation. In particular, effectively, quickly and inexpensively adapting text\nhas become a primary concern for deploying AED systems in industry. To address\nthis issue, we propose a novel model, the hybrid attention-based\nencoder-decoder (HAED) speech recognition model that preserves the modularity\nof conventional hybrid automatic speech recognition systems. Our HAED model\nseparates the acoustic and language models, allowing for the use of\nconventional text-based language model adaptation techniques. We demonstrate\nthat the proposed HAED model yields 21\\% Word Error Rate (WER) improvements in\nrelative when out-of-domain text data is used for language model adaptation,\nand with only a minor degradation in WER on a general test set compared with\nconventional AED model.\n","authors":["Shaoshi Ling","Guoli Ye","Rui Zhao","Yifan Gong"],"pdf_url":"https://arxiv.org/pdf/2309.07369v1.pdf","comment":null}],"Computer Vision and Pattern Recognition":[{"id":"http://arxiv.org/abs/2309.07920v1","updated":"2023-09-14T17:59:53Z","published":"2023-09-14T17:59:53Z","title":"Large-Vocabulary 3D Diffusion Model with Transformer","summary":" Creating diverse and high-quality 3D assets with an automatic generative\nmodel is highly desirable. Despite extensive efforts on 3D generation, most\nexisting works focus on the generation of a single category or a few\ncategories. In this paper, we introduce a diffusion-based feed-forward\nframework for synthesizing massive categories of real-world 3D objects with a\nsingle generative model. Notably, there are three major challenges for this\nlarge-vocabulary 3D generation: a) the need for expressive yet efficient 3D\nrepresentation; b) large diversity in geometry and texture across categories;\nc) complexity in the appearances of real-world objects. To this end, we propose\na novel triplane-based 3D-aware Diffusion model with TransFormer, DiffTF, for\nhandling challenges via three aspects. 1) Considering efficiency and\nrobustness, we adopt a revised triplane representation and improve the fitting\nspeed and accuracy. 2) To handle the drastic variations in geometry and\ntexture, we regard the features of all 3D objects as a combination of\ngeneralized 3D knowledge and specialized 3D features. To extract generalized 3D\nknowledge from diverse categories, we propose a novel 3D-aware transformer with\nshared cross-plane attention. It learns the cross-plane relations across\ndifferent planes and aggregates the generalized 3D knowledge with specialized\n3D features. 3) In addition, we devise the 3D-aware encoder/decoder to enhance\nthe generalized 3D knowledge in the encoded triplanes for handling categories\nwith complex appearances. 
Extensive experiments on ShapeNet and OmniObject3D\n(over 200 diverse real-world categories) convincingly demonstrate that a single\nDiffTF model achieves state-of-the-art large-vocabulary 3D object generation\nperformance with large diversity, rich semantics, and high quality.\n","authors":["Ziang Cao","Fangzhou Hong","Tong Wu","Liang Pan","Ziwei Liu"],"pdf_url":"https://arxiv.org/pdf/2309.07920v1.pdf","comment":"Project page at https://ziangcao0312.github.io/difftf_pages/"},{"id":"http://arxiv.org/abs/2309.07921v1","updated":"2023-09-14T17:59:53Z","published":"2023-09-14T17:59:53Z","title":"OpenIllumination: A Multi-Illumination Dataset for Inverse Rendering\n Evaluation on Real Objects","summary":" We introduce OpenIllumination, a real-world dataset containing over 108K\nimages of 64 objects with diverse materials, captured under 72 camera views and\na large number of different illuminations. For each image in the dataset, we\nprovide accurate camera parameters, illumination ground truth, and foreground\nsegmentation masks. Our dataset enables the quantitative evaluation of most\ninverse rendering and material decomposition methods for real objects. We\nexamine several state-of-the-art inverse rendering methods on our dataset and\ncompare their performances. The dataset and code can be found on the project\npage: https://oppo-us-research.github.io/OpenIllumination.\n","authors":["Isabella Liu","Linghao Chen","Ziyang Fu","Liwen Wu","Haian Jin","Zhong Li","Chin Ming Ryan Wong","Yi Xu","Ravi Ramamoorthi","Zexiang Xu","Hao Su"],"pdf_url":"https://arxiv.org/pdf/2309.07921v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.07918v1","updated":"2023-09-14T17:59:49Z","published":"2023-09-14T17:59:49Z","title":"Unified Human-Scene Interaction via Prompted Chain-of-Contacts","summary":" Human-Scene Interaction (HSI) is a vital component of fields like embodied AI\nand virtual reality. Despite advancements in motion quality and physical\nplausibility, two pivotal factors, versatile interaction control and the\ndevelopment of a user-friendly interface, require further exploration before\nthe practical application of HSI. This paper presents a unified HSI framework,\nUniHSI, which supports unified control of diverse interactions through language\ncommands. This framework is built upon the definition of interaction as Chain\nof Contacts (CoC): steps of human joint-object part pairs, which is inspired by\nthe strong correlation between interaction types and human-object contact\nregions. Based on the definition, UniHSI constitutes a Large Language Model\n(LLM) Planner to translate language prompts into task plans in the form of CoC,\nand a Unified Controller that turns CoC into uniform task execution. To\nfacilitate training and evaluation, we collect a new dataset named ScenePlan\nthat encompasses thousands of task plans generated by LLMs based on diverse\nscenarios. Comprehensive experiments demonstrate the effectiveness of our\nframework in versatile task execution and generalizability to real scanned\nscenes. 
The project page is at https://github.com/OpenRobotLab/UniHSI .\n","authors":["Zeqi Xiao","Tai Wang","Jingbo Wang","Jinkun Cao","Wenwei Zhang","Bo Dai","Dahua Lin","Jiangmiao Pang"],"pdf_url":"https://arxiv.org/pdf/2309.07918v1.pdf","comment":"A unified Human-Scene Interaction framework that supports versatile\n interactions through language commands. Project URL:\n https://github.com/OpenRobotLab/UniHSI . Please ignore the header of the paper"},{"id":"http://arxiv.org/abs/2309.07917v1","updated":"2023-09-14T17:59:48Z","published":"2023-09-14T17:59:48Z","title":"Looking at words and points with attention: a benchmark for\n text-to-shape coherence","summary":" While text-conditional 3D object generation and manipulation have seen rapid\nprogress, the evaluation of coherence between generated 3D shapes and input\ntextual descriptions lacks a clear benchmark. The reason is twofold: a) the low\nquality of the textual descriptions in the only publicly available dataset of\ntext-shape pairs; b) the limited effectiveness of the metrics used to\nquantitatively assess such coherence. In this paper, we propose a comprehensive\nsolution that addresses both weaknesses. Firstly, we employ large language\nmodels to automatically refine textual descriptions associated with shapes.\nSecondly, we propose a quantitative metric to assess text-to-shape coherence,\nthrough cross-attention mechanisms. To validate our approach, we conduct a user\nstudy and compare quantitatively our metric with existing ones. The refined\ndataset, the new metric and a set of text-shape pairs validated by the user\nstudy comprise a novel, fine-grained benchmark that we publicly release to\nfoster research on text-to-shape coherence of text-conditioned 3D generative\nmodels. Benchmark available at\nhttps://cvlab-unibo.github.io/CrossCoherence-Web/.\n","authors":["Andrea Amaduzzi","Giuseppe Lisanti","Samuele Salti","Luigi Di Stefano"],"pdf_url":"https://arxiv.org/pdf/2309.07917v1.pdf","comment":"ICCV 2023 Workshop \"AI for 3D Content Creation\", Project page:\n https://cvlab-unibo.github.io/CrossCoherence-Web/, 26 pages"},{"id":"http://arxiv.org/abs/2309.07915v1","updated":"2023-09-14T17:59:17Z","published":"2023-09-14T17:59:17Z","title":"MMICL: Empowering Vision-language Model with Multi-Modal In-Context\n Learning","summary":" Starting from the resurgence of deep learning, vision-language models (VLMs)\nbenefiting from large language models (LLMs) have never been so popular.\nHowever, while LLMs can utilize extensive background knowledge and task\ninformation with in-context learning, most VLMs still struggle with\nunderstanding complex multi-modal prompts with multiple images. The issue can be\ntraced back to the architectural design of VLMs or pre-training data.\nSpecifically, the current VLMs primarily emphasize utilizing multi-modal data\nwith a single image, rather than multi-modal prompts with interleaved\nmultiple images and text. Even though some newly proposed VLMs could handle\nuser prompts with multiple images, pre-training data does not provide more\nsophisticated multi-modal prompts than interleaved image and text crawled from\nthe web. We propose MMICL to address the issue by considering both the model\nand data perspectives. 
We introduce a well-designed architecture capable of\nseamlessly integrating visual and textual context in an interleaved manner and\nMIC dataset to reduce the gap between the training data and the complex user\nprompts in real-world applications, including: 1) multi-modal context with\ninterleaved images and text, 2) textual references for each image, and 3)\nmulti-image data with spatial, logical, or temporal relationships. Our\nexperiments confirm that MMICL achieves new stat-of-the-art zero-shot and\nfew-shot performance on a wide range of general vision-language tasks,\nespecially for complex reasoning benchmarks including MME and MMBench. Our\nanalysis demonstrates that MMICL effectively deals with the challenge of\ncomplex multi-modal prompt understanding. The experiments on ScienceQA-IMG also\nshow that MMICL successfully alleviates the issue of language bias in VLMs,\nwhich we believe is the reason behind the advanced performance of MMICL.\n","authors":["Haozhe Zhao","Zefan Cai","Shuzheng Si","Xiaojian Ma","Kaikai An","Liang Chen","Zixuan Liu","Sheng Wang","Wenjuan Han","Baobao Chang"],"pdf_url":"https://arxiv.org/pdf/2309.07915v1.pdf","comment":"Code, dataset, checkpoints, and demos are available at\n \\href{https://github.com/HaozheZhao/MIC}{https://github.com/HaozheZhao/MIC}"},{"id":"http://arxiv.org/abs/2309.07914v1","updated":"2023-09-14T17:59:05Z","published":"2023-09-14T17:59:05Z","title":"ALWOD: Active Learning for Weakly-Supervised Object Detection","summary":" Object detection (OD), a crucial vision task, remains challenged by the lack\nof large training datasets with precise object localization labels. In this\nwork, we propose ALWOD, a new framework that addresses this problem by fusing\nactive learning (AL) with weakly and semi-supervised object detection\nparadigms. Because the performance of AL critically depends on the model\ninitialization, we propose a new auxiliary image generator strategy that\nutilizes an extremely small labeled set, coupled with a large weakly tagged set\nof images, as a warm-start for AL. We then propose a new AL acquisition\nfunction, another critical factor in AL success, that leverages the\nstudent-teacher OD pair disagreement and uncertainty to effectively propose the\nmost informative images to annotate. Finally, to complete the AL loop, we\nintroduce a new labeling task delegated to human annotators, based on selection\nand correction of model-proposed detections, which is both rapid and effective\nin labeling the informative images. We demonstrate, across several challenging\nbenchmarks, that ALWOD significantly narrows the gap between the ODs trained on\nfew partially labeled but strategically selected image instances and those that\nrely on the fully-labeled data. Our code is publicly available on\nhttps://github.com/seqam-lab/ALWOD.\n","authors":["Yuting Wang","Velibor Ilic","Jiatong Li","Branislav Kisacanin","Vladimir Pavlovic"],"pdf_url":"https://arxiv.org/pdf/2309.07914v1.pdf","comment":"published in ICCV 2023"},{"id":"http://arxiv.org/abs/2309.07911v1","updated":"2023-09-14T17:58:33Z","published":"2023-09-14T17:58:33Z","title":"Disentangling Spatial and Temporal Learning for Efficient Image-to-Video\n Transfer Learning","summary":" Recently, large-scale pre-trained language-image models like CLIP have shown\nextraordinary capabilities for understanding spatial contents, but naively\ntransferring such models to video recognition still suffers from unsatisfactory\ntemporal modeling capabilities. 
Existing methods insert tunable structures into\nor in parallel with the pre-trained model, which either requires\nback-propagation through the whole pre-trained model and is thus\nresource-demanding, or is limited by the temporal reasoning capability of the\npre-trained structure. In this work, we present DiST, which disentangles the\nlearning of spatial and temporal aspects of videos. Specifically, DiST uses a\ndual-encoder structure, where a pre-trained foundation model acts as the\nspatial encoder, and a lightweight network is introduced as the temporal\nencoder. An integration branch is inserted between the encoders to fuse\nspatio-temporal information. The disentangled spatial and temporal learning in\nDiST is highly efficient because it avoids the back-propagation of massive\npre-trained parameters. Meanwhile, we empirically show that disentangled\nlearning with an extra network for integration benefits both spatial and\ntemporal understanding. Extensive experiments on five benchmarks show that DiST\ndelivers better performance than existing state-of-the-art methods by\nconvincing gaps. When pre-training on the large-scale Kinetics-710, we achieve\n89.7% on Kinetics-400 with a frozen ViT-L model, which verifies the scalability\nof DiST. Codes and models can be found in\nhttps://github.com/alibaba-mmai-research/DiST.\n","authors":["Zhiwu Qing","Shiwei Zhang","Ziyuan Huang","Yingya Zhang","Changxin Gao","Deli Zhao","Nong Sang"],"pdf_url":"https://arxiv.org/pdf/2309.07911v1.pdf","comment":"ICCV2023. Code: https://github.com/alibaba-mmai-research/DiST"},{"id":"http://arxiv.org/abs/2304.10520v2","updated":"2023-09-14T17:57:55Z","published":"2023-04-20T17:51:09Z","title":"Contrastive Tuning: A Little Help to Make Masked Autoencoders Forget","summary":" Masked Image Modeling (MIM) methods, like Masked Autoencoders (MAE),\nefficiently learn a rich representation of the input. However, for adapting to\ndownstream tasks, they require a sufficient amount of labeled data since their\nrich features code not only objects but also less relevant image background. In\ncontrast, Instance Discrimination (ID) methods focus on objects. In this work,\nwe study how to combine the efficiency and scalability of MIM with the ability\nof ID to perform downstream classification in the absence of large amounts of\nlabeled data. To this end, we introduce Masked Autoencoder Contrastive Tuning\n(MAE-CT), a sequential approach that utilizes the implicit clustering of the\nNearest Neighbor Contrastive Learning (NNCLR) objective to induce abstraction\nin the topmost layers of a pre-trained MAE. MAE-CT tunes the rich features such\nthat they form semantic clusters of objects without using any labels. Notably,\nMAE-CT does not rely on hand-crafted augmentations and frequently achieves its\nbest performances while using only minimal augmentations (crop & flip).\nFurther, MAE-CT is compute efficient as it requires at most 10% overhead\ncompared to MAE re-training. Applied to large and huge Vision Transformer (ViT)\nmodels, MAE-CT excels over previous self-supervised methods trained on ImageNet\nin linear probing, k-NN and low-shot classification accuracy as well as in\nunsupervised clustering accuracy. 
With ViT-H/16 MAE-CT achieves a new\nstate-of-the-art in linear probing of 82.2%.\n","authors":["Johannes Lehner","Benedikt Alkin","Andreas Fürst","Elisabeth Rumetshofer","Lukas Miklautz","Sepp Hochreiter"],"pdf_url":"https://arxiv.org/pdf/2304.10520v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.07910v1","updated":"2023-09-14T17:56:30Z","published":"2023-09-14T17:56:30Z","title":"TEMPO: Efficient Multi-View Pose Estimation, Tracking, and Forecasting","summary":" Existing volumetric methods for predicting 3D human pose estimation are\naccurate, but computationally expensive and optimized for single time-step\nprediction. We present TEMPO, an efficient multi-view pose estimation model\nthat learns a robust spatiotemporal representation, improving pose accuracy\nwhile also tracking and forecasting human pose. We significantly reduce\ncomputation compared to the state-of-the-art by recurrently computing\nper-person 2D pose features, fusing both spatial and temporal information into\na single representation. In doing so, our model is able to use spatiotemporal\ncontext to predict more accurate human poses without sacrificing efficiency. We\nfurther use this representation to track human poses over time as well as\npredict future poses. Finally, we demonstrate that our model is able to\ngeneralize across datasets without scene-specific fine-tuning. TEMPO achieves\n10$\\%$ better MPJPE with a 33$\\times$ improvement in FPS compared to TesseTrack\non the challenging CMU Panoptic Studio dataset.\n","authors":["Rohan Choudhury","Kris Kitani","Laszlo A. Jeni"],"pdf_url":"https://arxiv.org/pdf/2309.07910v1.pdf","comment":"Accepted at ICCV 2023"},{"id":"http://arxiv.org/abs/2309.07907v1","updated":"2023-09-14T17:55:18Z","published":"2023-09-14T17:55:18Z","title":"Physically Plausible Full-Body Hand-Object Interaction Synthesis","summary":" We propose a physics-based method for synthesizing dexterous hand-object\ninteractions in a full-body setting. While recent advancements have addressed\nspecific facets of human-object interactions, a comprehensive physics-based\napproach remains a challenge. Existing methods often focus on isolated segments\nof the interaction process and rely on data-driven techniques that may result\nin artifacts. In contrast, our proposed method embraces reinforcement learning\n(RL) and physics simulation to mitigate the limitations of data-driven\napproaches. Through a hierarchical framework, we first learn skill priors for\nboth body and hand movements in a decoupled setting. The generic skill priors\nlearn to decode a latent skill embedding into the motion of the underlying\npart. A high-level policy then controls hand-object interactions in these\npretrained latent spaces, guided by task objectives of grasping and 3D target\ntrajectory following. It is trained using a novel reward function that combines\nan adversarial style term with a task reward, encouraging natural motions while\nfulfilling the task incentives. Our method successfully accomplishes the\ncomplete interaction task, from approaching an object to grasping and\nsubsequent manipulation. 
We compare our approach against kinematics-based\nbaselines and show that it leads to more physically plausible motions.\n","authors":["Jona Braun","Sammy Christen","Muhammed Kocabas","Emre Aksan","Otmar Hilliges"],"pdf_url":"https://arxiv.org/pdf/2309.07907v1.pdf","comment":"Project page at https://eth-ait.github.io/phys-fullbody-grasp"},{"id":"http://arxiv.org/abs/2309.07906v1","updated":"2023-09-14T17:54:01Z","published":"2023-09-14T17:54:01Z","title":"Generative Image Dynamics","summary":" We present an approach to modeling an image-space prior on scene dynamics.\nOur prior is learned from a collection of motion trajectories extracted from\nreal video sequences containing natural, oscillating motion such as trees,\nflowers, candles, and clothes blowing in the wind. Given a single image, our\ntrained model uses a frequency-coordinated diffusion sampling process to\npredict a per-pixel long-term motion representation in the Fourier domain,\nwhich we call a neural stochastic motion texture. This representation can be\nconverted into dense motion trajectories that span an entire video. Along with\nan image-based rendering module, these trajectories can be used for a number of\ndownstream applications, such as turning still images into seamlessly looping\ndynamic videos, or allowing users to realistically interact with objects in\nreal pictures.\n","authors":["Zhengqi Li","Richard Tucker","Noah Snavely","Aleksander Holynski"],"pdf_url":"https://arxiv.org/pdf/2309.07906v1.pdf","comment":"Project website: http://generative-dynamics.github.io"},{"id":"http://arxiv.org/abs/2309.07891v1","updated":"2023-09-14T17:42:08Z","published":"2023-09-14T17:42:08Z","title":"HandNeRF: Learning to Reconstruct Hand-Object Interaction Scene from a\n Single RGB Image","summary":" This paper presents a method to learn hand-object interaction prior for\nreconstructing a 3D hand-object scene from a single RGB image. The inference as\nwell as training-data generation for 3D hand-object scene reconstruction is\nchallenging due to the depth ambiguity of a single image and occlusions by the\nhand and object. We turn this challenge into an opportunity by utilizing the\nhand shape to constrain the possible relative configuration of the hand and\nobject geometry. We design a generalizable implicit function, HandNeRF, that\nexplicitly encodes the correlation of the 3D hand shape features and 2D object\nfeatures to predict the hand and object scene geometry. With experiments on\nreal-world datasets, we show that HandNeRF is able to reconstruct hand-object\nscenes of novel grasp configurations more accurately than comparable methods.\nMoreover, we demonstrate that object reconstruction from HandNeRF ensures more\naccurate execution of a downstream task, such as grasping for robotic\nhand-over.\n","authors":["Hongsuk Choi","Nikhil Chavan-Dafle","Jiacheng Yuan","Volkan Isler","Hyunsoo Park"],"pdf_url":"https://arxiv.org/pdf/2309.07891v1.pdf","comment":"9 pages, 4 tables, 7 figures"},{"id":"http://arxiv.org/abs/2309.07888v1","updated":"2023-09-14T17:40:44Z","published":"2023-09-14T17:40:44Z","title":"A Novel Local-Global Feature Fusion Framework for Body-weight Exercise\n Recognition with Pressure Mapping Sensors","summary":" We present a novel local-global feature fusion framework for body-weight\nexercise recognition with floor-based dynamic pressure maps. 
One step further\nfrom the existing studies using deep neural networks mainly focusing on global\nfeature extraction, the proposed framework aims to combine local and global\nfeatures using image processing techniques and the YOLO object detection to\nlocalize pressure profiles from different body parts and consider physical\nconstraints. The proposed local feature extraction method generates two sets of\nhigh-level local features consisting of cropped pressure mapping and numerical\nfeatures such as angular orientation, location on the mat, and pressure area.\nIn addition, we adopt a knowledge distillation for regularization to preserve\nthe knowledge of the global feature extraction and improve the performance of\nthe exercise recognition. Our experimental results demonstrate a notable 11\npercent improvement in F1 score for exercise recognition while preserving\nlabel-specific features.\n","authors":["Davinder Pal Singh","Lala Shakti Swarup Ray","Bo Zhou","Sungho Suh","Paul Lukowicz"],"pdf_url":"https://arxiv.org/pdf/2309.07888v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.07880v1","updated":"2023-09-14T17:25:25Z","published":"2023-09-14T17:25:25Z","title":"mEBAL2 Database and Benchmark: Image-based Multispectral Eyeblink\n Detection","summary":" This work introduces a new multispectral database and novel approaches for\neyeblink detection in RGB and Near-Infrared (NIR) individual images. Our\ncontributed dataset (mEBAL2, multimodal Eye Blink and Attention Level\nestimation, Version 2) is the largest existing eyeblink database, representing\na great opportunity to improve data-driven multispectral approaches for blink\ndetection and related applications (e.g., attention level estimation and\npresentation attack detection in face biometrics). mEBAL2 includes 21,100 image\nsequences from 180 different students (more than 2 million labeled images in\ntotal) while conducting a number of e-learning tasks of varying difficulty or\ntaking a real course on HTML initiation through the edX MOOC platform. mEBAL2\nuses multiple sensors, including two Near-Infrared (NIR) and one RGB camera to\ncapture facial gestures during the execution of the tasks, as well as an\nElectroencephalogram (EEG) band to get the cognitive activity of the user and\nblinking events. Furthermore, this work proposes a Convolutional Neural Network\narchitecture as benchmark for blink detection on mEBAL2 with performances up to\n97%. Different training methodologies are implemented using the RGB spectrum,\nNIR spectrum, and the combination of both to enhance the performance on\nexisting eyeblink detectors. We demonstrate that combining NIR and RGB images\nduring training improves the performance of RGB eyeblink detectors (i.e.,\ndetection based only on a RGB image). 
Finally, the generalization capacity of\nthe proposed eyeblink detectors is validated in wilder and more challenging\nenvironments like the HUST-LEBW dataset to show the usefulness of mEBAL2 to\ntrain a new generation of data-driven approaches for eyeblink detection.\n","authors":["Roberto Daza","Aythami Morales","Julian Fierrez","Ruben Tolosana","Ruben Vera-Rodriguez"],"pdf_url":"https://arxiv.org/pdf/2309.07880v1.pdf","comment":"This paper is under consideration at Pattern Recognition Letters"},{"id":"http://arxiv.org/abs/2309.07878v1","updated":"2023-09-14T17:24:38Z","published":"2023-09-14T17:24:38Z","title":"Using network metrics to explore the community structure that underlies\n movement patterns","summary":" This work aims to explore the community structure of Santiago de Chile by\nanalyzing the movement patterns of its residents. We use a dataset containing\nthe approximate locations of home and work places for a subset of anonymized\nresidents to construct a network that represents the movement patterns within\nthe city. Through the analysis of this network, we aim to identify the\ncommunities or sub-cities that exist within Santiago de Chile and gain insights\ninto the factors that drive the spatial organization of the city. We employ\nmodularity optimization algorithms and clustering techniques to identify the\ncommunities within the network. Our results present that the novelty of\ncombining community detection algorithms with segregation tools provides new\ninsights to further the understanding of the complex geography of segregation\nduring working hours.\n","authors":["Anh Pham Thi Minh","Abhishek Kumar Singh","Soumya Snigdha Kundu"],"pdf_url":"https://arxiv.org/pdf/2309.07878v1.pdf","comment":"6 pages excluding References"},{"id":"http://arxiv.org/abs/2309.07866v1","updated":"2023-09-14T17:13:54Z","published":"2023-09-14T17:13:54Z","title":"Gradient constrained sharpness-aware prompt learning for vision-language\n models","summary":" This paper targets a novel trade-off problem in generalizable prompt learning\nfor vision-language models (VLM), i.e., improving the performance on unseen\nclasses while maintaining the performance on seen classes. Comparing with\nexisting generalizable methods that neglect the seen classes degradation, the\nsetting of this problem is more strict and fits more closely with practical\napplications. To solve this problem, we start from the optimization\nperspective, and leverage the relationship between loss landscape geometry and\nmodel generalization ability. By analyzing the loss landscape of the\nstate-of-the-art method and the widely-used Sharpness-aware Minimization (SAM),\nwe conclude that the trade-off performance correlates to both loss value and\nloss sharpness, while each of them are indispensable. However, we find the\noptimizing gradient of existing methods cannot always maintain high consistency\nwith both loss value and loss sharpness during the whole optimization\nprocedure. To this end, we propose an novel SAM-based method for prompt\nlearning, denoted as Gradient Constrained Sharpness-aware Context Optimization\n(GCSCoOp), to dynamically constrains the optimizing gradient, thus achieving\nabove two-fold optimization objective simultaneously. 
Extensive experiments\nverify the effectiveness of GCSCoOp in the trade-off problem.\n","authors":["Liangchen Liu","Nannan Wang","Dawei Zhou","Xinbo Gao","Decheng Liu","Xi Yang","Tongliang Liu"],"pdf_url":"https://arxiv.org/pdf/2309.07866v1.pdf","comment":"19 pages 11 figures"},{"id":"http://arxiv.org/abs/2308.12966v2","updated":"2023-09-14T17:08:39Z","published":"2023-08-24T17:59:17Z","title":"Qwen-VL: A Versatile Vision-Language Model for Understanding,\n Localization, Text Reading, and Beyond","summary":" We introduce the Qwen-VL series, a set of large-scale vision-language models\n(LVLMs) designed to perceive and understand both text and images. Comprising\nQwen-VL and Qwen-VL-Chat, these models exhibit remarkable performance in tasks\nlike image captioning, question answering, visual localization, and flexible\ninteraction. The evaluation covers a wide range of tasks including zero-shot\ncaptioning, visual or document visual question answering, and grounding. We\ndemonstrate the Qwen-VL outperforms existing LVLMs. We present their\narchitecture, training, capabilities, and performance, highlighting their\ncontributions to advancing multimodal artificial intelligence. Code, demo and\nmodels are available at https://github.com/QwenLM/Qwen-VL.\n","authors":["Jinze Bai","Shuai Bai","Shusheng Yang","Shijie Wang","Sinan Tan","Peng Wang","Junyang Lin","Chang Zhou","Jingren Zhou"],"pdf_url":"https://arxiv.org/pdf/2308.12966v2.pdf","comment":"Code, demo and models are available at\n https://github.com/QwenLM/Qwen-VL"},{"id":"http://arxiv.org/abs/2309.07849v1","updated":"2023-09-14T16:48:31Z","published":"2023-09-14T16:48:31Z","title":"TFNet: Exploiting Temporal Cues for Fast and Accurate LiDAR Semantic\n Segmentation","summary":" LiDAR semantic segmentation plays a crucial role in enabling autonomous\ndriving and robots to understand their surroundings accurately and robustly.\nThere are different types of methods, such as point-based, range image-based,\nand polar-based. Among these, range image-based methods are widely used due to\ntheir balance between accuracy and speed. However, they face a significant\nchallenge known as the ``many-to-one'' problem caused by the range image's\nlimited horizontal and vertical angular resolution, where around 20% of the 3D\npoints are occluded during model inference based on our observation. In this\npaper, we present TFNet, a range image-based LiDAR semantic segmentation method\nthat utilizes temporal information to address this issue. Specifically, we\nincorporate a temporal fusion layer to extract useful information from previous\nscans and integrate it with the current scan. We then design a max-voting-based\npost-processing technique to correct false predictions, particularly those\ncaused by the ``many-to-one'' issue. Experiments on two benchmarks and seven\nbackbones of three modalities demonstrate the effectiveness and scalability of\nour proposed method.\n","authors":["Rong Li","ShiJie Li","Xieyuanli Chen","Teli Ma","Wang Hao","Juergen Gall","Junwei Liang"],"pdf_url":"https://arxiv.org/pdf/2309.07849v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.07846v1","updated":"2023-09-14T16:40:44Z","published":"2023-09-14T16:40:44Z","title":"MC-NeRF: Muti-Camera Neural Radiance Fields for Muti-Camera Image\n Acquisition Systems","summary":" Neural Radiance Fields (NeRF) employ multi-view images for 3D scene\nrepresentation and have shown remarkable performance. 
As one of the primary\nsources of multi-view images, multi-camera systems encounter challenges such as\nvarying intrinsic parameters and frequent pose changes. Most previous\nNeRF-based methods assume a single global camera and seldom consider\nscenarios with multiple cameras. Besides, some pose-robust methods still remain\nsusceptible to suboptimal solutions when poses are poorly initialized. In this\npaper, we propose MC-NeRF, a method that can jointly optimize both intrinsic and\nextrinsic parameters for bundle-adjusting Neural Radiance Fields. Firstly, we\nconduct a theoretical analysis to tackle the degenerate case and coupling issue\nthat arise from the joint optimization between intrinsic and extrinsic\nparameters. Secondly, based on the proposed solutions, we introduce an\nefficient calibration image acquisition scheme for multi-camera systems,\nincluding the design of the calibration object. Lastly, we present a global\nend-to-end network with a training sequence that enables the regression of\nintrinsic and extrinsic parameters, along with the rendering network. Moreover,\nsince most existing datasets are designed for a single camera, we create a new dataset\nthat includes four different styles of multi-camera acquisition systems,\nallowing readers to generate custom datasets. Experiments confirm the\neffectiveness of our method when each image corresponds to different camera\nparameters. Specifically, we adopt up to 110 images with 110 different\nintrinsic and extrinsic parameters, to achieve 3D scene representation without\nproviding initial poses. The code and supplementary materials are available at\nhttps://in2-viaun.github.io/MC-NeRF.\n","authors":["Yu Gao","Lutong Su","Hao Liang","Yufeng Yue","Yi Yang","Mengyin Fu"],"pdf_url":"https://arxiv.org/pdf/2309.07846v1.pdf","comment":"This manuscript is currently under review"},{"id":"http://arxiv.org/abs/2309.07823v1","updated":"2023-09-14T16:16:57Z","published":"2023-09-14T16:16:57Z","title":"Large-scale Weakly Supervised Learning for Road Extraction from\n Satellite Imagery","summary":" Automatic road extraction from satellite imagery using deep learning is a\nviable alternative to traditional manual mapping. Therefore it has received\nconsiderable attention recently. However, most of the existing methods are\nsupervised and require pixel-level labeling, which is tedious and error-prone.\nTo make matters worse, the earth has a diverse range of terrain, vegetation,\nand man-made objects. It is well known that models trained in one area\ngeneralize poorly to other areas. Various shooting conditions such as light and\nangle, as well as different image processing techniques, further complicate the\nissue. It is impractical to develop training data to cover all image styles.\nThis paper proposes to leverage OpenStreetMap road data as weak labels and\nlarge-scale satellite imagery to pre-train semantic segmentation models. Our\nextensive experimental results show that the prediction accuracy increases with\nthe amount of the weakly labeled data, as well as the road density in the areas\nchosen for training. Using as much as 100 times more data than the widely used\nDeepGlobe road dataset, our model with the D-LinkNet architecture and the\nResNet-50 backbone exceeds the top performer of the current DeepGlobe\nleaderboard. 
Furthermore, due to large-scale pre-training, our model\ngeneralizes much better than those trained with only the curated datasets,\nimplying great application potential.\n","authors":["Shiqiao Meng","Zonglin Di","Siwei Yang","Yin Wang"],"pdf_url":"https://arxiv.org/pdf/2309.07823v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.07819v1","updated":"2023-09-14T16:14:38Z","published":"2023-09-14T16:14:38Z","title":"Decomposition of linear tensor transformations","summary":" One of the main issues in computing a tensor decomposition is how to choose\nthe number of rank-one components, since there are no finite algorithms for\ndetermining the rank of a tensor. A commonly used approach for this purpose is\nto find a low-dimensional subspace by solving an optimization problem and\nassuming the number of components is fixed. However, even though this algorithm\nis efficient and easy to implement, it often converges to poor local minima and\nsuffers from outliers and noise. The aim of this paper is to develop a\nmathematical framework for exact tensor decomposition that is able to represent\na tensor as the sum of a finite number of low-rank tensors. In the paper, three\ndifferent problems are addressed to derive: i) the decomposition of a\nnon-negative self-adjoint tensor operator; ii) the decomposition of a linear\ntensor transformation; iii) the decomposition of a generic tensor.\n","authors":["Claudio Turchetti"],"pdf_url":"https://arxiv.org/pdf/2309.07819v1.pdf","comment":"arXiv admin note: text overlap with arXiv:2305.02803"},{"id":"http://arxiv.org/abs/2205.11110v2","updated":"2023-09-14T16:08:10Z","published":"2022-05-23T07:58:50Z","title":"Meta-Learning Regrasping Strategies for Physical-Agnostic Objects","summary":" Grasping inhomogeneous objects in real-world applications remains a\nchallenging task due to unknown physical properties such as mass\ndistribution and coefficient of friction. In this study, we propose a\nmeta-learning algorithm called ConDex, which incorporates Conditional Neural\nProcesses (CNP) with DexNet-2.0 to autonomously discern the underlying physical\nproperties of objects using depth images. ConDex efficiently acquires physical\nembeddings from limited trials, enabling precise grasping point estimation.\nFurthermore, ConDex is capable of updating the predicted grasping quality\niteratively from new trials in an online fashion. To the best of our knowledge,\nwe are the first to generate two object datasets focusing on inhomogeneous\nphysical properties with varying mass distributions and friction coefficients.\nExtensive evaluations in simulation demonstrate ConDex's superior performance\nover DexNet-2.0 and existing meta-learning-based grasping pipelines.\nFurthermore, ConDex shows robust generalization to previously unseen real-world\nobjects despite being trained solely in simulation. The synthetic and real-world\ndatasets will be published as well.\n","authors":["Ning Gao","Jingyu Zhang","Ruijie Chen","Ngo Anh Vien","Hanna Ziesche","Gerhard Neumann"],"pdf_url":"https://arxiv.org/pdf/2205.11110v2.pdf","comment":"Accepted as spotlight in ICRA 2022 Workshop: Scaling Robot Learning"},{"id":"http://arxiv.org/abs/2304.01029v2","updated":"2023-09-14T16:05:46Z","published":"2023-04-03T14:28:29Z","title":"Domain Generalization for Crop Segmentation with Knowledge Distillation","summary":" In recent years, precision agriculture has gradually oriented farming closer\nto automation processes to support all the activities related to field\nmanagement. 
Service robotics plays a predominant role in this evolution by\ndeploying autonomous agents that can navigate fields while performing tasks\nwithout human intervention, such as monitoring, spraying, and harvesting. To\nexecute these precise actions, mobile robots need a real-time perception system\nthat understands their surroundings and identifies their targets in the wild.\nGeneralizing to new crops and environmental conditions is critical for\npractical applications, as labeled samples are rarely available. In this paper,\nwe investigate the problem of crop segmentation and propose a novel approach to\nenhance domain generalization using knowledge distillation. In the proposed\nframework, we transfer knowledge from an ensemble of models individually\ntrained on source domains to a student model that can adapt to unseen target\ndomains. To evaluate the proposed method, we present a synthetic multi-domain\ndataset for crop segmentation containing plants of variegated shapes and\ncovering different terrain styles, weather conditions, and light scenarios for\nmore than 50,000 samples. We demonstrate significant improvements in\nperformance over state-of-the-art methods and superior sim-to-real\ngeneralization. Our approach provides a promising solution for domain\ngeneralization in crop segmentation and has the potential to enhance a wide\nvariety of precision agriculture applications.\n","authors":["Simone Angarano","Mauro Martini","Alessandro Navone","Marcello Chiaberge"],"pdf_url":"https://arxiv.org/pdf/2304.01029v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.07808v1","updated":"2023-09-14T15:54:56Z","published":"2023-09-14T15:54:56Z","title":"What Matters to Enhance Traffic Rule Compliance of Imitation Learning\n for Automated Driving","summary":" More research attention has recently been given to end-to-end autonomous\ndriving technologies where the entire driving pipeline is replaced with a\nsingle neural network because of its simpler structure and faster inference\ntime. Although this appealing approach largely reduces the components in the\ndriving pipeline, its simplicity also leads to interpretability problems and\nsafety issues arXiv:2003.06404. The trained policy is not always compliant with\nthe traffic rules, and it is also hard to discover the reason for its\nmisbehavior because of the lack of intermediate outputs. Meanwhile, sensors are\nalso critical to autonomous driving's security and feasibility to perceive the\nsurrounding environment under complex driving scenarios. In this paper, we\npropose P-CSG, a novel penalty-based imitation learning approach with cross\nsemantics generation sensor fusion technologies to increase the overall\nperformance of end-to-end autonomous driving. We conducted an assessment of our\nmodel's performance using the Town 05 Long benchmark, achieving an impressive\ndriving score improvement of over 15%. 
Furthermore, we conducted robustness\nevaluations against adversarial attacks like FGSM and Dot attacks, revealing a\nsubstantial increase in robustness compared to baseline models. More detailed\ninformation, such as code-based resources, ablation studies, and videos, can be\nfound at https://hk-zh.github.io/p-csg-plus.\n","authors":["Hongkuan Zhou","Aifen Sui","Wei Cao","Letian Shi"],"pdf_url":"https://arxiv.org/pdf/2309.07808v1.pdf","comment":"8 pages, 2 figures"},{"id":"http://arxiv.org/abs/2305.19862v3","updated":"2023-09-14T15:39:30Z","published":"2023-05-31T13:55:00Z","title":"Self-supervised Learning to Bring Dual Reversed Rolling Shutter Images\n Alive","summary":" Modern consumer cameras usually employ the rolling shutter (RS) mechanism,\nwhere images are captured by scanning scenes row-by-row, yielding RS\ndistortions for dynamic scenes. To correct RS distortions, existing methods\nadopt a fully supervised learning manner, where high framerate global shutter\n(GS) images should be collected as ground-truth supervision. In this paper, we\npropose a Self-supervised learning framework for Dual reversed RS distortions\nCorrection (SelfDRSC), where a DRSC network can be learned to generate a high\nframerate GS video only based on dual RS images with reversed distortions. In\nparticular, a bidirectional distortion warping module is proposed for\nreconstructing dual reversed RS images, and then a self-supervised loss can be\ndeployed to train the DRSC network by enhancing the cycle consistency between input\nand reconstructed dual reversed RS images. Besides the start and end RS scanning\ntime, GS images at arbitrary intermediate scanning time can also be supervised\nin SelfDRSC, thus enabling the learned DRSC network to generate a high\nframerate GS video. Moreover, a simple yet effective self-distillation strategy\nis introduced in the self-supervised loss for mitigating boundary artifacts in\ngenerated GS images. On the synthetic dataset, SelfDRSC achieves better or\ncomparable quantitative metrics in comparison to state-of-the-art methods\ntrained in a fully supervised manner. On real-world RS cases, our SelfDRSC\ncan produce high framerate GS videos with finer correction textures and better\ntemporal consistency. The source code and trained models are made publicly\navailable at https://github.com/shangwei5/SelfDRSC. We also provide an\nimplementation in HUAWEI Mindspore at\nhttps://github.com/Hunter-Will/SelfDRSC-mindspore.\n","authors":["Wei Shang","Dongwei Ren","Chaoyu Feng","Xiaotao Wang","Lei Lei","Wangmeng Zuo"],"pdf_url":"https://arxiv.org/pdf/2305.19862v3.pdf","comment":"Accepted by ICCV 2023, available at\n https://github.com/shangwei5/SelfDRSC"},{"id":"http://arxiv.org/abs/2309.07796v1","updated":"2023-09-14T15:35:08Z","published":"2023-09-14T15:35:08Z","title":"For A More Comprehensive Evaluation of 6DoF Object Pose Tracking","summary":" Previous evaluations on 6DoF object pose tracking have presented obvious\nlimitations along with the development of this area. In particular, the\nevaluation protocols are not unified for different methods, the widely-used\nYCBV dataset contains significant annotation error, and the error metrics also\nmay be biased. As a result, it is hard to fairly compare the methods, which has\nbecome a big obstacle for developing new algorithms. In this paper, we\ncontribute a unified benchmark to address the above problems. 
For more accurate\nannotation of YCBV, we propose a multi-view multi-object global pose refinement\nmethod, which can jointly refine the poses of all objects and view cameras,\nresulting in sub-pixel, sub-millimeter alignment errors. The limitations of\nprevious scoring methods and error metrics are analyzed, based on which we\nintroduce our improved evaluation methods. The unified benchmark takes both\nYCBV and BCOT as base datasets, which are shown to be complementary in scene\ncategories. In experiments, we validate the precision and reliability of the\nproposed global pose refinement method with a realistic semi-synthesized\ndataset particularly for YCBV, and then present the benchmark results unifying\nlearning&non-learning and RGB&RGBD methods, with some findings not discovered in\nprevious studies.\n","authors":["Yang Li","Fan Zhong","Xin Wang","Shuangbing Song","Jiachen Li","Xueying Qin","Changhe Tu"],"pdf_url":"https://arxiv.org/pdf/2309.07796v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.08518v3","updated":"2023-09-14T15:23:48Z","published":"2023-08-16T17:13:45Z","title":"Exploiting Point-Wise Attention in 6D Object Pose Estimation Based on\n Bidirectional Prediction","summary":" Traditional geometric registration based estimation methods only exploit the\nCAD model implicitly, which leads to their dependence on observation quality\nand their vulnerability to occlusion. To address the problem, the paper proposes a\nbidirectional correspondence prediction network with a point-wise\nattention-aware mechanism. This network not only requires the model points to\npredict the correspondence but also explicitly models the geometric\nsimilarities between observations and the model prior. Our key insight is that\nthe correlations between each model point and scene point provide essential\ninformation for learning point-pair matches. To further tackle the correlation\nnoise brought by feature distribution divergence, we design a simple but\neffective pseudo-siamese network to improve feature homogeneity. Experimental\nresults on the public datasets of LineMOD, YCB-Video, and Occ-LineMOD show that\nthe proposed method achieves better performance than other state-of-the-art\nmethods under the same evaluation criteria. Its robustness in estimating poses\nis greatly improved, especially in an environment with severe occlusions.\n","authors":["Yuhao Yang","Jun Wu","Yue Wang","Guangjian Zhang","Rong Xiong"],"pdf_url":"https://arxiv.org/pdf/2308.08518v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.07778v1","updated":"2023-09-14T15:09:35Z","published":"2023-09-14T15:09:35Z","title":"Virchow: A Million-Slide Digital Pathology Foundation Model","summary":" Computational pathology uses artificial intelligence to enable precision\nmedicine and decision support systems through the analysis of whole slide\nimages. It has the potential to revolutionize the diagnosis and treatment of\ncancer. However, a major challenge to this objective is that for many specific\ncomputational pathology tasks the amount of data is inadequate for development.\nTo address this challenge, we created Virchow, a 632 million parameter deep\nneural network foundation model for computational pathology. Using\nself-supervised learning, Virchow is trained on 1.5 million hematoxylin and\neosin stained whole slide images from diverse tissue groups, which is orders of\nmagnitude more data than previous works. 
When evaluated on downstream tasks\nincluding tile-level pan-cancer detection and subtyping and slide-level\nbiomarker prediction, Virchow outperforms state-of-the-art systems both on\ninternal datasets drawn from the same population as the pretraining data as\nwell as external public datasets. Virchow achieves 93% balanced accuracy for\npancancer tile classification, and AUCs of 0.983 for colon microsatellite\ninstability status prediction and 0.967 for breast CDH1 status prediction. The\ngains in performance highlight the importance of pretraining on massive\npathology image datasets, suggesting pretraining on even larger datasets could\ncontinue improving performance for many high-impact applications where limited\namounts of training data are available, such as drug outcome prediction.\n","authors":["Eugene Vorontsov","Alican Bozkurt","Adam Casson","George Shaikovski","Michal Zelechowski","Siqi Liu","Philippe Mathieu","Alexander van Eck","Donghun Lee","Julian Viret","Eric Robert","Yi Kan Wang","Jeremy D. Kun","Matthew C. H. Le","Jan Bernhard","Ran A. Godrich","Gerard Oakley","Ewan Millar","Matthew Hanna","Juan Retamero","William A. Moye","Razik Yousfi","Christopher Kanan","David Klimstra","Brandon Rothrock","Thomas J. Fuchs"],"pdf_url":"https://arxiv.org/pdf/2309.07778v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.13437v2","updated":"2023-09-14T15:00:06Z","published":"2023-08-25T15:33:47Z","title":"Position-Enhanced Visual Instruction Tuning for Multimodal Large\n Language Models","summary":" Recently, Multimodal Large Language Models (MLLMs) that enable Large Language\nModels (LLMs) to interpret images through visual instruction tuning have\nachieved significant success. However, existing visual instruction tuning\nmethods only utilize image-language instruction data to align the language and\nimage modalities, lacking a more fine-grained cross-modal alignment. In this\npaper, we propose Position-enhanced Visual Instruction Tuning (PVIT), which\nextends the functionality of MLLMs by integrating an additional region-level\nvision encoder. This integration promotes a more detailed comprehension of\nimages for the MLLM. In addition, to efficiently achieve a fine-grained\nalignment between the vision modules and the LLM, we design multiple data\ngeneration strategies to construct an image-region-language instruction\ndataset. Finally, we present both quantitative experiments and qualitative\nanalysis that demonstrate the superiority of the proposed model. Code and data\nwill be released at https://github.com/PVIT-official/PVIT.\n","authors":["Chi Chen","Ruoyu Qin","Fuwen Luo","Xiaoyue Mi","Peng Li","Maosong Sun","Yang Liu"],"pdf_url":"https://arxiv.org/pdf/2308.13437v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.07760v1","updated":"2023-09-14T14:48:01Z","published":"2023-09-14T14:48:01Z","title":"PRE: Vision-Language Prompt Learning with Reparameterization Encoder","summary":" Large pre-trained vision-language models such as CLIP have demonstrated great\npotential in zero-shot transferability to downstream tasks. However, to attain\noptimal performance, the manual selection of prompts is necessary to improve\nalignment between the downstream image distribution and the textual class\ndescriptions. This manual prompt engineering is the major challenge for\ndeploying such models in practice since it requires domain expertise and is\nextremely time-consuming. 
To avoid non-trivial prompt engineering, the recent work\nContext Optimization (CoOp) introduced the concept of prompt learning to the\nvision domain using learnable textual tokens. While CoOp can achieve\nsubstantial improvements over manual prompts, its learned context\ngeneralizes poorly to wider unseen classes within the same dataset. In this work, we\npresent Prompt Learning with Reparameterization Encoder (PRE) - a simple and\nefficient method that enhances the generalization ability of the learnable\nprompt to unseen classes while maintaining the capacity to learn Base classes.\nInstead of directly optimizing the prompts, PRE employs a prompt encoder to\nreparameterize the input prompt embeddings, enhancing the exploration of\ntask-specific knowledge from few-shot samples. Experiments and extensive\nablation studies on 8 benchmarks demonstrate that our approach is an efficient\nmethod for prompt learning. Specifically, PRE achieves a notable enhancement of\n5.60% in average accuracy on New classes and 3% in Harmonic mean compared to\nCoOp in the 16-shot setting, all achieved with a reasonable training time.\n","authors":["Anh Pham Thi Minh"],"pdf_url":"https://arxiv.org/pdf/2309.07760v1.pdf","comment":"8 pages excluding References and Appendix"},{"id":"http://arxiv.org/abs/2208.00085v3","updated":"2023-09-14T14:45:30Z","published":"2022-07-29T21:56:59Z","title":"Machine Learning and Computer Vision Techniques in Continuous Beehive\n Monitoring Applications: A survey","summary":" The wide use and availability of machine learning and computer vision\ntechniques allow the development of relatively complex monitoring systems in many\ndomains. Besides the traditional industrial domain, new applications also appear\nin biology and agriculture, where we could speak about the detection of\ninfections, parasites, and weeds, but also about automated monitoring and early\nwarning systems. This is also connected with the introduction of easily\naccessible hardware and development kits such as Arduino or the Raspberry Pi\nfamily. In this paper, we survey 50 existing papers focusing on methods for\nautomated beehive monitoring using computer vision techniques,\nparticularly on pollen and Varroa mite detection together with bee\ntraffic monitoring. Such systems could also be used for the monitoring of\nhoneybee colonies and for the inspection of their health state, which could\nidentify potentially dangerous states before the situation is critical, or to\nbetter plan periodic bee colony inspections and therefore save significant\ncosts. Later, we also include an analysis of the research trends in this\napplication field and outline possible directions for new\nexplorations. Our paper also aims to introduce veterinary and apidology professionals\nand experts who might not be familiar with machine learning to its\npossibilities; therefore, each family of applications is opened by a\nbrief theoretical introduction and motivation related to its base method. 
We\nhope that this paper will inspire other scientists to use machine learning\ntechniques for other applications in beehive monitoring.\n","authors":["Simon Bilik","Tomas Zemcik","Lukas Kratochvila","Dominik Ricanek","Milos Richter","Sebastian Zambanini","Karel Horak"],"pdf_url":"https://arxiv.org/pdf/2208.00085v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2306.14565v2","updated":"2023-09-14T14:41:52Z","published":"2023-06-26T10:26:33Z","title":"Mitigating Hallucination in Large Multi-Modal Models via Robust\n Instruction Tuning","summary":" Despite the promising progress in multi-modal tasks, current large\nmulti-modal models (LMM) are prone to hallucinating inconsistent descriptions\nwith respect to the associated image and human instructions. This paper\naddresses this issue by introducing the first large and diverse visual\ninstruction tuning dataset, named Large-scale Robust Visual (LRV)-Instruction.\nOur dataset consists of 120k visual instructions generated by GPT4, covering 16\nvision-and-language tasks with open-ended instructions and answers. Unlike\nexisting studies that primarily focus on positive instruction samples, we\ndesign LRV-Instruction to include both positive and negative instructions for\nmore robust visual instruction tuning. Our negative instructions are designed\nat two semantic levels: (i) Nonexistent Element Manipulation and (ii) Existent\nElement Manipulation. To efficiently measure the hallucination generated by\nLMMs, we propose GPT4-Assisted Visual Instruction Evaluation (GAVIE), a novel\napproach to evaluate visual instruction tuning without the need for\nhuman-annotated groundtruth answers and can adapt to diverse instruction\nformats. We conduct comprehensive experiments to investigate the hallucination\nof LMMs. Our results demonstrate that existing LMMs exhibit significant\nhallucination when presented with our negative instructions, particularly with\nExistent Element Manipulation instructions. Moreover, by finetuning MiniGPT4 on\nLRV-Instruction, we successfully mitigate hallucination while improving\nperformance on public datasets using less training data compared to\nstate-of-the-art methods. Additionally, we observed that a balanced ratio of\npositive and negative instances in the training data leads to a more robust\nmodel. Updates of our project are available at\nhttps://fuxiaoliu.github.io/LRV/.\n","authors":["Fuxiao Liu","Kevin Lin","Linjie Li","Jianfeng Wang","Yaser Yacoob","Lijuan Wang"],"pdf_url":"https://arxiv.org/pdf/2306.14565v2.pdf","comment":"35 pages, 27 figures. Under Review"},{"id":"http://arxiv.org/abs/2309.07753v1","updated":"2023-09-14T14:39:07Z","published":"2023-09-14T14:39:07Z","title":"Co-Salient Object Detection with Semantic-Level Consensus Extraction and\n Dispersion","summary":" Given a group of images, co-salient object detection (CoSOD) aims to\nhighlight the common salient object in each image. There are two factors\nclosely related to the success of this task, namely consensus extraction, and\nthe dispersion of consensus to each image. Most previous works represent the\ngroup consensus using local features, while we instead utilize a hierarchical\nTransformer module for extracting semantic-level consensus. Therefore, it can\nobtain a more comprehensive representation of the common object category, and\nexclude interference from other objects that share local similarities with the\ntarget object. 
In addition, we propose a Transformer-based dispersion module\nthat takes into account the variation of the co-salient object in different\nscenes. It distributes the consensus to the image feature maps in an\nimage-specific way while making full use of interactions within the group.\nThese two modules are integrated with a ViT encoder and an FPN-like decoder to\nform an end-to-end trainable network, without additional branch and auxiliary\nloss. The proposed method is evaluated on three commonly used CoSOD datasets\nand achieves state-of-the-art performance.\n","authors":["Peiran Xu","Yadong Mu"],"pdf_url":"https://arxiv.org/pdf/2309.07753v1.pdf","comment":"Accepted by ACM MM 2023"},{"id":"http://arxiv.org/abs/2309.07752v1","updated":"2023-09-14T14:39:05Z","published":"2023-09-14T14:39:05Z","title":"DT-NeRF: Decomposed Triplane-Hash Neural Radiance Fields for\n High-Fidelity Talking Portrait Synthesis","summary":" In this paper, we present the decomposed triplane-hash neural radiance fields\n(DT-NeRF), a framework that significantly improves the photorealistic rendering\nof talking faces and achieves state-of-the-art results on key evaluation\ndatasets. Our architecture decomposes the facial region into two specialized\ntriplanes: one specialized for representing the mouth, and the other for the\nbroader facial features. We introduce audio features as residual terms and\nintegrate them as query vectors into our model through an audio-mouth-face\ntransformer. Additionally, our method leverages the capabilities of Neural\nRadiance Fields (NeRF) to enrich the volumetric representation of the entire\nface through additive volumetric rendering techniques. Comprehensive\nexperimental evaluations corroborate the effectiveness and superiority of our\nproposed approach.\n","authors":["Yaoyu Su","Shaohui Wang","Haoqian Wang"],"pdf_url":"https://arxiv.org/pdf/2309.07752v1.pdf","comment":"5 pages, 5 figures. Submitted to ICASSP 2024"},{"id":"http://arxiv.org/abs/2309.07749v1","updated":"2023-09-14T14:36:22Z","published":"2023-09-14T14:36:22Z","title":"OmnimatteRF: Robust Omnimatte with 3D Background Modeling","summary":" Video matting has broad applications, from adding interesting effects to\ncasually captured movies to assisting video production professionals. Matting\nwith associated effects such as shadows and reflections has also attracted\nincreasing research activity, and methods like Omnimatte have been proposed to\nseparate dynamic foreground objects of interest into their own layers. However,\nprior works represent video backgrounds as 2D image layers, limiting their\ncapacity to express more complicated scenes, thus hindering application to\nreal-world videos. In this paper, we propose a novel video matting method,\nOmnimatteRF, that combines dynamic 2D foreground layers and a 3D background\nmodel. The 2D layers preserve the details of the subjects, while the 3D\nbackground robustly reconstructs scenes in real-world videos. Extensive\nexperiments demonstrate that our method reconstructs scenes with better quality\non various videos.\n","authors":["Geng Lin","Chen Gao","Jia-Bin Huang","Changil Kim","Yipeng Wang","Matthias Zwicker","Ayush Saraf"],"pdf_url":"https://arxiv.org/pdf/2309.07749v1.pdf","comment":"ICCV 2023. 
Project page: https://omnimatte-rf.github.io/"},{"id":"http://arxiv.org/abs/2309.05446v2","updated":"2023-09-14T14:30:04Z","published":"2023-09-11T13:39:15Z","title":"A Localization-to-Segmentation Framework for Automatic Tumor\n Segmentation in Whole-Body PET/CT Images","summary":" Fluorodeoxyglucose (FDG) positron emission tomography (PET) combined with\ncomputed tomography (CT) is considered the primary solution for detecting some\ncancers, such as lung cancer and melanoma. Automatic segmentation of tumors in\nPET/CT images can help reduce doctors' workload, thereby improving diagnostic\nquality. However, precise tumor segmentation is challenging due to the small\nsize of many tumors and the similarity of high-uptake normal areas to the tumor\nregions. To address these issues, this paper proposes a\nlocalization-to-segmentation framework (L2SNet) for precise tumor segmentation.\nL2SNet first localizes the possible lesions in the lesion localization phase\nand then uses the location cues to shape the segmentation results in the lesion\nsegmentation phase. To further improve the segmentation performance of L2SNet,\nwe design an adaptive threshold scheme that takes the segmentation results of\nthe two phases into consideration. The experiments with the MICCAI 2023\nAutomated Lesion Segmentation in Whole-Body FDG-PET/CT challenge dataset show\nthat our method achieved a competitive result and was ranked in the top 7\nmethods on the preliminary test set. Our work is available at:\nhttps://github.com/MedCAI/L2SNet.\n","authors":["Linghan Cai","Jianhao Huang","Zihang Zhu","Jinpeng Lu","Yongbing Zhang"],"pdf_url":"https://arxiv.org/pdf/2309.05446v2.pdf","comment":"7 pages,3 figures"},{"id":"http://arxiv.org/abs/2309.00923v2","updated":"2023-09-14T14:05:02Z","published":"2023-09-02T12:07:21Z","title":"GBE-MLZSL: A Group Bi-Enhancement Framework for Multi-Label Zero-Shot\n Learning","summary":" This paper investigates a challenging problem of zero-shot learning in the\nmulti-label scenario (MLZSL), wherein, the model is trained to recognize\nmultiple unseen classes within a sample (e.g., an image) based on seen classes\nand auxiliary knowledge, e.g., semantic information. Existing methods usually\nresort to analyzing the relationship of various seen classes residing in a\nsample from the dimension of spatial or semantic characteristics, and transfer\nthe learned model to unseen ones. But they ignore the effective integration of\nlocal and global features. That is, in the process of inferring unseen classes,\nglobal features represent the principal direction of the image in the feature\nspace, while local features should maintain uniqueness within a certain range.\nThis integrated neglect will make the model lose its grasp of the main\ncomponents of the image. Relying only on the local existence of seen classes\nduring the inference stage introduces unavoidable bias. In this paper, we\npropose a novel and effective group bi-enhancement framework for MLZSL, dubbed\nGBE-MLZSL, to fully make use of such properties and enable a more accurate and\nrobust visual-semantic projection. Specifically, we split the feature maps into\nseveral feature groups, of which each feature group can be trained\nindependently with the Local Information Distinguishing Module (LID) to ensure\nuniqueness. Meanwhile, a Global Enhancement Module (GEM) is designed to\npreserve the principal direction. Besides, a static graph structure is designed\nto construct the correlation of local features. 
Experiments on large-scale\nMLZSL benchmark datasets NUS-WIDE and Open-Images-v4 demonstrate that the\nproposed GBE-MLZSL outperforms other state-of-the-art methods with large\nmargins.\n","authors":["Ziming Liu","Jingcai Guo","Xiaocheng Lu","Song Guo","Peiran Dong","Jiewei Zhang"],"pdf_url":"https://arxiv.org/pdf/2309.00923v2.pdf","comment":"11 pages, 8 figures"},{"id":"http://arxiv.org/abs/2110.08797v2","updated":"2023-09-14T13:37:38Z","published":"2021-10-17T11:29:13Z","title":"Towards Language-guided Visual Recognition via Dynamic Convolutions","summary":" In this paper, we are committed to establishing an unified and end-to-end\nmulti-modal network via exploring the language-guided visual recognition. To\napproach this target, we first propose a novel multi-modal convolution module\ncalled Language-dependent Convolution (LaConv). Its convolution kernels are\ndynamically generated based on natural language information, which can help\nextract differentiated visual features for different multi-modal examples.\nBased on the LaConv module, we further build the first fully language-driven\nconvolution network, termed as LaConvNet, which can unify the visual\nrecognition and multi-modal reasoning in one forward structure. To validate\nLaConv and LaConvNet, we conduct extensive experiments on four benchmark\ndatasets of two vision-and-language tasks, i.e., visual question answering\n(VQA) and referring expression comprehension (REC). The experimental results\nnot only shows the performance gains of LaConv compared to the existing\nmulti-modal modules, but also witness the merits of LaConvNet as an unified\nnetwork, including compact network, high generalization ability and excellent\nperformance, e.g., +4.7% on RefCOCO+.\n","authors":["Gen Luo","Yiyi Zhou","Xiaoshuai Sun","Yongjian Wu","Yue Gao","Rongrong Ji"],"pdf_url":"https://arxiv.org/pdf/2110.08797v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2204.07913v2","updated":"2023-09-14T13:33:43Z","published":"2022-04-17T03:04:03Z","title":"A Survivor in the Era of Large-Scale Pretraining: An Empirical Study of\n One-Stage Referring Expression Comprehension","summary":" Most of the existing work in one-stage referring expression comprehension\n(REC) mainly focuses on multi-modal fusion and reasoning, while the influence\nof other factors in this task lacks in-depth exploration. To fill this gap, we\nconduct an empirical study in this paper. Concretely, we first build a very\nsimple REC network called SimREC, and ablate 42 candidate designs/settings,\nwhich covers the entire process of one-stage REC from network design to model\ntraining. Afterwards, we conduct over 100 experimental trials on three\nbenchmark datasets of REC. The extensive experimental results not only show the\nkey factors that affect REC performance in addition to multi-modal fusion,\ne.g., multi-scale features and data augmentation, but also yield some findings\nthat run counter to conventional understanding. For example, as a vision and\nlanguage (V&L) task, REC does is less impacted by language prior. In addition,\nwith a proper combination of these findings, we can improve the performance of\nSimREC by a large margin, e.g., +27.12% on RefCOCO+, which outperforms all\nexisting REC methods. 
But the most encouraging finding is that with much less\ntraining overhead and parameters, SimREC can still achieve better performance\nthan a set of large-scale pre-trained models, e.g., UNITER and VILLA,\nportraying the special role of REC in existing V&L research.\n","authors":["Gen Luo","Yiyi Zhou","Jiamu Sun","Xiaoshuai Sun","Rongrong Ji"],"pdf_url":"https://arxiv.org/pdf/2204.07913v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.07704v1","updated":"2023-09-14T13:29:41Z","published":"2023-09-14T13:29:41Z","title":"NutritionVerse: Empirical Study of Various Dietary Intake Estimation\n Approaches","summary":" Accurate dietary intake estimation is critical for informing policies and\nprograms to support healthy eating, as malnutrition has been directly linked to\ndecreased quality of life. However self-reporting methods such as food diaries\nsuffer from substantial bias. Other conventional dietary assessment techniques\nand emerging alternative approaches such as mobile applications incur high time\ncosts and may necessitate trained personnel. Recent work has focused on using\ncomputer vision and machine learning to automatically estimate dietary intake\nfrom food images, but the lack of comprehensive datasets with diverse\nviewpoints, modalities and food annotations hinders the accuracy and realism of\nsuch methods. To address this limitation, we introduce NutritionVerse-Synth,\nthe first large-scale dataset of 84,984 photorealistic synthetic 2D food images\nwith associated dietary information and multimodal annotations (including depth\nimages, instance masks, and semantic masks). Additionally, we collect a real\nimage dataset, NutritionVerse-Real, containing 889 images of 251 dishes to\nevaluate realism. Leveraging these novel datasets, we develop and benchmark\nNutritionVerse, an empirical study of various dietary intake estimation\napproaches, including indirect segmentation-based and direct prediction\nnetworks. We further fine-tune models pretrained on synthetic data with real\nimages to provide insights into the fusion of synthetic and real data. Finally,\nwe release both datasets (NutritionVerse-Synth, NutritionVerse-Real) on\nhttps://www.kaggle.com/nutritionverse/datasets as part of an open initiative to\naccelerate machine learning for dietary sensing.\n","authors":["Chi-en Amy Tai","Matthew Keller","Saeejith Nair","Yuhao Chen","Yifan Wu","Olivia Markham","Krish Parmar","Pengcheng Xi","Heather Keller","Sharon Kirkpatrick","Alexander Wong"],"pdf_url":"https://arxiv.org/pdf/2309.07704v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.07698v1","updated":"2023-09-14T13:17:02Z","published":"2023-09-14T13:17:02Z","title":"Dataset Condensation via Generative Model","summary":" Dataset condensation aims to condense a large dataset with a lot of training\nsamples into a small set. Previous methods usually condense the dataset into\nthe pixels format. However, it suffers from slow optimization speed and large\nnumber of parameters to be optimized. When increasing image resolutions and\nclasses, the number of learnable parameters grows accordingly, prohibiting\ncondensation methods from scaling up to large datasets with diverse classes.\nMoreover, the relations among condensed samples have been neglected and hence\nthe feature distribution of condensed samples is often not diverse. To solve\nthese problems, we propose to condense the dataset into another format, a\ngenerative model. 
Such a novel format allows for the condensation of large\ndatasets because the size of the generative model remains relatively stable as\nthe number of classes or image resolution increases. Furthermore, an\nintra-class and an inter-class loss are proposed to model the relation of\ncondensed samples. Intra-class loss aims to create more diverse samples for\neach class by pushing each sample away from the others of the same class.\nMeanwhile, inter-class loss increases the discriminability of samples by\nwidening the gap between the centers of different classes. Extensive\ncomparisons with state-of-the-art methods and our ablation studies confirm the\neffectiveness of our method and its individual component. To our best\nknowledge, we are the first to successfully conduct condensation on\nImageNet-1k.\n","authors":["David Junhao Zhang","Heng Wang","Chuhui Xue","Rui Yan","Wenqing Zhang","Song Bai","Mike Zheng Shou"],"pdf_url":"https://arxiv.org/pdf/2309.07698v1.pdf","comment":"old work,done in 2022"},{"id":"http://arxiv.org/abs/2211.06660v2","updated":"2023-09-14T13:13:25Z","published":"2022-11-12T13:32:19Z","title":"Far Away in the Deep Space: Dense Nearest-Neighbor-Based\n Out-of-Distribution Detection","summary":" The key to out-of-distribution detection is density estimation of the\nin-distribution data or of its feature representations. This is particularly\nchallenging for dense anomaly detection in domains where the in-distribution\ndata has a complex underlying structure. Nearest-Neighbors approaches have been\nshown to work well in object-centric data domains, such as industrial\ninspection and image classification. In this paper, we show that\nnearest-neighbor approaches also yield state-of-the-art results on dense\nnovelty detection in complex driving scenes when working with an appropriate\nfeature representation. In particular, we find that transformer-based\narchitectures produce representations that yield much better similarity metrics\nfor the task. We identify the multi-head structure of these models as one of\nthe reasons, and demonstrate a way to transfer some of the improvements to\nCNNs. Ultimately, the approach is simple and non-invasive, i.e., it does not\naffect the primary segmentation performance, refrains from training on examples\nof anomalies, and achieves state-of-the-art results on RoadAnomaly,\nStreetHazards, and SegmentMeIfYouCan-Anomaly.\n","authors":["Silvio Galesso","Max Argus","Thomas Brox"],"pdf_url":"https://arxiv.org/pdf/2211.06660v2.pdf","comment":"Workshop on Uncertainty Quantification for Computer Vision, ICCV\n 2023. Code at: https://github.com/silviogalesso/dense-ood-knns"},{"id":"http://arxiv.org/abs/2309.07668v1","updated":"2023-09-14T12:30:48Z","published":"2023-09-14T12:30:48Z","title":"CoRF : Colorizing Radiance Fields using Knowledge Distillation","summary":" Neural radiance field (NeRF) based methods enable high-quality novel-view\nsynthesis for multi-view images. This work presents a method for synthesizing\ncolorized novel views from input grey-scale multi-view images. When we apply\nimage or video-based colorization methods on the generated grey-scale novel\nviews, we observe artifacts due to inconsistency across views. Training a\nradiance field network on the colorized grey-scale image sequence also does not\nsolve the 3D consistency issue. We propose a distillation based method to\ntransfer color knowledge from the colorization networks trained on natural\nimages to the radiance field network. 
Specifically, our method uses the\nradiance field network as a 3D representation and transfers knowledge from\nexisting 2D colorization methods. The experimental results demonstrate that the\nproposed method produces superior colorized novel views for indoor and outdoor\nscenes while maintaining better cross-view consistency than baselines. Further, we\nshow the efficacy of our method on applications like colorization of a radiance\nfield network trained from 1.) Infra-Red (IR) multi-view images and 2.) old\ngrey-scale multi-view image sequences.\n","authors":["Ankit Dhiman","R Srinath","Srinjay Sarkar","Lokesh R Boregowda","R Venkatesh Babu"],"pdf_url":"https://arxiv.org/pdf/2309.07668v1.pdf","comment":"AI3DCC @ ICCV 2023"},{"id":"http://arxiv.org/abs/2309.07654v1","updated":"2023-09-14T12:17:38Z","published":"2023-09-14T12:17:38Z","title":"Towards Robust and Unconstrained Full Range of Rotation Head Pose\n Estimation","summary":" Estimating the head pose of a person is a crucial problem for numerous\napplications that is yet mainly addressed as a subtask of frontal pose\nprediction. We present a novel method for unconstrained end-to-end head pose\nestimation to tackle the challenging task of full range of orientation head\npose prediction. We address the issue of ambiguous rotation labels by\nintroducing the rotation matrix formalism for our ground truth data and propose\na continuous 6D rotation matrix representation for efficient and robust direct\nregression. This allows the model to efficiently learn full rotation appearance and to\novercome the limitations of the current state-of-the-art. Together with new\naccumulated training data that provides full head pose rotation data and a\ngeodesic loss approach for stable learning, we design an advanced model that is\nable to predict an extended range of head orientations. An extensive evaluation\non public datasets demonstrates that our method significantly outperforms other\nstate-of-the-art methods in an efficient and robust manner, while its advanced\nprediction range allows the expansion of the application area. We open-source\nour training and testing code along with our trained models:\nhttps://github.com/thohemp/6DRepNet360.\n","authors":["Thorsten Hempel","Ahmed A. Abdelrahman","Ayoub Al-Hamadi"],"pdf_url":"https://arxiv.org/pdf/2309.07654v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.07640v1","updated":"2023-09-14T12:05:29Z","published":"2023-09-14T12:05:29Z","title":"Indoor Scene Reconstruction with Fine-Grained Details Using Hybrid\n Representation and Normal Prior Enhancement","summary":" The reconstruction of indoor scenes from multi-view RGB images is challenging\ndue to the coexistence of flat and texture-less regions alongside delicate and\nfine-grained regions. Recent methods leverage neural radiance fields aided by\npredicted surface normal priors to recover the scene geometry. These methods\nexcel in producing complete and smooth results for floor and wall areas.\nHowever, they struggle to capture complex surfaces with high-frequency\nstructures due to the inadequate neural representation and the inaccurately\npredicted normal priors. To improve the capacity of the implicit\nrepresentation, we propose a hybrid architecture to represent low-frequency and\nhigh-frequency regions separately. To enhance the normal priors, we introduce a\nsimple yet effective image sharpening and denoising technique, coupled with a\nnetwork that estimates the pixel-wise uncertainty of the predicted surface\nnormal vectors. 
Identifying such uncertainty can prevent our model from being\nmisled by unreliable surface normal supervisions that hinder the accurate\nreconstruction of intricate geometries. Experiments on the benchmark datasets\nshow that our method significantly outperforms existing methods in terms of\nreconstruction quality.\n","authors":["Sheng Ye","Yubin Hu","Matthieu Lin","Yu-Hui Wen","Wang Zhao","Wenping Wang","Yong-Jin Liu"],"pdf_url":"https://arxiv.org/pdf/2309.07640v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.07623v1","updated":"2023-09-14T11:38:23Z","published":"2023-09-14T11:38:23Z","title":"SwitchGPT: Adapting Large Language Models for Non-Text Outputs","summary":" Large Language Models (LLMs), primarily trained on text-based datasets,\nexhibit exceptional proficiencies in understanding and executing complex\nlinguistic instructions via text outputs. However, they falter when asked to\ngenerate non-text outputs. Concurrently, modality conversion models, such as\ntext-to-image, despite generating high-quality images, suffer from a lack of\nextensive textual pretraining. As a result, these models are only capable of\naccommodating specific image descriptions rather than comprehending more\ncomplex instructions. To bridge this gap, we propose a novel approach,\nSwitchGPT, from a modality conversion perspective that evolves a text-based\nLLM into a multi-modal one. We specifically employ a minimal dataset to\ninstruct LLMs to recognize the intended output modality as directed by the\ninstructions. Consequently, the adapted LLM can effectively summon various\noff-the-shelf modality conversion models from the model zoos to generate\nnon-text responses. This circumvents the necessity for complicated pretraining\nthat typically requires immense quantities of paired multi-modal data, while\nsimultaneously inheriting the extensive knowledge of LLMs and the ability of\nhigh-quality generative models. To evaluate and compare the adapted multi-modal\nLLM with its traditional counterparts, we have constructed a multi-modal\ninstruction benchmark that solicits diverse modality outputs. The experiment\nresults reveal that, with minimal training, LLMs can be conveniently adapted to\ncomprehend requests for non-text responses, thus achieving higher flexibility\nin multi-modal scenarios. Code and data will be made available at\nhttps://github.com/xinke-wang/SwitchGPT.\n","authors":["Xinyu Wang","Bohan Zhuang","Qi Wu"],"pdf_url":"https://arxiv.org/pdf/2309.07623v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.07616v1","updated":"2023-09-14T11:25:19Z","published":"2023-09-14T11:25:19Z","title":"Road Disease Detection based on Latent Domain Background Feature\n Separation and Suppression","summary":" Road disease detection is challenging due to the small proportion of road\ndamage in the target region and the diverse background, which introduces lots of\ndomain information. Besides, disease categories have high similarity, which makes the\ndetection more difficult. In this paper, we propose a new LDBFSS (Latent Domain\nBackground Feature Separation and Suppression) network, which can perform\nbackground information separation and suppression without domain supervision,\ntogether with contrastive enhancement of object features. We combine our LDBFSS network\nwith the YOLOv5 model to enhance disease features for better road disease\ndetection. 
As the components of the LDBFSS network, we first design a latent domain\ndiscovery module and a domain adversarial learning module to obtain pseudo\ndomain labels through an unsupervised method, guiding the domain discriminator and\nthe model to train adversarially to suppress background information. In addition,\nwe introduce a contrastive learning module and design a k-instance contrastive\nloss, optimizing the disease feature representation by increasing the inter-class\ndistance and reducing the intra-class distance for object features. We\nconducted experiments on two road disease detection datasets, GRDDC and CNRDD,\nand compared with other models, which shows an increase of nearly 4% on the GRDDC\ndataset compared with the optimal model, and an increase of 4.6% on the CNRDD dataset.\nExperimental results prove the effectiveness and superiority of our model.\n","authors":["Juwu Zheng","Jiangtao Ren"],"pdf_url":"https://arxiv.org/pdf/2309.07616v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.07609v1","updated":"2023-09-14T11:17:43Z","published":"2023-09-14T11:17:43Z","title":"Learning Quasi-Static 3D Models of Markerless Deformable Linear Objects\n for Bimanual Robotic Manipulation","summary":" The robotic manipulation of Deformable Linear Objects (DLOs) is a vital and\nchallenging task that is important in many practical applications. Classical\nmodel-based approaches to this problem require an accurate model to capture how\nrobot motions affect the deformation of the DLO. Nowadays, data-driven models\noffer the best tradeoff between quality and computation time. This paper\nanalyzes several learning-based 3D models of the DLO and proposes a new one\nbased on the Transformer architecture that achieves superior accuracy, even on\nDLOs of different lengths, thanks to the proposed scaling method. Moreover,\nwe introduce a data augmentation technique, which improves the prediction\nperformance of almost all considered DLO data-driven models. Thanks to this\ntechnique, even a simple Multilayer Perceptron (MLP) achieves close to\nstate-of-the-art performance while being significantly faster to evaluate. In\nthe experiments, we compare the performance of the learning-based 3D models of\nthe DLO on several challenging datasets quantitatively and demonstrate their\napplicability in the task of shaping a DLO.\n","authors":["Piotr Kicki","Michał Bidziński","Krzysztof Walas"],"pdf_url":"https://arxiv.org/pdf/2309.07609v1.pdf","comment":"Under review for IEEE Robotics and Automation Letters"},{"id":"http://arxiv.org/abs/2303.13867v2","updated":"2023-09-14T10:51:21Z","published":"2023-03-24T09:10:14Z","title":"Few Shot Medical Image Segmentation with Cross Attention Transformer","summary":" Medical image segmentation has made significant progress in recent years.\nDeep learning-based methods are recognized as data-hungry techniques, requiring\nlarge amounts of data with manual annotations. However, manual annotation is\nexpensive in the field of medical image analysis, which requires\ndomain-specific expertise. To address this challenge, few-shot learning has the\npotential to learn new classes from only a few examples. In this work, we\npropose a novel framework for few-shot medical image segmentation, termed\nCAT-Net, based on a cross masked attention Transformer. Our proposed network\nmines the correlations between the support image and query image, limiting them\nto focus only on useful foreground information and boosting the representation\ncapacity of both the support prototype and query features. 
We further design an\niterative refinement framework that refines the query image segmentation\niteratively and promotes the support feature in turn. We validated the proposed\nmethod on three public datasets: Abd-CT, Abd-MRI, and Card-MRI. Experimental\nresults demonstrate the superior performance of our method compared to\nstate-of-the-art methods and the effectiveness of each component. Code:\nhttps://github.com/hust-linyi/CAT-Net.\n","authors":["Yi Lin","Yufan Chen","Kwang-Ting Cheng","Hao Chen"],"pdf_url":"https://arxiv.org/pdf/2303.13867v2.pdf","comment":"Accepted by MICCAI 2023"},{"id":"http://arxiv.org/abs/2304.07097v2","updated":"2023-09-14T10:43:14Z","published":"2023-04-14T12:36:43Z","title":"Interpretable Weighted Siamese Network to Predict the Time to Onset of\n Alzheimer's Disease from MRI Images","summary":" Alzheimer's Disease (AD) is a progressive disease preceded by Mild Cognitive\nImpairment (MCI). Early detection of AD is crucial for making treatment\ndecisions. However, most of the literature on computer-assisted detection of AD\nfocuses on classifying brain images into one of three major categories:\nhealthy, MCI, and AD; or categorizing MCI patients into (1) progressive: those\nwho progress from MCI to AD at a future examination time, and (2) stable: those\nwho stay as MCI and never progress to AD. This misses the opportunity to\naccurately identify the trajectory of progressive MCI patients. In this paper,\nwe revisit the brain image classification task for AD identification and\nre-frame it as an ordinal classification task to predict how close a patient is\nto the severe AD stage. To this end, we select progressive MCI patients from\nthe Alzheimer's Disease Neuroimaging Initiative (ADNI) dataset and construct an\nordinal dataset with a prediction target that indicates the time to progression\nto AD. We train a Siamese network model to predict the time to onset of AD\nbased on MRI brain images. We also propose a Weighted variety of Siamese\nnetwork and compare its performance to a baseline model. Our evaluations show\nthat incorporating a weighting factor to Siamese networks brings considerable\nperformance gain at predicting how close input brain MRI images are to\nprogressing to AD. Moreover, we complement our results with an interpretation\nof the learned embedding space of the Siamese networks using a model\nexplainability technique.\n","authors":["Misgina Tsighe Hagos","Niamh Belton","Ronan P. Killeen","Kathleen M. Curran","Brian Mac Namee"],"pdf_url":"https://arxiv.org/pdf/2304.07097v2.pdf","comment":"Accepted at the Specialist Group on Artificial Intelligence, SGAI\n 2023, conference"},{"id":"http://arxiv.org/abs/2309.05406v3","updated":"2023-09-14T10:22:07Z","published":"2023-09-11T12:12:52Z","title":"Treatment-aware Diffusion Probabilistic Model for Longitudinal MRI\n Generation and Diffuse Glioma Growth Prediction","summary":" Diffuse gliomas are malignant brain tumors that grow widespread through the\nbrain. The complex interactions between neoplastic cells and normal tissue, as\nwell as the treatment-induced changes often encountered, make glioma tumor\ngrowth modeling challenging. In this paper, we present a novel end-to-end\nnetwork capable of generating future tumor masks and realistic MRIs of how the\ntumor will look at any future time points for different treatment plans. Our\napproach is based on cutting-edge diffusion probabilistic models and\ndeep-segmentation neural networks. 
We included sequential multi-parametric\nmagnetic resonance images (MRI) and treatment information as conditioning\ninputs to guide the generative diffusion process. This allows for tumor growth\nestimates at any given time point. We trained the model using real-world\npostoperative longitudinal MRI data with glioma tumor growth trajectories\nrepresented as tumor segmentation maps over time. The model has demonstrated\npromising performance across a range of tasks, including the generation of\nhigh-quality synthetic MRIs with tumor masks, time-series tumor segmentations,\nand uncertainty estimates. Combined with the treatment-aware generated MRIs,\nthe tumor growth predictions with uncertainty estimates can provide useful\ninformation for clinical decision-making.\n","authors":["Qinghui Liu","Elies Fuster-Garcia","Ivar Thokle Hovden","Donatas Sederevicius","Karoline Skogen","Bradley J MacIntosh","Edvard Grødem","Till Schellhorn","Petter Brandal","Atle Bjørnerud","Kyrre Eeg Emblem"],"pdf_url":"https://arxiv.org/pdf/2309.05406v3.pdf","comment":"13 pages, 10 figures, 2 tables, 2 agls, preprints in the IEEE trans.\n format for submission to IEEE-TMI"},{"id":"http://arxiv.org/abs/2309.06902v2","updated":"2023-09-14T09:58:54Z","published":"2023-09-13T12:00:33Z","title":"CCSPNet-Joint: Efficient Joint Training Method for Traffic Sign\n Detection Under Extreme Conditions","summary":" Traffic sign detection is an important research direction in intelligent\ndriving. Unfortunately, existing methods often overlook extreme conditions such\nas fog, rain, and motion blur. Moreover, the end-to-end training strategy for\nimage denoising and object detection models fails to utilize inter-model\ninformation effectively. To address these issues, we propose CCSPNet, an\nefficient feature extraction module based on Transformers and CNNs, which\neffectively leverages contextual information, achieves faster inference speed\nand provides stronger feature enhancement capabilities. Furthermore, we\nestablish the correlation between object detection and image denoising tasks\nand propose a joint training model, CCSPNet-Joint, to improve data efficiency\nand generalization. Finally, to validate our approach, we create the CCTSDB-AUG\ndataset for traffic sign detection in extreme scenarios. Extensive experiments\nhave shown that CCSPNet achieves state-of-the-art performance in traffic sign\ndetection under extreme conditions. Compared to end-to-end methods,\nCCSPNet-Joint achieves a 5.32% improvement in precision and an 18.09%\nimprovement in mAP@.5.\n","authors":["Haoqin Hong","Yue Zhou","Xiangyu Shu","Xiangfang Hu"],"pdf_url":"https://arxiv.org/pdf/2309.06902v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.06526v2","updated":"2023-09-14T09:23:18Z","published":"2023-07-13T02:19:56Z","title":"AvatarFusion: Zero-shot Generation of Clothing-Decoupled 3D Avatars\n Using 2D Diffusion","summary":" Large-scale pre-trained vision-language models allow for the zero-shot\ntext-based generation of 3D avatars. The previous state-of-the-art method\nutilized CLIP to supervise neural implicit models that reconstructed a human\nbody mesh. However, this approach has two limitations. Firstly, the lack of\navatar-specific models can cause facial distortion and unrealistic clothing in\nthe generated avatars. Secondly, CLIP only provides optimization direction for\nthe overall appearance, resulting in less impressive results. 
To address these\nlimitations, we propose AvatarFusion, the first framework to use a latent\ndiffusion model to provide pixel-level guidance for generating human-realistic\navatars while simultaneously segmenting clothing from the avatar's body.\nAvatarFusion includes the first clothing-decoupled neural implicit avatar model\nthat employs a novel Dual Volume Rendering strategy to render the decoupled\nskin and clothing sub-models in one space. We also introduce a novel\noptimization method, called Pixel-Semantics Difference-Sampling (PS-DS), which\nsemantically separates the generation of body and clothes, and generates a\nvariety of clothing styles. Moreover, we establish the first benchmark for\nzero-shot text-to-avatar generation. Our experimental results demonstrate that\nour framework outperforms previous approaches, with significant improvements\nobserved in all metrics. Additionally, since our model is clothing-decoupled,\nwe can exchange the clothes of avatars. Code are available on our project page\nhttps://hansenhuang0823.github.io/AvatarFusion.\n","authors":["Shuo Huang","Zongxin Yang","Liangting Li","Yi Yang","Jia Jia"],"pdf_url":"https://arxiv.org/pdf/2307.06526v2.pdf","comment":"Accepted by ACM Multimedia 2023"},{"id":"http://arxiv.org/abs/2309.07537v1","updated":"2023-09-14T09:03:57Z","published":"2023-09-14T09:03:57Z","title":"Universality of underlying mechanism for successful deep learning","summary":" An underlying mechanism for successful deep learning (DL) with a limited deep\narchitecture and dataset, namely VGG-16 on CIFAR-10, was recently presented\nbased on a quantitative method to measure the quality of a single filter in\neach layer. In this method, each filter identifies small clusters of possible\noutput labels, with additional noise selected as labels out of the clusters.\nThis feature is progressively sharpened with the layers, resulting in an\nenhanced signal-to-noise ratio (SNR) and higher accuracy. In this study, the\nsuggested universal mechanism is verified for VGG-16 and EfficientNet-B0\ntrained on the CIFAR-100 and ImageNet datasets with the following main results.\nFirst, the accuracy progressively increases with the layers, whereas the noise\nper filter typically progressively decreases. Second, for a given deep\narchitecture, the maximal error rate increases approximately linearly with the\nnumber of output labels. Third, the average filter cluster size and the number\nof clusters per filter at the last convolutional layer adjacent to the output\nlayer are almost independent of the number of dataset labels in the range [3,\n1,000], while a high SNR is preserved. The presented DL mechanism suggests\nseveral techniques, such as applying filter's cluster connections (AFCC), to\nimprove the computational complexity and accuracy of deep architectures and\nfurthermore pinpoints the simplification of pre-existing structures while\nmaintaining their accuracies.\n","authors":["Yuval Meir","Yarden Tzach","Shiri Hodassman","Ofek Tevet","Ido Kanter"],"pdf_url":"https://arxiv.org/pdf/2309.07537v1.pdf","comment":"27 pages,5 figures, 6 tables. arXiv admin note: text overlap with\n arXiv:2305.18078"},{"id":"http://arxiv.org/abs/2303.16617v2","updated":"2023-09-14T09:02:48Z","published":"2023-03-29T12:05:19Z","title":"NeFII: Inverse Rendering for Reflectance Decomposition with Near-Field\n Indirect Illumination","summary":" Inverse rendering methods aim to estimate geometry, materials and\nillumination from multi-view RGB images. 
In order to achieve better\ndecomposition, recent approaches attempt to model indirect illuminations\nreflected from different materials via Spherical Gaussians (SG), which,\nhowever, tends to blur the high-frequency reflection details. In this paper, we\npropose an end-to-end inverse rendering pipeline that decomposes materials and\nillumination from multi-view images, while considering near-field indirect\nillumination. In a nutshell, we introduce the Monte Carlo sampling based path\ntracing and cache the indirect illumination as neural radiance, enabling a\nphysics-faithful and easy-to-optimize inverse rendering method. To enhance\nefficiency and practicality, we leverage SG to represent the smooth environment\nilluminations and apply importance sampling techniques. To supervise indirect\nilluminations from unobserved directions, we develop a novel radiance\nconsistency constraint between implicit neural radiance and path tracing\nresults of unobserved rays along with the joint optimization of materials and\nilluminations, thus significantly improving the decomposition performance.\nExtensive experiments demonstrate that our method outperforms the\nstate-of-the-art on multiple synthetic and real datasets, especially in terms\nof inter-reflection decomposition.Our code and data are available at\nhttps://woolseyyy.github.io/nefii/.\n","authors":["Haoqian Wu","Zhipeng Hu","Lincheng Li","Yongqiang Zhang","Changjie Fan","Xin Yu"],"pdf_url":"https://arxiv.org/pdf/2303.16617v2.pdf","comment":"Accepted in CVPR 2023"},{"id":"http://arxiv.org/abs/2309.00655v2","updated":"2023-09-14T08:50:06Z","published":"2023-09-01T09:11:20Z","title":"RigNet++: Efficient Repetitive Image Guided Network for Depth Completion","summary":" Depth completion aims to recover dense depth maps from sparse ones, where\ncolor images are often used to facilitate this task. Recent depth methods\nprimarily focus on image guided learning frameworks. However, blurry guidance\nin the image and unclear structure in the depth still impede their performance.\nTo tackle these challenges, we explore an efficient repetitive design in our\nimage guided network to gradually and sufficiently recover depth values.\nSpecifically, the efficient repetition is embodied in both the image guidance\nbranch and depth generation branch. In the former branch, we design a dense\nrepetitive hourglass network to extract discriminative image features of\ncomplex environments, which can provide powerful contextual instruction for\ndepth prediction. In the latter branch, we introduce a repetitive guidance\nmodule based on dynamic convolution, in which an efficient convolution\nfactorization is proposed to reduce the complexity while modeling\nhigh-frequency structures progressively. Extensive experiments indicate that\nour approach achieves superior or competitive results on KITTI, VKITTI, NYUv2,\n3D60, and Matterport3D datasets.\n","authors":["Zhiqiang Yan","Xiang Li","Zhenyu Zhang","Jun Li","Jian Yang"],"pdf_url":"https://arxiv.org/pdf/2309.00655v2.pdf","comment":"15 pages. arXiv admin note: text overlap with arXiv:2107.13802"},{"id":"http://arxiv.org/abs/2309.07524v1","updated":"2023-09-14T08:48:44Z","published":"2023-09-14T08:48:44Z","title":"A Multi-scale Generalized Shrinkage Threshold Network for Image Blind\n Deblurring in Remote Sensing","summary":" Remote sensing images are essential for many earth science applications, but\ntheir quality can be degraded due to limitations in sensor technology and\ncomplex imaging environments. 
To address this, various remote sensing image\ndeblurring methods have been developed to restore sharp, high-quality images\nfrom degraded observational data. However, most traditional model-based\ndeblurring methods usually require predefined hand-crafted prior assumptions,\nwhich are difficult to design for complex applications, and most deep\nlearning-based deblurring methods are designed as a black box, lacking\ntransparency and interpretability. In this work, we propose a novel blind\ndeblurring learning framework based on alternating iterations of shrinkage\nthresholds, alternately updating blurring kernels and images, with the\ntheoretical foundation of network design. Additionally, we propose a learnable\nblur kernel proximal mapping module to improve the blur kernel evaluation in\nthe kernel domain. Then, we propose a deep proximal mapping module in the\nimage domain, which combines a generalized shrinkage threshold operator and a\nmulti-scale prior feature extraction block. This module also introduces an\nattention mechanism to adaptively adjust the prior importance, thus avoiding\nthe drawbacks of hand-crafted image prior terms. Thus, a novel multi-scale\ngeneralized shrinkage threshold network (MGSTNet) is designed to specifically\nfocus on learning deep geometric prior features to enhance image restoration.\nExperiments demonstrate the superiority of our MGSTNet framework on remote\nsensing image datasets compared to existing deblurring methods.\n","authors":["Yujie Feng","Yin Yang","Xiaohong Fan","Zhengpeng Zhang","Jianping Zhang"],"pdf_url":"https://arxiv.org/pdf/2309.07524v1.pdf","comment":"12 pages,"},{"id":"http://arxiv.org/abs/2309.07515v1","updated":"2023-09-14T08:32:05Z","published":"2023-09-14T08:32:05Z","title":"Dhan-Shomadhan: A Dataset of Rice Leaf Disease Classification for\n Bangladeshi Local Rice","summary":" This dataset represents almost all the harmful diseases for rice in\nBangladesh. It consists of 1106 images of five harmful diseases, namely\nBrown Spot, Leaf Scald, Rice Blast, Rice Tungro, and Sheath Blight, in two\ndifferent background variations: field background pictures and white\nbackground pictures. The two background variations help the dataset\nperform more accurately, so that users can apply this data in the field as\nwell as use the white background for decision making. The data is collected from rice\nfields of Dhaka Division. This dataset can be used for rice leaf disease\nclassification and disease detection using Computer Vision and Pattern\nRecognition for different rice leaf diseases.\n","authors":["Md. Fahad Hossain"],"pdf_url":"https://arxiv.org/pdf/2309.07515v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.07513v1","updated":"2023-09-14T08:30:02Z","published":"2023-09-14T08:30:02Z","title":"RecycleNet: Latent Feature Recycling Leads to Iterative Decision\n Refinement","summary":" Despite the remarkable success of deep learning systems over the last decade,\na key difference still remains between neural network and human\ndecision-making: As humans, we can not only form a decision on the spot, but\nalso ponder, revisiting an initial guess from different angles, distilling\nrelevant information, arriving at a better decision. Here, we propose\nRecycleNet, a latent feature recycling method, instilling the pondering\ncapability for neural networks to refine initial decisions over a number of\nrecycling steps, where outputs are fed back into earlier network layers in an\niterative fashion.
This approach makes minimal assumptions about the neural\nnetwork architecture and thus can be implemented in a wide variety of contexts.\nUsing medical image segmentation as the evaluation environment, we show that\nlatent feature recycling enables the network to iteratively refine initial\npredictions even beyond the iterations seen during training, converging towards\nan improved decision. We evaluate this across a variety of segmentation\nbenchmarks and show consistent improvements even compared with top-performing\nsegmentation methods. This allows trading increased computation time for\nimproved performance, which can be beneficial, especially for safety-critical\napplications.\n","authors":["Gregor Koehler","Tassilo Wald","Constantin Ulrich","David Zimmerer","Paul F. Jaeger","Jörg K. H. Franke","Simon Kohl","Fabian Isensee","Klaus H. Maier-Hein"],"pdf_url":"https://arxiv.org/pdf/2309.07513v1.pdf","comment":"Accepted at 2024 Winter Conference on Applications of Computer Vision\n (WACV)"},{"id":"http://arxiv.org/abs/2309.07510v1","updated":"2023-09-14T08:24:32Z","published":"2023-09-14T08:24:32Z","title":"Learning Environment-Aware Affordance for 3D Articulated Object\n Manipulation under Occlusions","summary":" Perceiving and manipulating 3D articulated objects in diverse environments is\nessential for home-assistant robots. Recent studies have shown that point-level\naffordance provides actionable priors for downstream manipulation tasks.\nHowever, existing works primarily focus on single-object scenarios with\nhomogeneous agents, overlooking the realistic constraints imposed by the\nenvironment and the agent's morphology, e.g., occlusions and physical\nlimitations. In this paper, we propose an environment-aware affordance\nframework that incorporates both object-level actionable priors and environment\nconstraints. Unlike object-centric affordance approaches, learning\nenvironment-aware affordance faces the challenge of combinatorial explosion due\nto the complexity of various occlusions, characterized by their quantities,\ngeometries, positions and poses. To address this and enhance data efficiency,\nwe introduce a novel contrastive affordance learning framework capable of\ntraining on scenes containing a single occluder and generalizing to scenes with\ncomplex occluder combinations. Experiments demonstrate the effectiveness of our\nproposed approach in learning affordance considering environment constraints.\n","authors":["Kai Cheng","Ruihai Wu","Yan Shen","Chuanruo Ning","Guanqi Zhan","Hao Dong"],"pdf_url":"https://arxiv.org/pdf/2309.07510v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.07509v1","updated":"2023-09-14T08:22:34Z","published":"2023-09-14T08:22:34Z","title":"DiffTalker: Co-driven audio-image diffusion for talking faces via\n intermediate landmarks","summary":" Generating realistic talking faces is a complex and widely discussed task\nwith numerous applications. In this paper, we present DiffTalker, a novel model\ndesigned to generate lifelike talking faces through audio and landmark\nco-driving. DiffTalker addresses the challenges associated with directly\napplying diffusion models to audio control, which are traditionally trained on\ntext-image pairs. DiffTalker consists of two agent networks: a\ntransformer-based landmarks completion network for geometric accuracy and a\ndiffusion-based face generation network for texture details. 
Landmarks play a\npivotal role in establishing a seamless connection between the audio and image\ndomains, facilitating the incorporation of knowledge from pre-trained diffusion\nmodels. This innovative approach efficiently produces articulate-speaking\nfaces. Experimental results showcase DiffTalker's superior performance in\nproducing clear and geometrically accurate talking faces, all without the need\nfor additional alignment between audio and image features.\n","authors":["Zipeng Qi","Xulong Zhang","Ning Cheng","Jing Xiao","Jianzong Wang"],"pdf_url":"https://arxiv.org/pdf/2309.07509v1.pdf","comment":"submmit to ICASSP 2024"},{"id":"http://arxiv.org/abs/2309.07499v1","updated":"2023-09-14T08:07:49Z","published":"2023-09-14T08:07:49Z","title":"Efficiently Robustify Pre-trained Models","summary":" A recent trend in deep learning algorithms has been towards training large\nscale models, having high parameter count and trained on big dataset. However,\nrobustness of such large scale models towards real-world settings is still a\nless-explored topic. In this work, we first benchmark the performance of these\nmodels under different perturbations and datasets thereby representing\nreal-world shifts, and highlight their degrading performance under these\nshifts. We then discuss on how complete model fine-tuning based existing\nrobustification schemes might not be a scalable option given very large scale\nnetworks and can also lead them to forget some of the desired characterstics.\nFinally, we propose a simple and cost-effective method to solve this problem,\ninspired by knowledge transfer literature. It involves robustifying smaller\nmodels, at a lower computation cost, and then use them as teachers to tune a\nfraction of these large scale networks, reducing the overall computational\noverhead. We evaluate our proposed method under various vision perturbations\nincluding ImageNet-C,R,S,A datasets and also for transfer learning, zero-shot\nevaluation setups on different datasets. Benchmark results show that our method\nis able to induce robustness to these large scale models efficiently, requiring\nsignificantly lower time and also preserves the transfer learning, zero-shot\nproperties of the original model which none of the existing methods are able to\nachieve.\n","authors":["Nishant Jain","Harkirat Behl","Yogesh Singh Rawat","Vibhav Vineet"],"pdf_url":"https://arxiv.org/pdf/2309.07499v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.07495v1","updated":"2023-09-14T07:58:31Z","published":"2023-09-14T07:58:31Z","title":"HDTR-Net: A Real-Time High-Definition Teeth Restoration Network for\n Arbitrary Talking Face Generation Methods","summary":" Talking Face Generation (TFG) aims to reconstruct facial movements to achieve\nhigh natural lip movements from audio and facial features that are under\npotential connections. Existing TFG methods have made significant advancements\nto produce natural and realistic images. However, most work rarely takes visual\nquality into consideration. It is challenging to ensure lip synchronization\nwhile avoiding visual quality degradation in cross-modal generation methods. To\naddress this issue, we propose a universal High-Definition Teeth Restoration\nNetwork, dubbed HDTR-Net, for arbitrary TFG methods. HDTR-Net can enhance teeth\nregions at an extremely fast speed while maintaining synchronization, and\ntemporal consistency. 
In particular, we propose a Fine-Grained Feature Fusion\n(FGFF) module to effectively capture fine texture feature information around\nteeth and surrounding regions, and use these features to fine-grain the feature\nmap to enhance the clarity of teeth. Extensive experiments show that our method\ncan be adapted to arbitrary TFG methods without suffering from lip\nsynchronization and frame coherence. Another advantage of HDTR-Net is its\nreal-time generation ability. Also under the condition of high-definition\nrestoration of talking face video synthesis, its inference speed is $300\\%$\nfaster than the current state-of-the-art face restoration based on\nsuper-resolution.\n","authors":["Yongyuan Li","Xiuyuan Qin","Chao Liang","Mingqiang Wei"],"pdf_url":"https://arxiv.org/pdf/2309.07495v1.pdf","comment":"15pages, 6 figures, PRCV2023"},{"id":"http://arxiv.org/abs/2305.07180v2","updated":"2023-09-14T07:29:01Z","published":"2023-05-12T00:13:17Z","title":"Robust Saliency-Aware Distillation for Few-shot Fine-grained Visual\n Recognition","summary":" Recognizing novel sub-categories with scarce samples is an essential and\nchallenging research topic in computer vision. Existing literature addresses\nthis challenge by employing local-based representation approaches, which may\nnot sufficiently facilitate meaningful object-specific semantic understanding,\nleading to a reliance on apparent background correlations. Moreover, they\nprimarily rely on high-dimensional local descriptors to construct complex\nembedding space, potentially limiting the generalization. To address the above\nchallenges, this article proposes a novel model called RSaG for few-shot\nfine-grained visual recognition. RSaG introduces additional saliency-aware\nsupervision via saliency detection to guide the model toward focusing on the\nintrinsic discriminative regions. Specifically, RSaG utilizes the saliency\ndetection model to emphasize the critical regions of each sub-category,\nproviding additional object-specific information for fine-grained prediction.\nRSaG transfers such information with two symmetric branches in a mutual\nlearning paradigm. Furthermore, RSaG exploits inter-regional relationships to\nenhance the informativeness of the representation and subsequently summarize\nthe highlighted details into contextual embeddings to facilitate the effective\ntransfer, enabling quick generalization to novel sub-categories. The proposed\napproach is empirically evaluated on three widely used benchmarks,\ndemonstrating its superior performance.\n","authors":["Haiqi Liu","C. L. Philip Chen","Xinrong Gong","Tong Zhang"],"pdf_url":"https://arxiv.org/pdf/2305.07180v2.pdf","comment":"Under Review"},{"id":"http://arxiv.org/abs/2309.06745v2","updated":"2023-09-14T07:13:24Z","published":"2023-09-13T06:31:35Z","title":"VEATIC: Video-based Emotion and Affect Tracking in Context Dataset","summary":" Human affect recognition has been a significant topic in psychophysics and\ncomputer vision. However, the currently published datasets have many\nlimitations. For example, most datasets contain frames that contain only\ninformation about facial expressions. Due to the limitations of previous\ndatasets, it is very hard to either understand the mechanisms for affect\nrecognition of humans or generalize well on common cases for computer vision\nmodels trained on those datasets. 
In this work, we introduce a brand new large\ndataset, the Video-based Emotion and Affect Tracking in Context Dataset\n(VEATIC), that can conquer the limitations of the previous datasets. VEATIC has\n124 video clips from Hollywood movies, documentaries, and home videos with\ncontinuous valence and arousal ratings of each frame via real-time annotation.\nAlong with the dataset, we propose a new computer vision task to infer the\naffect of the selected character via both context and character information in\neach video frame. Additionally, we propose a simple model to benchmark this new\ncomputer vision task. We also compare the performance of the pretrained model\nusing our dataset with other similar datasets. Experiments show the competing\nresults of our pretrained model via VEATIC, indicating the generalizability of\nVEATIC. Our dataset is available at https://veatic.github.io.\n","authors":["Zhihang Ren","Jefferson Ortega","Yifan Wang","Zhimin Chen","Yunhui Guo","Stella X. Yu","David Whitney"],"pdf_url":"https://arxiv.org/pdf/2309.06745v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.07471v1","updated":"2023-09-14T07:06:36Z","published":"2023-09-14T07:06:36Z","title":"EP2P-Loc: End-to-End 3D Point to 2D Pixel Localization for Large-Scale\n Visual Localization","summary":" Visual localization is the task of estimating a 6-DoF camera pose of a query\nimage within a provided 3D reference map. Thanks to recent advances in various\n3D sensors, 3D point clouds are becoming a more accurate and affordable option\nfor building the reference map, but research to match the points of 3D point\nclouds with pixels in 2D images for visual localization remains challenging.\nExisting approaches that jointly learn 2D-3D feature matching suffer from low\ninliers due to representational differences between the two modalities, and the\nmethods that bypass this problem into classification have an issue of poor\nrefinement. In this work, we propose EP2P-Loc, a novel large-scale visual\nlocalization method that mitigates such appearance discrepancy and enables\nend-to-end training for pose estimation. To increase the number of inliers, we\npropose a simple algorithm to remove invisible 3D points in the image, and find\nall 2D-3D correspondences without keypoint detection. To reduce memory usage\nand search complexity, we take a coarse-to-fine approach where we extract\npatch-level features from 2D images, then perform 2D patch classification on\neach 3D point, and obtain the exact corresponding 2D pixel coordinates through\npositional encoding. Finally, for the first time in this task, we employ a\ndifferentiable PnP for end-to-end training. 
In the experiments on newly curated\nlarge-scale indoor and outdoor benchmarks based on 2D-3D-S and KITTI, we show\nthat our method achieves the state-of-the-art performance compared to existing\nvisual localization and image-to-point cloud registration methods.\n","authors":["Minjung Kim","Junseo Koo","Gunhee Kim"],"pdf_url":"https://arxiv.org/pdf/2309.07471v1.pdf","comment":"Accepted to ICCV 2023"},{"id":"http://arxiv.org/abs/2309.07461v1","updated":"2023-09-14T06:41:45Z","published":"2023-09-14T06:41:45Z","title":"Detecting Unknown Attacks in IoT Environments: An Open Set Classifier\n for Enhanced Network Intrusion Detection","summary":" The widespread integration of Internet of Things (IoT) devices across all\nfacets of life has ushered in an era of interconnectedness, creating new\navenues for cybersecurity challenges and underscoring the need for robust\nintrusion detection systems. However, traditional security systems are designed\nwith a closed-world perspective and often face challenges in dealing with the\never-evolving threat landscape, where new and unfamiliar attacks are constantly\nemerging. In this paper, we introduce a framework aimed at mitigating the open\nset recognition (OSR) problem in the realm of Network Intrusion Detection\nSystems (NIDS) tailored for IoT environments. Our framework capitalizes on\nimage-based representations of packet-level data, extracting spatial and\ntemporal patterns from network traffic. Additionally, we integrate stacking and\nsub-clustering techniques, enabling the identification of unknown attacks by\neffectively modeling the complex and diverse nature of benign behavior. The\nempirical results prominently underscore the framework's efficacy, boasting an\nimpressive 88\\% detection rate for previously unseen attacks when compared\nagainst existing approaches and recent advancements. Future work will perform\nextensive experimentation across various openness levels and attack scenarios,\nfurther strengthening the adaptability and performance of our proposed solution\nin safeguarding IoT environments.\n","authors":["Yasir Ali Farrukh","Syed Wali","Irfan Khan","Nathaniel D. Bastian"],"pdf_url":"https://arxiv.org/pdf/2309.07461v1.pdf","comment":"6 Pages, 5 figures"},{"id":"http://arxiv.org/abs/2309.06987v2","updated":"2023-09-14T06:40:13Z","published":"2023-09-13T14:26:03Z","title":"Instance Adaptive Prototypical Contrastive Embedding for Generalized\n Zero Shot Learning","summary":" Generalized zero-shot learning(GZSL) aims to classify samples from seen and\nunseen labels, assuming unseen labels are not accessible during training.\nRecent advancements in GZSL have been expedited by incorporating\ncontrastive-learning-based (instance-based) embedding in generative networks\nand leveraging the semantic relationship between data points. However, existing\nembedding architectures suffer from two limitations: (1) limited\ndiscriminability of synthetic features' embedding without considering\nfine-grained cluster structures; (2) inflexible optimization due to restricted\nscaling mechanisms on existing contrastive embedding networks, leading to\noverlapped representations in the embedding space. 
To enhance the quality of\nrepresentations in the embedding space, as mentioned in (1), we propose a\nmargin-based prototypical contrastive learning embedding network that reaps the\nbenefits of prototype-data (cluster quality enhancement) and implicit data-data\n(fine-grained representations) interaction while providing substantial cluster\nsupervision to the embedding network and the generator. To tackle (2), we\npropose an instance adaptive contrastive loss that leads to generalized\nrepresentations for unseen labels with increased inter-class margin. Through\ncomprehensive experimental evaluation, we show that our method can outperform\nthe current state-of-the-art on three benchmark datasets. Our approach also\nconsistently achieves the best unseen performance in the GZSL setting.\n","authors":["Riti Paul","Sahil Vora","Baoxin Li"],"pdf_url":"https://arxiv.org/pdf/2309.06987v2.pdf","comment":"7 pages, 4 figures. Accepted in IJCAI 2023 Workshop on Generalizing\n from Limited Resources in the Open World"},{"id":"http://arxiv.org/abs/2309.07444v1","updated":"2023-09-14T05:54:54Z","published":"2023-09-14T05:54:54Z","title":"Research on self-cross transformer model of point cloud change detecter","summary":" With the vigorous development of the urban construction industry, engineering\ndeformation or changes often occur during the construction process. To combat\nthis phenomenon, it is necessary to detect changes in order to find\nconstruction defects in time, ensure the integrity of the project, reduce\nlabor costs, and avoid inconvenience and damage to the road. In the study\nof change detection in 3D point clouds, researchers have published various\nresearch methods on 3D point clouds. Most are based directly on\ntraditional threshold distance methods (C2C, M3C2, M3C2-EP), and some\nconvert 3D point clouds into DSMs, which loses a lot of original information.\nAlthough deep learning is used in remote sensing methods, for change\ndetection of 3D point clouds the data is mostly converted into two-dimensional\npatches, and neural networks are rarely applied directly; we prefer that the\nnetwork operates at the level of pixels or points. Therefore, in this\narticle, we build a network for 3D point cloud change detection and\npropose a new Cross transformer module suitable for change detection. We\nalso simulate tunneling data for change detection and conduct test\nexperiments with our network.\n","authors":["Xiaoxu Ren","Haili Sun","Zhenxin Zhang"],"pdf_url":"https://arxiv.org/pdf/2309.07444v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.07439v1","updated":"2023-09-14T05:45:40Z","published":"2023-09-14T05:45:40Z","title":"DePT: Decoupled Prompt Tuning","summary":" This work breaks through the Base-New Tradeoff (BNT) dilemma in prompt tuning,\ni.e., the better the tuned model generalizes to the base (or target) task, the\nworse it generalizes to new tasks, and vice versa. Specifically, through an\nin-depth analysis of the learned features of the base and new tasks, we observe\nthat the BNT stems from a channel bias issue, i.e., the vast majority of\nfeature channels are occupied by base-specific knowledge, resulting in the\ncollapse of task-shared knowledge important to new tasks.
To address this, we\npropose the Decoupled Prompt Tuning (DePT) framework, which decouples\nbase-specific knowledge from feature channels into an isolated feature space\nduring prompt tuning, so as to maximally preserve task-shared knowledge in the\noriginal feature space for achieving better zero-shot generalization on new\ntasks. Importantly, our DePT is orthogonal to existing prompt tuning methods,\nhence it can improve all of them. Extensive experiments on 11 datasets show the\nstrong flexibility and effectiveness of DePT. Our code and pretrained models\nare available at https://github.com/Koorye/DePT.\n","authors":["Ji Zhang","Shihan Wu","Lianli Gao","Hengtao Shen","Jingkuan Song"],"pdf_url":"https://arxiv.org/pdf/2309.07439v1.pdf","comment":"13 pages"},{"id":"http://arxiv.org/abs/2212.09597v7","updated":"2023-09-14T05:36:15Z","published":"2022-12-19T16:32:42Z","title":"Reasoning with Language Model Prompting: A Survey","summary":" Reasoning, as an essential ability for complex problem-solving, can provide\nback-end support for various real-world applications, such as medical\ndiagnosis, negotiation, etc. This paper provides a comprehensive survey of\ncutting-edge research on reasoning with language model prompting. We introduce\nresearch works with comparisons and summaries and provide systematic resources\nto help beginners. We also discuss the potential reasons for emerging such\nreasoning abilities and highlight future research directions. Resources are\navailable at https://github.com/zjunlp/Prompt4ReasoningPapers (updated\nperiodically).\n","authors":["Shuofei Qiao","Yixin Ou","Ningyu Zhang","Xiang Chen","Yunzhi Yao","Shumin Deng","Chuanqi Tan","Fei Huang","Huajun Chen"],"pdf_url":"https://arxiv.org/pdf/2212.09597v7.pdf","comment":"ACL 2023, 24 pages, add references of theoretical analysis"},{"id":"http://arxiv.org/abs/2309.00827v2","updated":"2023-09-14T05:33:44Z","published":"2023-09-02T05:05:40Z","title":"Few shot font generation via transferring similarity guided global style\n and quantization local style","summary":" Automatic few-shot font generation (AFFG), aiming at generating new fonts\nwith only a few glyph references, reduces the labor cost of manually designing\nfonts. However, the traditional AFFG paradigm of style-content disentanglement\ncannot capture the diverse local details of different fonts. So, many\ncomponent-based approaches are proposed to tackle this problem. The issue with\ncomponent-based approaches is that they usually require special pre-defined\nglyph components, e.g., strokes and radicals, which is infeasible for AFFG of\ndifferent languages. In this paper, we present a novel font generation approach\nby aggregating styles from character similarity-guided global features and\nstylized component-level representations. We calculate the similarity scores of\nthe target character and the referenced samples by measuring the distance along\nthe corresponding channels from the content features, and assigning them as the\nweights for aggregating the global style features. To better capture the local\nstyles, a cross-attention-based style transfer module is adopted to transfer\nthe styles of reference glyphs to the components, where the components are\nself-learned discrete latent codes through vector quantization without manual\ndefinition. With these designs, our AFFG method could obtain a complete set of\ncomponent-level style representations, and also control the global glyph\ncharacteristics. 
The experimental results reflect the effectiveness and\ngeneralization of the proposed method on different linguistic scripts, and also\nshow its superiority when compared with other state-of-the-art methods. The\nsource code can be found at https://github.com/awei669/VQ-Font.\n","authors":["Wei Pan","Anna Zhu","Xinyu Zhou","Brian Kenji Iwana","Shilin Li"],"pdf_url":"https://arxiv.org/pdf/2309.00827v2.pdf","comment":"Accepted by ICCV 2023"},{"id":"http://arxiv.org/abs/2309.07428v1","updated":"2023-09-14T04:58:06Z","published":"2023-09-14T04:58:06Z","title":"Physical Invisible Backdoor Based on Camera Imaging","summary":" Backdoor attack aims to compromise a model, which returns an adversary-wanted\noutput when a specific trigger pattern appears yet behaves normally for clean\ninputs. Current backdoor attacks require changing pixels of clean images, which\nresults in poor stealthiness of attacks and increases the difficulty of the\nphysical implementation. This paper proposes a novel physical invisible\nbackdoor based on camera imaging without changing nature image pixels.\nSpecifically, a compromised model returns a target label for images taken by a\nparticular camera, while it returns correct results for other images. To\nimplement and evaluate the proposed backdoor, we take shots of different\nobjects from multi-angles using multiple smartphones to build a new dataset of\n21,500 images. Conventional backdoor attacks work ineffectively with some\nclassical models, such as ResNet18, over the above-mentioned dataset.\nTherefore, we propose a three-step training strategy to mount the backdoor\nattack. First, we design and train a camera identification model with the phone\nIDs to extract the camera fingerprint feature. Subsequently, we elaborate a\nspecial network architecture, which is easily compromised by our backdoor\nattack, by leveraging the attributes of the CFA interpolation algorithm and\ncombining it with the feature extraction block in the camera identification\nmodel. Finally, we transfer the backdoor from the elaborated special network\narchitecture to the classical architecture model via teacher-student\ndistillation learning. Since the trigger of our method is related to the\nspecific phone, our attack works effectively in the physical world. Experiment\nresults demonstrate the feasibility of our proposed approach and robustness\nagainst various backdoor defenses.\n","authors":["Yusheng Guo","Nan Zhong","Zhenxing Qian","Xinpeng Zhang"],"pdf_url":"https://arxiv.org/pdf/2309.07428v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.07425v1","updated":"2023-09-14T04:45:09Z","published":"2023-09-14T04:45:09Z","title":"JSMNet Improving Indoor Point Cloud Semantic and Instance Segmentation\n through Self-Attention and Multiscale","summary":" The semantic understanding of indoor 3D point cloud data is crucial for a\nrange of subsequent applications, including indoor service robots, navigation\nsystems, and digital twin engineering. Global features are crucial for\nachieving high-quality semantic and instance segmentation of indoor point\nclouds, as they provide essential long-range context information. To this end,\nwe propose JSMNet, which combines a multi-layer network with a global feature\nself-attention module to jointly segment three-dimensional point cloud\nsemantics and instances. 
To better express the characteristics of indoor\ntargets, we have designed a multi-resolution feature adaptive fusion module\nthat takes into account the differences in point cloud density caused by\nvarying scanner distances from the target. Additionally, we propose a framework\nfor joint semantic and instance segmentation by integrating semantic and\ninstance features to achieve superior results. We conduct experiments on S3DIS,\nwhich is a large three-dimensional indoor point cloud dataset. Our proposed\nmethod is compared against other methods, and the results show that it\noutperforms existing methods in semantic and instance segmentation and provides\nbetter results in target local area segmentation. Specifically, our proposed\nmethod outperforms PointNet (Qi et al., 2017a) by 16.0% and 26.3% in terms of\nsemantic segmentation mIoU in S3DIS (Area 5) and instance segmentation mPre,\nrespectively. Additionally, it surpasses ASIS (Wang et al., 2019) by 6.0% and\n4.6%, respectively, as well as JSPNet (Chen et al., 2022) by a margin of 3.3%\nfor semantic segmentation mIoU and a slight improvement of 0.3% for instance\nsegmentation mPre.\n","authors":["Shuochen Xu","Zhenxin Zhang"],"pdf_url":"https://arxiv.org/pdf/2309.07425v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2303.08657v3","updated":"2023-09-14T04:26:01Z","published":"2023-03-15T14:41:17Z","title":"Economical Quaternion Extraction from a Human Skeletal Pose Estimate\n using 2-D Cameras","summary":" In this paper, we present a novel algorithm to extract a quaternion from a\ntwo dimensional camera frame for estimating a contained human skeletal pose.\nThe problem of pose estimation is usually tackled through the usage of stereo\ncameras and intertial measurement units for obtaining depth and euclidean\ndistance for measurement of points in 3D space. However, the usage of these\ndevices comes with a high signal processing latency as well as a significant\nmonetary cost. By making use of MediaPipe, a framework for building perception\npipelines for human pose estimation, the proposed algorithm extracts a\nquaternion from a 2-D frame capturing an image of a human object at a sub-fifty\nmillisecond latency while also being capable of deployment at edges with a\nsingle camera frame and a generally low computational resource availability,\nespecially for use cases involving last-minute detection and reaction by\nautonomous robots. The algorithm seeks to bypass the funding barrier and\nimprove accessibility for robotics researchers involved in designing control\nsystems.\n","authors":["Sriram Radhakrishna","Adithya Balasubramanyam"],"pdf_url":"https://arxiv.org/pdf/2303.08657v3.pdf","comment":"This is the post-final version of the paper published with IEEE\n CONECCT 2023 with some figure reference errors rectified"},{"id":"http://arxiv.org/abs/2304.08981v2","updated":"2023-09-14T04:03:28Z","published":"2023-04-18T13:23:42Z","title":"MER 2023: Multi-label Learning, Modality Robustness, and Semi-Supervised\n Learning","summary":" The first Multimodal Emotion Recognition Challenge (MER 2023) was\nsuccessfully held at ACM Multimedia. The challenge focuses on system robustness\nand consists of three distinct tracks: (1) MER-MULTI, where participants are\nrequired to recognize both discrete and dimensional emotions; (2) MER-NOISE, in\nwhich noise is added to test videos for modality robustness evaluation; (3)\nMER-SEMI, which provides a large amount of unlabeled samples for\nsemi-supervised learning. 
In this paper, we introduce the motivation behind\nthis challenge, describe the benchmark dataset, and provide some statistics\nabout participants. To continue using this dataset after MER 2023, please sign\na new End User License Agreement and send it to our official email address\nmerchallenge.contact@gmail.com. We believe this high-quality dataset can become\na new benchmark in multimodal emotion recognition, especially for the Chinese\nresearch community.\n","authors":["Zheng Lian","Haiyang Sun","Licai Sun","Kang Chen","Mingyu Xu","Kexin Wang","Ke Xu","Yu He","Ying Li","Jinming Zhao","Ye Liu","Bin Liu","Jiangyan Yi","Meng Wang","Erik Cambria","Guoying Zhao","Björn W. Schuller","Jianhua Tao"],"pdf_url":"https://arxiv.org/pdf/2304.08981v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2303.09858v3","updated":"2023-09-14T03:37:24Z","published":"2023-03-17T09:37:41Z","title":"Preventing Unauthorized AI Over-Analysis by Medical Image Adversarial\n Watermarking","summary":" The advancement of deep learning has facilitated the integration of\nArtificial Intelligence (AI) into clinical practices, particularly in\ncomputer-aided diagnosis. Given the pivotal role of medical images in various\ndiagnostic procedures, it becomes imperative to ensure the responsible and\nsecure utilization of AI techniques. However, the unauthorized utilization of\nAI for image analysis raises significant concerns regarding patient privacy and\npotential infringement on the proprietary rights of data custodians.\nConsequently, the development of pragmatic and cost-effective strategies that\nsafeguard patient privacy and uphold medical image copyrights emerges as a\ncritical necessity. In direct response to this pressing demand, we present a\npioneering solution named Medical Image Adversarial watermarking (MIAD-MARK).\nOur approach introduces watermarks that strategically mislead unauthorized AI\ndiagnostic models, inducing erroneous predictions without compromising the\nintegrity of the visual content. Importantly, our method integrates an\nauthorization protocol tailored for legitimate users, enabling the removal of\nthe MIAD-MARK through encryption-generated keys. Through extensive experiments,\nwe validate the efficacy of MIAD-MARK across three prominent medical image\ndatasets. The empirical outcomes demonstrate the substantial impact of our\napproach, notably reducing the accuracy of standard AI diagnostic models to a\nmere 8.57% under white box conditions and 45.83% in the more challenging black\nbox scenario. Additionally, our solution effectively mitigates unauthorized\nexploitation of medical images even in the presence of sophisticated watermark\nremoval networks. Notably, those AI diagnosis networks exhibit a meager average\naccuracy of 38.59% when applied to images protected by MIAD-MARK, underscoring\nthe robustness of our safeguarding mechanism.\n","authors":["Xingxing Wei","Bangzheng Pu","Shiji Zhao","Chen Chi","Huazhu Fu"],"pdf_url":"https://arxiv.org/pdf/2303.09858v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.07409v1","updated":"2023-09-14T03:25:37Z","published":"2023-09-14T03:25:37Z","title":"Masked Diffusion with Task-awareness for Procedure Planning in\n Instructional Videos","summary":" A key challenge with procedure planning in instructional videos lies in how\nto handle a large decision space consisting of a multitude of action types that\nbelong to various tasks. 
To understand real-world video content, an AI agent\nmust proficiently discern these action types (e.g., pour milk, pour water, open\nlid, close lid, etc.) based on brief visual observation. Moreover, it must\nadeptly capture the intricate semantic relation of the action types and task\ngoals, along with the variable action sequences. Recently, notable progress has\nbeen made via the integration of diffusion models and visual representation\nlearning to address the challenge. However, existing models employ rudimentary\nmechanisms to utilize task information to manage the decision space. To\novercome this limitation, we introduce a simple yet effective enhancement - a\nmasked diffusion model. The introduced mask acts akin to a task-oriented\nattention filter, enabling the diffusion/denoising process to concentrate on a\nsubset of action types. Furthermore, to bolster the accuracy of task\nclassification, we harness more potent visual representation learning\ntechniques. In particular, we learn a joint visual-text embedding, where a text\nembedding is generated by prompting a pre-trained vision-language model to\nfocus on human actions. We evaluate the method on three public datasets and\nachieve state-of-the-art performance on multiple metrics. Code is available at\nhttps://github.com/ffzzy840304/Masked-PDPP.\n","authors":["Fen Fang","Yun Liu","Ali Koksal","Qianli Xu","Joo-Hwee Lim"],"pdf_url":"https://arxiv.org/pdf/2309.07409v1.pdf","comment":"7 pages (main text excluding references), 3 figures, 7 tables"},{"id":"http://arxiv.org/abs/2309.07403v1","updated":"2023-09-14T03:16:05Z","published":"2023-09-14T03:16:05Z","title":"Flexible Visual Recognition by Evidential Modeling of Confusion and\n Ignorance","summary":" In real-world scenarios, typical visual recognition systems could fail under\ntwo major causes, i.e., the misclassification between known classes and the\nexcusable misbehavior on unknown-class images. To tackle these deficiencies,\nflexible visual recognition should dynamically predict multiple classes when\nthey are unconfident between choices and reject making predictions when the\ninput is entirely out of the training distribution. Two challenges emerge along\nwith this novel task. First, prediction uncertainty should be separately\nquantified as confusion depicting inter-class uncertainties and ignorance\nidentifying out-of-distribution samples. Second, both confusion and ignorance\nshould be comparable between samples to enable effective decision-making. In\nthis paper, we propose to model these two sources of uncertainty explicitly\nwith the theory of Subjective Logic. Regarding recognition as an\nevidence-collecting process, confusion is then defined as conflicting evidence,\nwhile ignorance is the absence of evidence. By predicting Dirichlet\nconcentration parameters for singletons, comprehensive subjective opinions,\nincluding confusion and ignorance, could be achieved via further evidence\ncombinations. 
Through a series of experiments on synthetic data analysis,\nvisual recognition, and open-set detection, we demonstrate the effectiveness of\nour methods in quantifying two sources of uncertainties and dealing with\nflexible recognition.\n","authors":["Lei Fan","Bo Liu","Haoxiang Li","Ying Wu","Gang Hua"],"pdf_url":"https://arxiv.org/pdf/2309.07403v1.pdf","comment":"Accepted by ICCV23"},{"id":"http://arxiv.org/abs/2309.07400v1","updated":"2023-09-14T03:04:06Z","published":"2023-09-14T03:04:06Z","title":"HIGT: Hierarchical Interaction Graph-Transformer for Whole Slide Image\n Analysis","summary":" In computation pathology, the pyramid structure of gigapixel Whole Slide\nImages (WSIs) has recently been studied for capturing various information from\nindividual cell interactions to tissue microenvironments. This hierarchical\nstructure is believed to be beneficial for cancer diagnosis and prognosis\ntasks. However, most previous hierarchical WSI analysis works (1) only\ncharacterize local or global correlations within the WSI pyramids and (2) use\nonly unidirectional interaction between different resolutions, leading to an\nincomplete picture of WSI pyramids. To this end, this paper presents a novel\nHierarchical Interaction Graph-Transformer (i.e., HIGT) for WSI analysis. With\nGraph Neural Network and Transformer as the building commons, HIGT can learn\nboth short-range local information and long-range global representation of the\nWSI pyramids. Considering that the information from different resolutions is\ncomplementary and can benefit each other during the learning process, we\nfurther design a novel Bidirectional Interaction block to establish\ncommunication between different levels within the WSI pyramids. Finally, we\naggregate both coarse-grained and fine-grained features learned from different\nlevels together for slide-level prediction. We evaluate our methods on two\npublic WSI datasets from TCGA projects, i.e., kidney carcinoma (KICA) and\nesophageal carcinoma (ESCA). Experimental results show that our HIGT\noutperforms both hierarchical and non-hierarchical state-of-the-art methods on\nboth tumor subtyping and staging tasks.\n","authors":["Ziyu Guo","Weiqin Zhao","Shujun Wang","Lequan Yu"],"pdf_url":"https://arxiv.org/pdf/2309.07400v1.pdf","comment":"Accepted by MICCAI2023; Code is available in\n https://github.com/HKU-MedAI/HIGT"},{"id":"http://arxiv.org/abs/2309.07398v1","updated":"2023-09-14T02:57:48Z","published":"2023-09-14T02:57:48Z","title":"Semantic Adversarial Attacks via Diffusion Models","summary":" Traditional adversarial attacks concentrate on manipulating clean examples in\nthe pixel space by adding adversarial perturbations. By contrast, semantic\nadversarial attacks focus on changing semantic attributes of clean examples,\nsuch as color, context, and features, which are more feasible in the real\nworld. In this paper, we propose a framework to quickly generate a semantic\nadversarial attack by leveraging recent diffusion models since semantic\ninformation is included in the latent space of well-trained diffusion models.\nThen there are two variants of this framework: 1) the Semantic Transformation\n(ST) approach fine-tunes the latent space of the generated image and/or the\ndiffusion model itself; 2) the Latent Masking (LM) approach masks the latent\nspace with another target image and local backpropagation-based interpretation\nmethods. Additionally, the ST approach can be applied in either white-box or\nblack-box settings. 
Extensive experiments are conducted on CelebA-HQ and AFHQ\ndatasets, and our framework demonstrates great fidelity, generalizability, and\ntransferability compared to other baselines. Our approaches achieve\napproximately 100% attack success rate in multiple settings with the best FID\nof 36.61. Code is available at\nhttps://github.com/steven202/semantic_adv_via_dm.\n","authors":["Chenan Wang","Jinhao Duan","Chaowei Xiao","Edward Kim","Matthew Stamm","Kaidi Xu"],"pdf_url":"https://arxiv.org/pdf/2309.07398v1.pdf","comment":"To appear in BMVC 2023"},{"id":"http://arxiv.org/abs/2309.06724v2","updated":"2023-09-14T02:44:57Z","published":"2023-09-13T04:57:12Z","title":"Deep Nonparametric Convexified Filtering for Computational Photography,\n Image Synthesis and Adversarial Defense","summary":" We aim to provide a general framework for computational photography that\nrecovers the real scene from imperfect images, via the Deep Nonparametric\nConvexified Filtering (DNCF). It consists of a nonparametric deep network that\nresembles the physical equations behind image formation, such as denoising,\nsuper-resolution, inpainting, and flash. DNCF has no parameterization dependent\non training data and therefore has strong generalization and robustness to\nadversarial image manipulation. During inference, we also encourage the network\nparameters to be nonnegative and create a bi-convex function on the input and\nparameters, and this adapts to second-order optimization algorithms with\ninsufficient running time, achieving a 10X acceleration over Deep Image Prior. With\nthese tools, we empirically verify its capability to defend image\nclassification deep networks against adversarial attack algorithms in real time.\n","authors":["Jianqiao Wangni"],"pdf_url":"https://arxiv.org/pdf/2309.06724v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.03955v2","updated":"2023-09-14T02:32:48Z","published":"2023-09-07T18:02:57Z","title":"SimpleNeRF: Regularizing Sparse Input Neural Radiance Fields with\n Simpler Solutions","summary":" Neural Radiance Fields (NeRF) show impressive performance for the\nphotorealistic free-view rendering of scenes. However, NeRFs require dense\nsampling of images in the given scene, and their performance degrades\nsignificantly when only a sparse set of views are available. Researchers have\nfound that supervising the depth estimated by the NeRF helps train it\neffectively with fewer views. The depth supervision is obtained either using\nclassical approaches or neural networks pre-trained on a large dataset. While\nthe former may provide only sparse supervision, the latter may suffer from\ngeneralization issues. As opposed to the earlier approaches, we seek to learn\nthe depth supervision by designing augmented models and training them along\nwith the NeRF. We design augmented models that encourage simpler solutions by\nexploring the role of positional encoding and view-dependent radiance in\ntraining the few-shot NeRF. The depth estimated by these simpler models is used\nto supervise the NeRF depth estimates. Since the augmented models can be\ninaccurate in certain regions, we design a mechanism to choose only reliable\ndepth estimates for supervision. Finally, we add a consistency loss between the\ncoarse and fine multi-layer perceptrons of the NeRF to ensure better\nutilization of hierarchical sampling. We achieve state-of-the-art\nview-synthesis performance on two popular datasets by employing the above\nregularizations.
The source code for our model can be found on our project\npage: https://nagabhushansn95.github.io/publications/2023/SimpleNeRF.html\n","authors":["Nagabhushan Somraj","Adithyan Karanayil","Rajiv Soundararajan"],"pdf_url":"https://arxiv.org/pdf/2309.03955v2.pdf","comment":"SIGGRAPH Asia 2023"},{"id":"http://arxiv.org/abs/2309.07394v1","updated":"2023-09-14T02:31:18Z","published":"2023-09-14T02:31:18Z","title":"Nucleus-aware Self-supervised Pretraining Using Unpaired Image-to-image\n Translation for Histopathology Images","summary":" Self-supervised pretraining attempts to enhance model performance by\nobtaining effective features from unlabeled data, and has demonstrated its\neffectiveness in the field of histopathology images. Despite its success, few\nworks concentrate on the extraction of nucleus-level information, which is\nessential for pathologic analysis. In this work, we propose a novel\nnucleus-aware self-supervised pretraining framework for histopathology images.\nThe framework aims to capture the nuclear morphology and distribution\ninformation through unpaired image-to-image translation between histopathology\nimages and pseudo mask images. The generation process is modulated by both\nconditional and stochastic style representations, ensuring the reality and\ndiversity of the generated histopathology images for pretraining. Further, an\ninstance segmentation guided strategy is employed to capture instance-level\ninformation. The experiments on 7 datasets show that the proposed pretraining\nmethod outperforms supervised ones on Kather classification, multiple instance\nlearning, and 5 dense-prediction tasks with the transfer learning protocol, and\nyields superior results compared to other self-supervised approaches on 8\nsemi-supervised tasks. Our project is publicly available at\nhttps://github.com/zhiyuns/UNITPathSSL.\n","authors":["Zhiyun Song","Penghui Du","Junpeng Yan","Kailu Li","Jianzhong Shou","Maode Lai","Yubo Fan","Yan Xu"],"pdf_url":"https://arxiv.org/pdf/2309.07394v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.07390v1","updated":"2023-09-14T02:19:38Z","published":"2023-09-14T02:19:38Z","title":"Unleashing the Power of Depth and Pose Estimation Neural Networks by\n Designing Compatible Endoscopic Images","summary":" Deep learning models have witnessed the depth and pose estimation framework on\nunannotated datasets as an effective pathway to success in endoscopic\nnavigation. Most current techniques are dedicated to developing more advanced\nneural networks to improve the accuracy. However, existing methods ignore the\nspecial properties of endoscopic images, resulting in an inability to fully\nunleash the power of neural networks. In this study, we conduct a detailed\nanalysis of the properties of endoscopic images and improve the compatibility\nof images and neural networks, to unleash the power of current neural networks.\nFirst, we introduce the Mask Image Modelling (MIM) module, which inputs partial\nimage information instead of complete image information, allowing the network\nto recover global information from partial pixel information. This enhances the\nnetwork's ability to perceive global information and alleviates the phenomenon\nof local overfitting in convolutional neural networks due to local artifacts.\nSecond, we propose a lightweight neural network to enhance the endoscopic\nimages, to explicitly improve the compatibility between images and neural\nnetworks.
Extensive experiments are conducted on the three public datasets and\none inhouse dataset, and the proposed modules improve baselines by a large\nmargin. Furthermore, the enhanced images we proposed, which have higher network\ncompatibility, can serve as an effective data augmentation method and they are\nable to extract more stable feature points in traditional feature point\nmatching tasks and achieve outstanding performance.\n","authors":["Junyang Wu","Yun Gu"],"pdf_url":"https://arxiv.org/pdf/2309.07390v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.07387v1","updated":"2023-09-14T02:09:20Z","published":"2023-09-14T02:09:20Z","title":"VDialogUE: A Unified Evaluation Benchmark for Visually-grounded Dialogue","summary":" Visually-grounded dialog systems, which integrate multiple modes of\ncommunication such as text and visual inputs, have become an increasingly\npopular area of investigation. However, the absence of a standardized\nevaluation framework poses a challenge in assessing the development of this\nfield. To this end, we propose \\textbf{VDialogUE}, a \\textbf{V}isually-grounded\n\\textbf{Dialog}ue benchmark for \\textbf{U}nified \\textbf{E}valuation. It\ndefines five core multi-modal dialogue tasks and covers six datasets.\nFurthermore, in order to provide a comprehensive assessment of the model's\nperformance across all tasks, we developed a novel evaluation metric called\nVDscore, which is based on the Analytic Hierarchy Process~(AHP) method.\nAdditionally, we present a straightforward yet efficient baseline model, named\n\\textbf{VISIT}~(\\textbf{VIS}ually-grounded d\\textbf{I}alog\n\\textbf{T}ransformer), to promote the advancement of general multi-modal\ndialogue systems. It progressively builds its multi-modal foundation and\ndialogue capability via a two-stage pre-training strategy.\n We believe that the VDialogUE benchmark, along with the evaluation scripts\nand our baseline models, will accelerate the development of visually-grounded\ndialog systems and lead to the development of more sophisticated and effective\npre-trained models.\n","authors":["Yunshui Li","Binyuan Hui","Zhaochao Yin","Wanwei He","Run Luo","Yuxing Long","Min Yang","Fei Huang","Yongbin Li"],"pdf_url":"https://arxiv.org/pdf/2309.07387v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2204.01210v3","updated":"2023-09-14T01:54:51Z","published":"2022-04-04T02:34:26Z","title":"Co-Teaching for Unsupervised Domain Adaptation and Expansion","summary":" Unsupervised Domain Adaptation (UDA) essentially trades a model's performance\non a source domain for improving its performance on a target domain. To resolve\nthe issue, Unsupervised Domain Expansion (UDE) has been proposed recently. UDE\ntries to adapt the model for the target domain as UDA does, and in the meantime\nmaintains its source-domain performance. In both UDA and UDE settings, a model\ntailored to a given domain, let it be the source or the target domain, is\nassumed to well handle samples from the given domain. We question the\nassumption by reporting the existence of cross-domain visual ambiguity: Given\nthe lack of a crystally clear boundary between the two domains, samples from\none domain can be visually close to the other domain. Such sorts of samples are\ntypically in minority in their host domain, so they tend to be overlooked by\nthe domain-specific model, but can be better handled by a model from the other\ndomain. We exploit this finding, and accordingly propose Co-Teaching (CT). 
The\nCT method is instantiated with knowledge distillation based CT (kdCT) plus\nmixup based CT (miCT). Specifically, kdCT transfers knowledge from a\nleading-teacher network and an assistant-teacher network to a student network,\nso the cross-domain ambiguity will be better handled by the student. Meanwhile,\nmiCT further enhances the generalization ability of the student. Extensive\nexperiments on two image classification datasets and two driving-scene\nsegmentation datasets justify the viability of CT for UDA and UDE.\n","authors":["Kaibin Tian","Qijie Wei","Xirong Li"],"pdf_url":"https://arxiv.org/pdf/2204.01210v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.06670v2","updated":"2023-09-14T01:46:01Z","published":"2023-09-13T02:15:29Z","title":"ShaDocFormer: A Shadow-attentive Threshold Detector with Cascaded Fusion\n Refiner for document shadow removal","summary":" Document shadow is a common issue that arise when capturing documents using\nmobile devices, which significantly impacts the readability. Current methods\nencounter various challenges including inaccurate detection of shadow masks and\nestimation of illumination. In this paper, we propose ShaDocFormer, a\nTransformer-based architecture that integrates traditional methodologies and\ndeep learning techniques to tackle the problem of document shadow removal. The\nShaDocFormer architecture comprises two components: the Shadow-attentive\nThreshold Detector (STD) and the Cascaded Fusion Refiner (CFR). The STD module\nemploys a traditional thresholding technique and leverages the attention\nmechanism of the Transformer to gather global information, thereby enabling\nprecise detection of shadow masks. The cascaded and aggregative structure of\nthe CFR module facilitates a coarse-to-fine restoration process for the entire\nimage. As a result, ShaDocFormer excels in accurately detecting and capturing\nvariations in both shadow and illumination, thereby enabling effective removal\nof shadows. Extensive experiments demonstrate that ShaDocFormer outperforms\ncurrent state-of-the-art methods in both qualitative and quantitative\nmeasurements.\n","authors":["Weiwen Chen","Shenghong Luo","Xuhang Chen","Zinuo Li","Shuqiang Wang","Chi-Man Pun"],"pdf_url":"https://arxiv.org/pdf/2309.06670v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.13469v2","updated":"2023-09-14T01:13:21Z","published":"2023-08-25T16:13:22Z","title":"RestNet: Boosting Cross-Domain Few-Shot Segmentation with Residual\n Transformation Network","summary":" Cross-domain few-shot segmentation (CD-FSS) aims to achieve semantic\nsegmentation in previously unseen domains with a limited number of annotated\nsamples. Although existing CD-FSS models focus on cross-domain feature\ntransformation, relying exclusively on inter-domain knowledge transfer may lead\nto the loss of critical intra-domain information. To this end, we propose a\nnovel residual transformation network (RestNet) that facilitates knowledge\ntransfer while retaining the intra-domain support-query feature information.\nSpecifically, we propose a Semantic Enhanced Anchor Transform (SEAT) module\nthat maps features to a stable domain-agnostic space using advanced semantics.\nAdditionally, an Intra-domain Residual Enhancement (IRE) module is designed to\nmaintain the intra-domain representation of the original discriminant space in\nthe new space. We also propose a mask prediction strategy based on prototype\nfusion to help the model gradually learn how to segment. 
Our RestNet can\ntransfer cross-domain knowledge from both inter-domain and intra-domain without\nrequiring additional fine-tuning. Extensive experiments on ISIC, Chest X-ray,\nand FSS-1000 show that our RestNet achieves state-of-the-art performance. Our\ncode will be available soon.\n","authors":["Xinyang Huang","Chuang Zhu","Wenkai Chen"],"pdf_url":"https://arxiv.org/pdf/2308.13469v2.pdf","comment":"BMVC 2023"},{"id":"http://arxiv.org/abs/2309.07361v1","updated":"2023-09-14T00:34:11Z","published":"2023-09-14T00:34:11Z","title":"Judging a video by its bitstream cover","summary":" Classifying videos into distinct categories, such as Sport and Music Video,\nis crucial for multimedia understanding and retrieval, especially in an age\nwhere an immense volume of video content is constantly being generated.\nTraditional methods require video decompression to extract pixel-level features\nlike color, texture, and motion, thereby increasing computational and storage\ndemands. Moreover, these methods often suffer from performance degradation in\nlow-quality videos. We present a novel approach that examines only the\npost-compression bitstream of a video to perform classification, eliminating\nthe need for decompression. We validate our approach using a custom-built data set\ncomprising over 29,000 YouTube video clips, totaling 6,000 hours and spanning\n11 distinct categories. Our preliminary evaluations indicate precision,\naccuracy, and recall rates well over 80%. The algorithm operates approximately\n15,000 times faster than real-time for 30fps videos, outperforming the traditional\nDynamic Time Warping (DTW) algorithm by six orders of magnitude.\n","authors":["Yuxing Han","Yunan Ding","Jiangtao Wen","Chen Ye Gan"],"pdf_url":"https://arxiv.org/pdf/2309.07361v1.pdf","comment":null}],"Information Retrieval":[{"id":"http://arxiv.org/abs/2309.07900v1","updated":"2023-09-14T17:48:34Z","published":"2023-09-14T17:48:34Z","title":"Ambiguity-Aware In-Context Learning with Large Language Models","summary":" In-context learning (ICL), i.e., showing LLMs only a few task-specific\ndemonstrations, has led to downstream gains with no task-specific fine-tuning\nrequired. However, LLMs are sensitive to the choice of prompts, and therefore a\ncrucial research question is how to select good demonstrations for ICL. One\neffective strategy is leveraging semantic similarity between the ICL\ndemonstrations and test inputs by using a text retriever, which, however, is\nsub-optimal as it does not consider the LLM's existing knowledge about that\ntask. From prior work (Min et al., 2022), we already know that labels paired\nwith the demonstrations bias the model predictions. This leads us to\nhypothesize that considering the LLM's existing knowledge about the task,\nespecially with respect to the output label space, can help in a better\ndemonstration selection strategy.
Through extensive experimentation on three\ntext classification tasks, we find that it is beneficial to not only choose\nsemantically similar ICL demonstrations but also to choose those demonstrations\nthat help resolve the inherent label ambiguity surrounding the test example.\nInterestingly, we find that including demonstrations that the LLM previously\nmis-classified and also fall on the test example's decision boundary, brings\nthe most performance gain.\n","authors":["Lingyu Gao","Aditi Chaudhary","Krishna Srinivasan","Kazuma Hashimoto","Karthik Raman","Michael Bendersky"],"pdf_url":"https://arxiv.org/pdf/2309.07900v1.pdf","comment":"13 pages in total"},{"id":"http://arxiv.org/abs/2309.07705v1","updated":"2023-09-14T13:31:33Z","published":"2023-09-14T13:31:33Z","title":"NineRec: A Benchmark Dataset Suite for Evaluating Transferable\n Recommendation","summary":" Learning a recommender system model from an item's raw modality features\n(such as image, text, audio, etc.), called MoRec, has attracted growing\ninterest recently. One key advantage of MoRec is that it can easily benefit\nfrom advances in other fields, such as natural language processing (NLP) and\ncomputer vision (CV). Moreover, it naturally supports transfer learning across\ndifferent systems through modality features, known as transferable recommender\nsystems, or TransRec.\n However, so far, TransRec has made little progress, compared to\ngroundbreaking foundation models in the fields of NLP and CV. The lack of\nlarge-scale, high-quality recommendation datasets poses a major obstacle. To\nthis end, we introduce NineRec, a TransRec dataset suite that includes a\nlarge-scale source domain recommendation dataset and nine diverse target domain\nrecommendation datasets. Each item in NineRec is represented by a text\ndescription and a high-resolution cover image. With NineRec, we can implement\nTransRec models in an end-to-end training manner instead of using pre-extracted\ninvariant features. We conduct a benchmark study and empirical analysis of\nTransRec using NineRec, and our findings provide several valuable insights. To\nsupport further research, we make our code, datasets, benchmarks, and\nleaderboards publicly available at\nhttps://github.com/anonymous?ninerec/NineRec.\n","authors":["Jiaqi Zhang","Yu Cheng","Yongxin Ni","Yunzhu Pan","Zheng Yuan","Junchen Fu","Youhua Li","Jie Wang","Fajie Yuan"],"pdf_url":"https://arxiv.org/pdf/2309.07705v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.07682v1","updated":"2023-09-14T12:55:23Z","published":"2023-09-14T12:55:23Z","title":"A Conversation is Worth A Thousand Recommendations: A Survey of Holistic\n Conversational Recommender Systems","summary":" Conversational recommender systems (CRS) generate recommendations through an\ninteractive process. However, not all CRS approaches use human conversations as\ntheir source of interaction data; the majority of prior CRS work simulates\ninteractions by exchanging entity-level information. As a result, claims of\nprior CRS work do not generalise to real-world settings where conversations\ntake unexpected turns, or where conversational and intent understanding is not\nperfect. To tackle this challenge, the research community has started to\nexamine holistic CRS, which are trained using conversational data collected\nfrom real-world scenarios. Despite their emergence, such holistic approaches\nare under-explored.\n We present a comprehensive survey of holistic CRS methods by summarizing the\nliterature in a structured manner. 
Our survey recognises holistic CRS\napproaches as having three components: 1) a backbone language model, the\noptional use of 2) external knowledge, and/or 3) external guidance. We also\ngive a detailed analysis of CRS datasets and evaluation methods in real\napplication scenarios. We offer our insight as to the current challenges of\nholistic CRS and possible future trends.\n","authors":["Chuang Li","Hengchang Hu","Yan Zhang","Min-Yen Kan","Haizhou Li"],"pdf_url":"https://arxiv.org/pdf/2309.07682v1.pdf","comment":"Accepted by 5th KaRS Workshop @ ACM RecSys 2023, 8 pages"},{"id":"http://arxiv.org/abs/2309.07610v1","updated":"2023-09-14T11:18:26Z","published":"2023-09-14T11:18:26Z","title":"Feature Engineering in Learning-to-Rank for Community Question Answering\n Task","summary":" Community question answering (CQA) forums are Internet-based platforms where\nusers ask questions about a topic and other expert users try to provide\nsolutions. Many CQA forums such as Quora, Stackoverflow, Yahoo!Answer,\nStackExchange exist with a lot of user-generated data. These data are leveraged\nin automated CQA ranking systems where similar questions (and answers) are\npresented in response to the query of the user. In this work, we empirically\ninvestigate a few aspects of this domain. Firstly, in addition to traditional\nfeatures like TF-IDF, BM25 etc., we introduce a BERT-based feature that\ncaptures the semantic similarity between the question and answer. Secondly,\nmost of the existing research works have focused on features extracted only\nfrom the question part; features extracted from answers have not been explored\nextensively. We combine both types of features in a linear fashion. Thirdly,\nusing our proposed concepts, we conduct an empirical investigation with\ndifferent rank-learning algorithms, some of which have not been used so far in\nCQA domain. On three standard CQA datasets, our proposed framework achieves\nstate-of-the-art performance. We also analyze importance of the features we use\nin our investigation. This work is expected to guide the practitioners to\nselect a better set of features for the CQA retrieval task.\n","authors":["Nafis Sajid","Md Rashidul Hasan","Muhammad Ibrahim"],"pdf_url":"https://arxiv.org/pdf/2309.07610v1.pdf","comment":"20 pages"},{"id":"http://arxiv.org/abs/2309.07606v1","updated":"2023-09-14T11:13:36Z","published":"2023-09-14T11:13:36Z","title":"Zero-shot Audio Topic Reranking using Large Language Models","summary":" The Multimodal Video Search by Examples (MVSE) project investigates using\nvideo clips as the query term for information retrieval, rather than the more\ntraditional text query. This enables far richer search modalities such as\nimages, speaker, content, topic, and emotion. A key element for this process is\nhighly rapid, flexible, search to support large archives, which in MVSE is\nfacilitated by representing video attributes by embeddings. This work aims to\nmitigate any performance loss from this rapid archive search by examining\nreranking approaches. In particular, zero-shot reranking methods using large\nlanguage models are investigated as these are applicable to any video archive\naudio content. Performance is evaluated for topic-based retrieval on a publicly\navailable video archive, the BBC Rewind corpus. Results demonstrate that\nreranking can achieve improved retrieval ranking without the need for any\ntask-specific training data.\n","authors":["Mengjie Qian","Rao Ma","Adian Liusie","Erfan Loweimi","Kate M. Knill","Mark J. F. 
Gales"],"pdf_url":"https://arxiv.org/pdf/2309.07606v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.07602v1","updated":"2023-09-14T11:07:10Z","published":"2023-09-14T11:07:10Z","title":"Turning Dross Into Gold Loss: is BERT4Rec really better than SASRec?","summary":" Recently sequential recommendations and next-item prediction task has become\nincreasingly popular in the field of recommender systems. Currently, two\nstate-of-the-art baselines are Transformer-based models SASRec and BERT4Rec.\nOver the past few years, there have been quite a few publications comparing\nthese two algorithms and proposing new state-of-the-art models. In most of the\npublications, BERT4Rec achieves better performance than SASRec. But BERT4Rec\nuses cross-entropy over softmax for all items, while SASRec uses negative\nsampling and calculates binary cross-entropy loss for one positive and one\nnegative item. In our work, we show that if both models are trained with the\nsame loss, which is used by BERT4Rec, then SASRec will significantly outperform\nBERT4Rec both in terms of quality and training speed. In addition, we show that\nSASRec could be effectively trained with negative sampling and still outperform\nBERT4Rec, but the number of negative examples should be much larger than one.\n","authors":["Anton Klenitskiy","Alexey Vasilev"],"pdf_url":"https://arxiv.org/pdf/2309.07602v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.07597v1","updated":"2023-09-14T10:57:50Z","published":"2023-09-14T10:57:50Z","title":"C-Pack: Packaged Resources To Advance General Chinese Embedding","summary":" We introduce C-Pack, a package of resources that significantly advance the\nfield of general Chinese embeddings. C-Pack includes three critical resources.\n1) C-MTEB is a comprehensive benchmark for Chinese text embeddings covering 6\ntasks and 35 datasets. 2) C-MTP is a massive text embedding dataset curated\nfrom labeled and unlabeled Chinese corpora for training embedding models. 3)\nC-TEM is a family of embedding models covering multiple sizes. Our models\noutperform all prior Chinese text embeddings on C-MTEB by up to +10% upon the\ntime of the release. We also integrate and optimize the entire suite of\ntraining methods for C-TEM. Along with our resources on general Chinese\nembedding, we release our data and models for English text embeddings. The\nEnglish models achieve state-of-the-art performance on MTEB benchmark;\nmeanwhile, our released English data is 2 times larger than the Chinese data.\nAll these resources are made publicly available at\nhttps://github.com/FlagOpen/FlagEmbedding.\n","authors":["Shitao Xiao","Zheng Liu","Peitian Zhang","Niklas Muennighof"],"pdf_url":"https://arxiv.org/pdf/2309.07597v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.07594v1","updated":"2023-09-14T10:54:48Z","published":"2023-09-14T10:54:48Z","title":"Neuro-Symbolic Recommendation Model based on Logic Query","summary":" A recommendation system assists users in finding items that are relevant to\nthem. Existing recommendation models are primarily based on predicting\nrelationships between users and items and use complex matching models or\nincorporate extensive external information to capture association patterns in\ndata. However, recommendation is not only a problem of inductive statistics\nusing data; it is also a cognitive task of reasoning decisions based on\nknowledge extracted from information. Hence, a logic system could naturally be\nincorporated for the reasoning in a recommendation task. 
However, although\nhard-rule approaches based on logic systems can provide powerful reasoning\nability, they struggle to cope with inconsistent and incomplete knowledge in\nreal-world tasks, especially for complex tasks such as recommendation.\nTherefore, in this paper, we propose a neuro-symbolic recommendation model,\nwhich transforms the user history interactions into a logic expression and then\ntransforms the recommendation prediction into a query task based on this logic\nexpression. The logic expressions are then computed based on the modular logic\noperations of the neural network. We also construct an implicit logic encoder\nto reasonably reduce the complexity of the logic computation. Finally, a user's\ninterest items can be queried in the vector space based on the computation\nresults. Experiments on three well-known datasets verified that our method\nperforms better compared to state of the art shallow, deep, session, and\nreasoning models.\n","authors":["Maonian Wu","Bang Chen","Shaojun Zhu","Bo Zheng","Wei Peng","Mingyi Zhang"],"pdf_url":"https://arxiv.org/pdf/2309.07594v1.pdf","comment":"17 pages, 6 figures"},{"id":"http://arxiv.org/abs/2309.07574v1","updated":"2023-09-14T10:09:11Z","published":"2023-09-14T10:09:11Z","title":"MMEAD: MS MARCO Entity Annotations and Disambiguations","summary":" MMEAD, or MS MARCO Entity Annotations and Disambiguations, is a resource for\nentity links for the MS MARCO datasets. We specify a format to store and share\nlinks for both document and passage collections of MS MARCO. Following this\nspecification, we release entity links to Wikipedia for documents and passages\nin both MS MARCO collections (v1 and v2). Entity links have been produced by\nthe REL and BLINK systems. MMEAD is an easy-to-install Python package, allowing\nusers to load the link data and entity embeddings effortlessly. Using MMEAD\ntakes only a few lines of code. Finally, we show how MMEAD can be used for IR\nresearch that uses entity information. We show how to improve recall@1000 and\nMRR@10 on more complex queries on the MS MARCO v1 passage dataset by using this\nresource. We also demonstrate how entity expansions can be used for interactive\nsearch applications.\n","authors":["Chris Kamphuis","Aileen Lin","Siwen Yang","Jimmy Lin","Arjen P. de Vries","Faegheh Hasibi"],"pdf_url":"https://arxiv.org/pdf/2309.07574v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2210.16591v2","updated":"2023-09-14T08:59:03Z","published":"2022-10-29T12:55:12Z","title":"DisenPOI: Disentangling Sequential and Geographical Influence for\n Point-of-Interest Recommendation","summary":" Point-of-Interest (POI) recommendation plays a vital role in various\nlocation-aware services. It has been observed that POI recommendation is driven\nby both sequential and geographical influences. However, since there is no\nannotated label of the dominant influence during recommendation, existing\nmethods tend to entangle these two influences, which may lead to sub-optimal\nrecommendation performance and poor interpretability. In this paper, we address\nthe above challenge by proposing DisenPOI, a novel Disentangled dual-graph\nframework for POI recommendation, which jointly utilizes sequential and\ngeographical relationships on two separate graphs and disentangles the two\ninfluences with self-supervision. The key novelty of our model compared with\nexisting approaches is to extract disentangled representations of both\nsequential and geographical influences with contrastive learning. 
To be\nspecific, we construct a geographical graph and a sequential graph based on the\ncheck-in sequence of a user. We tailor their propagation schemes to become\nsequence-/geo-aware to better capture the corresponding influences. Preference\nproxies are extracted from check-in sequence as pseudo labels for the two\ninfluences, which supervise the disentanglement via a contrastive loss.\nExtensive experiments on three datasets demonstrate the superiority of the\nproposed model.\n","authors":["Yifang Qin","Yifan Wang","Fang Sun","Wei Ju","Xuyang Hou","Zhe Wang","Jia Cheng","Jun Lei","Ming Zhang"],"pdf_url":"https://arxiv.org/pdf/2210.16591v2.pdf","comment":"Accepted by ACM International Conference on Web Search and Data\n Mining (WSDM'23)"},{"id":"http://arxiv.org/abs/2204.12793v3","updated":"2023-09-14T08:50:25Z","published":"2022-04-27T09:26:59Z","title":"Modern Baselines for SPARQL Semantic Parsing","summary":" In this work, we focus on the task of generating SPARQL queries from natural\nlanguage questions, which can then be executed on Knowledge Graphs (KGs). We\nassume that gold entity and relations have been provided, and the remaining\ntask is to arrange them in the right order along with SPARQL vocabulary, and\ninput tokens to produce the correct SPARQL query. Pre-trained Language Models\n(PLMs) have not been explored in depth on this task so far, so we experiment\nwith BART, T5 and PGNs (Pointer Generator Networks) with BERT embeddings,\nlooking for new baselines in the PLM era for this task, on DBpedia and Wikidata\nKGs. We show that T5 requires special input tokenisation, but produces state of\nthe art performance on LC-QuAD 1.0 and LC-QuAD 2.0 datasets, and outperforms\ntask-specific models from previous works. Moreover, the methods enable semantic\nparsing for questions where a part of the input needs to be copied to the\noutput query, thus enabling a new paradigm in KG semantic parsing.\n","authors":["Debayan Banerjee","Pranav Ajit Nair","Jivat Neet Kaur","Ricardo Usbeck","Chris Biemann"],"pdf_url":"https://arxiv.org/pdf/2204.12793v3.pdf","comment":"5 pages, short paper, SIGIR 2022"},{"id":"http://arxiv.org/abs/2210.00305v3","updated":"2023-09-14T07:06:03Z","published":"2022-10-01T16:01:53Z","title":"LambdaKG: A Library for Pre-trained Language Model-Based Knowledge Graph\n Embeddings","summary":" Knowledge Graphs (KGs) often have two characteristics: heterogeneous graph\nstructure and text-rich entity/relation information. Text-based KG embeddings\ncan represent entities by encoding descriptions with pre-trained language\nmodels, but no open-sourced library is specifically designed for KGs with PLMs\nat present. In this paper, we present LambdaKG, a library for KGE that equips\nwith many pre-trained language models (e.g., BERT, BART, T5, GPT-3), and\nsupports various tasks (e.g., knowledge graph completion, question answering,\nrecommendation, and knowledge probing). 
LambdaKG is publicly open-sourced at\nhttps://github.com/zjunlp/PromptKG/tree/main/lambdaKG, with a demo video at\nhttp://deepke.zjukg.cn/lambdakg.mp4 and long-term maintenance.\n","authors":["Xin Xie","Zhoubo Li","Xiaohan Wang","Zekun Xi","Ningyu Zhang"],"pdf_url":"https://arxiv.org/pdf/2210.00305v3.pdf","comment":"AACL 2023 System Demonstrations, the project website is\n https://zjunlp.github.io/project/promptkg/"},{"id":"http://arxiv.org/abs/2309.04861v2","updated":"2023-09-14T06:05:04Z","published":"2023-09-09T19:01:12Z","title":"Exploring Music Genre Classification: Algorithm Analysis and Deployment\n Architecture","summary":" Music genre classification has become increasingly critical with the advent\nof various streaming applications. Nowadays, we find it impossible to imagine\nusing the artist's name and song title to search for music in a sophisticated\nmusic app. It is always difficult to classify music correctly because the\ninformation linked to music, such as region, artist, album, or non-album, is so\nvariable. This paper presents a study on music genre classification using a\ncombination of Digital Signal Processing (DSP) and Deep Learning (DL)\ntechniques. A novel algorithm is proposed that utilizes both DSP and DL methods\nto extract relevant features from audio signals and classify them into various\ngenres. The algorithm was tested on the GTZAN dataset and achieved high\naccuracy. An end-to-end deployment architecture is also proposed for\nintegration into music-related applications. The performance of the algorithm\nis analyzed and future directions for improvement are discussed. The proposed\nDSP and DL-based music genre classification algorithm and deployment\narchitecture demonstrate a promising approach for music genre classification.\n","authors":["Ayan Biswas","Supriya Dhabal","Palaniandavar Venkateswaran"],"pdf_url":"https://arxiv.org/pdf/2309.04861v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2212.09597v7","updated":"2023-09-14T05:36:15Z","published":"2022-12-19T16:32:42Z","title":"Reasoning with Language Model Prompting: A Survey","summary":" Reasoning, as an essential ability for complex problem-solving, can provide\nback-end support for various real-world applications, such as medical\ndiagnosis, negotiation, etc. This paper provides a comprehensive survey of\ncutting-edge research on reasoning with language model prompting. We introduce\nresearch works with comparisons and summaries and provide systematic resources\nto help beginners. We also discuss the potential reasons for emerging such\nreasoning abilities and highlight future research directions. Resources are\navailable at https://github.com/zjunlp/Prompt4ReasoningPapers (updated\nperiodically).\n","authors":["Shuofei Qiao","Yixin Ou","Ningyu Zhang","Xiang Chen","Yunzhi Yao","Shumin Deng","Chuanqi Tan","Fei Huang","Huajun Chen"],"pdf_url":"https://arxiv.org/pdf/2212.09597v7.pdf","comment":"ACL 2023, 24 pages, add references of theoretical analysis"},{"id":"http://arxiv.org/abs/2309.04802v2","updated":"2023-09-14T02:31:12Z","published":"2023-09-09T14:07:11Z","title":"CPMR: Context-Aware Incremental Sequential Recommendation with\n Pseudo-Multi-Task Learning","summary":" The motivations of users to make interactions can be divided into static\npreference and dynamic interest. 
To accurately model user representations over\ntime, recent studies in sequential recommendation utilize information\npropagation and evolution to mine from batches of arriving interactions.\nHowever, they ignore the fact that people are easily influenced by the recent\nactions of other users in the contextual scenario, and applying evolution\nacross all historical interactions dilutes the importance of recent ones, thus\nfailing to model the evolution of dynamic interest accurately. To address this\nissue, we propose a Context-Aware Pseudo-Multi-Task Recommender System (CPMR)\nto model the evolution in both historical and contextual scenarios by creating\nthree representations for each user and item under different dynamics: static\nembedding, historical temporal states, and contextual temporal states. To\ndually improve the performance of temporal states evolution and incremental\nrecommendation, we design a Pseudo-Multi-Task Learning (PMTL) paradigm by\nstacking the incremental single-target recommendations into one multi-target\ntask for joint optimization. Within the PMTL paradigm, CPMR employs a\nshared-bottom network to conduct the evolution of temporal states across\nhistorical and contextual scenarios, as well as the fusion of them at the\nuser-item level. In addition, CPMR incorporates one real tower for incremental\npredictions, and two pseudo towers dedicated to updating the respective\ntemporal states based on new batches of interactions. Experimental results on\nfour benchmark recommendation datasets show that CPMR consistently outperforms\nstate-of-the-art baselines and achieves significant gains on three of them. The\ncode is available at: https://github.com/DiMarzioBian/CPMR.\n","authors":["Qingtian Bian","Jiaxing Xu","Hui Fang","Yiping Ke"],"pdf_url":"https://arxiv.org/pdf/2309.04802v2.pdf","comment":"Accepted by CIKM 2023. Alias: \"Modeling Context-Aware Temporal\n Dynamics via Pseudo-Multi-Task Learning\""},{"id":"http://arxiv.org/abs/2304.07041v4","updated":"2023-09-14T01:35:02Z","published":"2023-04-14T10:29:18Z","title":"A Diffusion model for POI recommendation","summary":" Next Point-of-Interest (POI) recommendation is a critical task in\nlocation-based services that aim to provide personalized suggestions for the\nuser's next destination. Previous works on POI recommendation have focused\non modeling the user's spatial preference. However, existing works that\nleverage spatial information are only based on the aggregation of users'\npreviously visited positions, which discourages the model from recommending POIs\nin novel areas. This trait of position-based methods will harm the model's\nperformance in many situations. Additionally, incorporating sequential\ninformation into the user's spatial preference remains a challenge. In this\npaper, we propose Diff-POI: a Diffusion-based model that samples the user's\nspatial preference for the next POI recommendation. Inspired by the wide\napplication of diffusion algorithms in sampling from distributions, Diff-POI\nencodes the user's visiting sequence and spatial character with two\ntailor-designed graph encoding modules, followed by a diffusion-based sampling\nstrategy to explore the user's spatial visiting trends. We leverage the\ndiffusion process and its reversed form to sample from the posterior\ndistribution and optimize the corresponding score function. We design a joint\ntraining and inference framework to optimize and evaluate the proposed\nDiff-POI.
Extensive experiments on four real-world POI recommendation datasets\ndemonstrate the superiority of our Diff-POI over state-of-the-art baseline\nmethods. Further ablation and parameter studies on Diff-POI reveal the\nfunctionality and effectiveness of the proposed diffusion-based sampling\nstrategy for addressing the limitations of existing methods.\n","authors":["Yifang Qin","Hongjun Wu","Wei Ju","Xiao Luo","Ming Zhang"],"pdf_url":"https://arxiv.org/pdf/2304.07041v4.pdf","comment":"Accepted by ACM Transactions on Information Systems (TOIS 2023)"}],"Machine Learning":[{"id":"http://arxiv.org/abs/2304.10520v2","updated":"2023-09-14T17:57:55Z","published":"2023-04-20T17:51:09Z","title":"Contrastive Tuning: A Little Help to Make Masked Autoencoders Forget","summary":" Masked Image Modeling (MIM) methods, like Masked Autoencoders (MAE),\nefficiently learn a rich representation of the input. However, for adapting to\ndownstream tasks, they require a sufficient amount of labeled data since their\nrich features code not only objects but also less relevant image background. In\ncontrast, Instance Discrimination (ID) methods focus on objects. In this work,\nwe study how to combine the efficiency and scalability of MIM with the ability\nof ID to perform downstream classification in the absence of large amounts of\nlabeled data. To this end, we introduce Masked Autoencoder Contrastive Tuning\n(MAE-CT), a sequential approach that utilizes the implicit clustering of the\nNearest Neighbor Contrastive Learning (NNCLR) objective to induce abstraction\nin the topmost layers of a pre-trained MAE. MAE-CT tunes the rich features such\nthat they form semantic clusters of objects without using any labels. Notably,\nMAE-CT does not rely on hand-crafted augmentations and frequently achieves its\nbest performances while using only minimal augmentations (crop & flip).\nFurther, MAE-CT is compute efficient as it requires at most 10% overhead\ncompared to MAE re-training. Applied to large and huge Vision Transformer (ViT)\nmodels, MAE-CT excels over previous self-supervised methods trained on ImageNet\nin linear probing, k-NN and low-shot classification accuracy as well as in\nunsupervised clustering accuracy. With ViT-H/16 MAE-CT achieves a new\nstate-of-the-art in linear probing of 82.2%.\n","authors":["Johannes Lehner","Benedikt Alkin","Andreas Fürst","Elisabeth Rumetshofer","Lukas Miklautz","Sepp Hochreiter"],"pdf_url":"https://arxiv.org/pdf/2304.10520v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.07907v1","updated":"2023-09-14T17:55:18Z","published":"2023-09-14T17:55:18Z","title":"Physically Plausible Full-Body Hand-Object Interaction Synthesis","summary":" We propose a physics-based method for synthesizing dexterous hand-object\ninteractions in a full-body setting. While recent advancements have addressed\nspecific facets of human-object interactions, a comprehensive physics-based\napproach remains a challenge. Existing methods often focus on isolated segments\nof the interaction process and rely on data-driven techniques that may result\nin artifacts. In contrast, our proposed method embraces reinforcement learning\n(RL) and physics simulation to mitigate the limitations of data-driven\napproaches. Through a hierarchical framework, we first learn skill priors for\nboth body and hand movements in a decoupled setting. The generic skill priors\nlearn to decode a latent skill embedding into the motion of the underlying\npart. 
A high-level policy then controls hand-object interactions in these\npretrained latent spaces, guided by task objectives of grasping and 3D target\ntrajectory following. It is trained using a novel reward function that combines\nan adversarial style term with a task reward, encouraging natural motions while\nfulfilling the task incentives. Our method successfully accomplishes the\ncomplete interaction task, from approaching an object to grasping and\nsubsequent manipulation. We compare our approach against kinematics-based\nbaselines and show that it leads to more physically plausible motions.\n","authors":["Jona Braun","Sammy Christen","Muhammed Kocabas","Emre Aksan","Otmar Hilliges"],"pdf_url":"https://arxiv.org/pdf/2309.07907v1.pdf","comment":"Project page at https://eth-ait.github.io/phys-fullbody-grasp"},{"id":"http://arxiv.org/abs/2309.07899v1","updated":"2023-09-14T17:48:30Z","published":"2023-09-14T17:48:30Z","title":"Improving physics-informed DeepONets with hard constraints","summary":" Current physics-informed (standard or operator) neural networks still rely on\naccurately learning the initial conditions of the system they are solving. In\ncontrast, standard numerical methods evolve such initial conditions without\nneeding to learn these. In this study, we propose to improve current\nphysics-informed deep learning strategies such that initial conditions do not\nneed to be learned and are represented exactly in the predicted solution.\nMoreover, this method guarantees that when a DeepONet is applied multiple times\nto time step a solution, the resulting function is continuous.\n","authors":["Rüdiger Brecht","Dmytro R. Popovych","Alex Bihlo","Roman O. Popovych"],"pdf_url":"https://arxiv.org/pdf/2309.07899v1.pdf","comment":"15 pages, 5 figures, 4 tables; release version"},{"id":"http://arxiv.org/abs/2308.08841v2","updated":"2023-09-14T17:45:45Z","published":"2023-08-17T08:00:20Z","title":"Machine Learning-Assisted Discovery of Novel Reactor Designs","summary":" Additive manufacturing has enabled the fabrication of advanced reactor\ngeometries, permitting larger, more complex design spaces. Identifying\npromising configurations within such spaces presents a significant challenge\nfor current approaches. Furthermore, existing parameterisations of reactor\ngeometries are low-dimensional with expensive optimisation limiting more\ncomplex solutions. To address this challenge, we establish a machine\nlearning-assisted approach for the design of the next-generation of chemical\nreactors, combining the application of high-dimensional parameterisations,\ncomputational fluid dynamics, and multi-fidelity Bayesian optimisation. We\nassociate the development of mixing-enhancing vortical flow structures in novel\ncoiled reactors with performance, and use our approach to identify key\ncharacteristics of optimal designs. By appealing to fluid mechanical\nprinciples, we rationalise the selection of novel design features that lead to\nexperimental performance improvements of ~60% over conventional designs. 
Our\nresults demonstrate that coupling advanced manufacturing techniques with\n`augmented-intelligence' approaches can lead to superior design performance\nand, consequently, emissions-reduction and sustainability.\n","authors":["Tom Savage","Nausheen Basha","Jonathan McDonough","Omar K Matar","Ehecatl Antonio del Rio Chanona"],"pdf_url":"https://arxiv.org/pdf/2308.08841v2.pdf","comment":"11 pages, 9 figures"},{"id":"http://arxiv.org/abs/2309.07893v1","updated":"2023-09-14T17:43:02Z","published":"2023-09-14T17:43:02Z","title":"Choosing a Proxy Metric from Past Experiments","summary":" In many randomized experiments, the treatment effect of the long-term metric\n(i.e. the primary outcome of interest) is often difficult or infeasible to\nmeasure. Such long-term metrics are often slow to react to changes and\nsufficiently noisy they are challenging to faithfully estimate in short-horizon\nexperiments. A common alternative is to measure several short-term proxy\nmetrics in the hope they closely track the long-term metric -- so they can be\nused to effectively guide decision-making in the near-term. We introduce a new\nstatistical framework to both define and construct an optimal proxy metric for\nuse in a homogeneous population of randomized experiments. Our procedure first\nreduces the construction of an optimal proxy metric in a given experiment to a\nportfolio optimization problem which depends on the true latent treatment\neffects and noise level of experiment under consideration. We then denoise the\nobserved treatment effects of the long-term metric and a set of proxies in a\nhistorical corpus of randomized experiments to extract estimates of the latent\ntreatment effects for use in the optimization problem. One key insight derived\nfrom our approach is that the optimal proxy metric for a given experiment is\nnot apriori fixed; rather it should depend on the sample size (or effective\nnoise level) of the randomized experiment for which it is deployed. To\ninstantiate and evaluate our framework, we employ our methodology in a large\ncorpus of randomized experiments from an industrial recommendation system and\nconstruct proxy metrics that perform favorably relative to several baselines.\n","authors":["Nilesh Tripuraneni","Lee Richardson","Alexander D'Amour","Jacopo Soriano","Steve Yadlowsky"],"pdf_url":"https://arxiv.org/pdf/2309.07893v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.07888v1","updated":"2023-09-14T17:40:44Z","published":"2023-09-14T17:40:44Z","title":"A Novel Local-Global Feature Fusion Framework for Body-weight Exercise\n Recognition with Pressure Mapping Sensors","summary":" We present a novel local-global feature fusion framework for body-weight\nexercise recognition with floor-based dynamic pressure maps. One step further\nfrom the existing studies using deep neural networks mainly focusing on global\nfeature extraction, the proposed framework aims to combine local and global\nfeatures using image processing techniques and the YOLO object detection to\nlocalize pressure profiles from different body parts and consider physical\nconstraints. The proposed local feature extraction method generates two sets of\nhigh-level local features consisting of cropped pressure mapping and numerical\nfeatures such as angular orientation, location on the mat, and pressure area.\nIn addition, we adopt a knowledge distillation for regularization to preserve\nthe knowledge of the global feature extraction and improve the performance of\nthe exercise recognition. 
Our experimental results demonstrate a notable 11\npercent improvement in F1 score for exercise recognition while preserving\nlabel-specific features.\n","authors":["Davinder Pal Singh","Lala Shakti Swarup Ray","Bo Zhou","Sungho Suh","Paul Lukowicz"],"pdf_url":"https://arxiv.org/pdf/2309.07888v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.07887v1","updated":"2023-09-14T17:36:53Z","published":"2023-09-14T17:36:53Z","title":"Some notes concerning a generalized KMM-type optimization method for\n density ratio estimation","summary":" In the present paper we introduce new optimization algorithms for the task of\ndensity ratio estimation. More precisely, we consider extending the well-known\nKMM method using the construction of a suitable loss function, in order to\nencompass more general situations involving the estimation of density ratio\nwith respect to subsets of the training data and test data, respectively. The\nassociated codes can be found at https://github.com/CDAlecsa/Generalized-KMM.\n","authors":["Cristian Daniel Alecsa"],"pdf_url":"https://arxiv.org/pdf/2309.07887v1.pdf","comment":"17 pages, 4 figures"},{"id":"http://arxiv.org/abs/2302.13348v2","updated":"2023-09-14T17:31:59Z","published":"2023-02-26T16:44:13Z","title":"Kernel Conditional Moment Constraints for Confounding Robust Inference","summary":" We study policy evaluation of offline contextual bandits subject to\nunobserved confounders. Sensitivity analysis methods are commonly used to\nestimate the policy value under the worst-case confounding over a given\nuncertainty set. However, existing work often resorts to some coarse relaxation\nof the uncertainty set for the sake of tractability, leading to overly\nconservative estimation of the policy value. In this paper, we propose a\ngeneral estimator that provides a sharp lower bound of the policy value. It can\nbe shown that our estimator contains the recently proposed sharp estimator by\nDorn and Guo (2022) as a special case, and our method enables a novel extension\nof the classical marginal sensitivity model using f-divergence. To construct\nour estimator, we leverage the kernel method to obtain a tractable\napproximation to the conditional moment constraints, which traditional\nnon-sharp estimators failed to take into account. In the theoretical analysis,\nwe provide a condition for the choice of the kernel which guarantees no\nspecification error that biases the lower bound estimation. Furthermore, we\nprovide consistency guarantees of policy evaluation and learning. In the\nexperiments with synthetic and real-world data, we demonstrate the\neffectiveness of the proposed method.\n","authors":["Kei Ishikawa","Niao He"],"pdf_url":"https://arxiv.org/pdf/2302.13348v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.01921v2","updated":"2023-09-14T17:28:52Z","published":"2023-07-17T05:59:34Z","title":"Transferable Graph Neural Fingerprint Models for Quick Response to\n Future Bio-Threats","summary":" Fast screening of drug molecules based on the ligand binding affinity is an\nimportant step in the drug discovery pipeline. Graph neural fingerprint is a\npromising method for developing molecular docking surrogates with high\nthroughput and great fidelity. In this study, we built a COVID-19 drug docking\ndataset of about 300,000 drug candidates on 23 coronavirus protein targets.\nWith this dataset, we trained graph neural fingerprint docking models for\nhigh-throughput virtual COVID-19 drug screening. 
The graph neural fingerprint\nmodels yield high prediction accuracy on docking scores with the mean squared\nerror lower than $0.21$ kcal/mol for most of the docking targets, showing\nsignificant improvement over conventional circular fingerprint methods. To make\nthe neural fingerprints transferable for unknown targets, we also propose a\ntransferable graph neural fingerprint method trained on multiple targets. With\ncomparable accuracy to target-specific graph neural fingerprint models, the\ntransferable model exhibits superb training and data efficiency. We highlight\nthat the impact of this study extends beyond COVID-19 dataset, as our approach\nfor fast virtual ligand screening can be easily adapted and integrated into a\ngeneral machine learning-accelerated pipeline to battle future bio-threats.\n","authors":["Wei Chen","Yihui Ren","Ai Kagawa","Matthew R. Carbone","Samuel Yen-Chi Chen","Xiaohui Qu","Shinjae Yoo","Austin Clyde","Arvind Ramanathan","Rick L. Stevens","Hubertus J. J. van Dam","Deyu Liu"],"pdf_url":"https://arxiv.org/pdf/2308.01921v2.pdf","comment":"8 pages, 5 figures, 2 tables, accepted by ICLMA2023"},{"id":"http://arxiv.org/abs/2309.07867v1","updated":"2023-09-14T17:14:26Z","published":"2023-09-14T17:14:26Z","title":"Beta Diffusion","summary":" We introduce beta diffusion, a novel generative modeling method that\nintegrates demasking and denoising to generate data within bounded ranges.\nUsing scaled and shifted beta distributions, beta diffusion utilizes\nmultiplicative transitions over time to create both forward and reverse\ndiffusion processes, maintaining beta distributions in both the forward\nmarginals and the reverse conditionals, given the data at any point in time.\nUnlike traditional diffusion-based generative models relying on additive\nGaussian noise and reweighted evidence lower bounds (ELBOs), beta diffusion is\nmultiplicative and optimized with KL-divergence upper bounds (KLUBs) derived\nfrom the convexity of the KL divergence. We demonstrate that the proposed KLUBs\nare more effective for optimizing beta diffusion compared to negative ELBOs,\nwhich can also be derived as the KLUBs of the same KL divergence with its two\narguments swapped. The loss function of beta diffusion, expressed in terms of\nBregman divergence, further supports the efficacy of KLUBs for optimization.\nExperimental results on both synthetic data and natural images demonstrate the\nunique capabilities of beta diffusion in generative modeling of range-bounded\ndata and validate the effectiveness of KLUBs in optimizing diffusion models,\nthereby making them valuable additions to the family of diffusion-based\ngenerative models and the optimization techniques used to train them.\n","authors":["Mingyuan Zhou","Tianqi Chen","Zhendong Wang","Huangjie Zheng"],"pdf_url":"https://arxiv.org/pdf/2309.07867v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.07860v1","updated":"2023-09-14T17:03:50Z","published":"2023-09-14T17:03:50Z","title":"Identifying the Group-Theoretic Structure of Machine-Learned Symmetries","summary":" Deep learning was recently successfully used in deriving symmetry\ntransformations that preserve important physics quantities. Being completely\nagnostic, these techniques postpone the identification of the discovered\nsymmetries to a later stage. In this letter we propose methods for examining\nand identifying the group-theoretic structure of such machine-learned\nsymmetries. 
We design loss functions which probe the subalgebra structure\neither during the deep learning stage of symmetry discovery or in a subsequent\npost-processing stage. We illustrate the new methods with examples from the\nU(n) Lie group family, obtaining the respective subalgebra decompositions. As\nan application to particle physics, we demonstrate the identification of the\nresidual symmetries after the spontaneous breaking of non-Abelian gauge\nsymmetries like SU(3) and SU(5) which are commonly used in model building.\n","authors":["Roy T. Forestano","Konstantin T. Matchev","Katia Matcheva","Alexander Roman","Eyup B. Unlu","Sarunas Verner"],"pdf_url":"https://arxiv.org/pdf/2309.07860v1.pdf","comment":"10 pages, 8 figures, 2 tables"},{"id":"http://arxiv.org/abs/2309.04612v2","updated":"2023-09-14T16:56:50Z","published":"2023-09-08T22:05:27Z","title":"Self-optimizing Feature Generation via Categorical Hashing\n Representation and Hierarchical Reinforcement Crossing","summary":" Feature generation aims to generate new and meaningful features to create a\ndiscriminative representation space.A generated feature is meaningful when the\ngenerated feature is from a feature pair with inherent feature interaction. In\nthe real world, experienced data scientists can identify potentially useful\nfeature-feature interactions, and generate meaningful dimensions from an\nexponentially large search space, in an optimal crossing form over an optimal\ngeneration path. But, machines have limited human-like abilities.We generalize\nsuch learning tasks as self-optimizing feature generation. Self-optimizing\nfeature generation imposes several under-addressed challenges on existing\nsystems: meaningful, robust, and efficient generation. To tackle these\nchallenges, we propose a principled and generic representation-crossing\nframework to solve self-optimizing feature generation.To achieve hashing\nrepresentation, we propose a three-step approach: feature discretization,\nfeature hashing, and descriptive summarization. To achieve reinforcement\ncrossing, we develop a hierarchical reinforcement feature crossing approach.We\npresent extensive experimental results to demonstrate the effectiveness and\nefficiency of the proposed method. The code is available at\nhttps://github.com/yingwangyang/HRC_feature_cross.git.\n","authors":["Wangyang Ying","Dongjie Wang","Kunpeng Liu","Leilei Sun","Yanjie Fu"],"pdf_url":"https://arxiv.org/pdf/2309.04612v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2305.11322v3","updated":"2023-09-14T16:37:44Z","published":"2023-05-18T22:11:04Z","title":"SpikeCP: Delay-Adaptive Reliable Spiking Neural Networks via Conformal\n Prediction","summary":" Spiking neural networks (SNNs) process time-series data via internal\nevent-driven neural dynamics whose energy consumption depends on the number of\nspikes exchanged between neurons over the course of the input presentation. In\ntypical implementations of an SNN classifier, decisions are produced after the\nentire input sequence has been processed, resulting in latency and energy\nconsumption levels that are fairly uniform across inputs. Recently introduced\ndelay-adaptive SNNs tailor the inference latency -- and, with it, the energy\nconsumption -- to the difficulty of each example, by producing an early\ndecision when the SNN model is sufficiently ``confident''. 
In this paper, we\nstart by observing that, as an SNN processes input samples, its classification\ndecisions tend to be first under-confident and then over-confident with respect\nto the decision's ground-truth, unknown, test accuracy. This makes it difficult\nto determine a stopping time that ensures a desired level of accuracy. To\naddress this problem, we introduce a novel delay-adaptive SNN-based inference\nmethodology that, wrapping around any pre-trained SNN classifier, provides\nguaranteed reliability for the decisions produced at input-dependent stopping\ntimes. The approach entails minimal added complexity as compared to the\nunderlying SNN, requiring only thresholding and counting operations at run\ntime, and it leverages tools from conformal prediction (CP).\n","authors":["Jiechen Chen","Sangwoo Park","Osvaldo Simeone"],"pdf_url":"https://arxiv.org/pdf/2305.11322v3.pdf","comment":"Under review"},{"id":"http://arxiv.org/abs/2111.01996v2","updated":"2023-09-14T16:37:31Z","published":"2021-11-03T03:28:30Z","title":"Pareto Adversarial Robustness: Balancing Spatial Robustness and\n Sensitivity-based Robustness","summary":" Adversarial robustness, which primarily comprises sensitivity-based\nrobustness and spatial robustness, plays an integral part in achieving robust\ngeneralization. In this paper, we endeavor to design strategies to achieve\nuniversal adversarial robustness. To achieve this, we first investigate the\nrelatively less-explored realm of spatial robustness. Then, we integrate the\nexisting spatial robustness methods by incorporating both local and global\nspatial vulnerability into a unified spatial attack and adversarial training\napproach. Furthermore, we present a comprehensive relationship between natural\naccuracy, sensitivity-based robustness, and spatial robustness, supported by\nstrong evidence from the perspective of robust representation. Crucially, to\nreconcile the interplay between the mutual impacts of various robustness\ncomponents into one unified framework, we incorporate the \\textit{Pareto\ncriterion} into the adversarial robustness analysis, yielding a novel strategy\ncalled Pareto Adversarial Training for achieving universal robustness. The\nresulting Pareto front, which delineates the set of optimal solutions, provides\nan optimal balance between natural accuracy and various adversarial robustness.\nThis sheds light on solutions for achieving universal robustness in the future.\nTo the best of our knowledge, we are the first to consider universal\nadversarial robustness via multi-objective optimization.\n","authors":["Ke Sun","Mingjie Li","Zhouchen Lin"],"pdf_url":"https://arxiv.org/pdf/2111.01996v2.pdf","comment":"Published in SCIENCE CHINA Information Sciences (SCIS) 2023. Please\n also refer to the published version in the Journal reference\n https://www.sciengine.com/SCIS/doi/10.1007/s11432-022-3861-8"},{"id":"http://arxiv.org/abs/2208.06028v2","updated":"2023-09-14T16:37:12Z","published":"2022-08-11T20:17:02Z","title":"Gaussian Process Surrogate Models for Neural Networks","summary":" Not being able to understand and predict the behavior of deep learning\nsystems makes it hard to decide what architecture and algorithm to use for a\ngiven problem. In science and engineering, modeling is a methodology used to\nunderstand complex systems whose internal processes are opaque. Modeling\nreplaces a complex system with a simpler, more interpretable surrogate. 
Drawing\ninspiration from this, we construct a class of surrogate models for neural\nnetworks using Gaussian processes. Rather than deriving kernels for infinite\nneural networks, we learn kernels empirically from the naturalistic behavior of\nfinite neural networks. We demonstrate our approach captures existing phenomena\nrelated to the spectral bias of neural networks, and then show that our\nsurrogate models can be used to solve practical problems such as identifying\nwhich points most influence the behavior of specific neural networks and\npredicting which architectures and algorithms will generalize well for specific\ndatasets.\n","authors":["Michael Y. Li","Erin Grant","Thomas L. Griffiths"],"pdf_url":"https://arxiv.org/pdf/2208.06028v2.pdf","comment":"Proceedings of UAI 2023"},{"id":"http://arxiv.org/abs/2309.07835v1","updated":"2023-09-14T16:22:14Z","published":"2023-09-14T16:22:14Z","title":"Learning to Warm-Start Fixed-Point Optimization Algorithms","summary":" We introduce a machine-learning framework to warm-start fixed-point\noptimization algorithms. Our architecture consists of a neural network mapping\nproblem parameters to warm starts, followed by a predefined number of\nfixed-point iterations. We propose two loss functions designed to either\nminimize the fixed-point residual or the distance to a ground truth solution.\nIn this way, the neural network predicts warm starts with the end-to-end goal\nof minimizing the downstream loss. An important feature of our architecture is\nits flexibility, in that it can predict a warm start for fixed-point algorithms\nrun for any number of steps, without being limited to the number of steps it\nhas been trained on. We provide PAC-Bayes generalization bounds on unseen data\nfor common classes of fixed-point operators: contractive, linearly convergent,\nand averaged. Applying this framework to well-known applications in control,\nstatistics, and signal processing, we observe a significant reduction in the\nnumber of iterations and solution time required to solve these problems,\nthrough learned warm starts.\n","authors":["Rajiv Sambharya","Georgina Hall","Brandon Amos","Bartolomeo Stellato"],"pdf_url":"https://arxiv.org/pdf/2309.07835v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.05735v3","updated":"2023-09-14T16:10:39Z","published":"2023-07-11T19:03:17Z","title":"Effective Latent Differential Equation Models via Attention and Multiple\n Shooting","summary":" Scientific Machine Learning (SciML) is a burgeoning field that\nsynergistically combines domain-aware and interpretable models with agnostic\nmachine learning techniques. In this work, we introduce GOKU-UI, an evolution\nof the SciML generative model GOKU-nets. GOKU-UI not only broadens the original\nmodel's spectrum to incorporate other classes of differential equations, such\nas Stochastic Differential Equations (SDEs), but also integrates attention\nmechanisms and a novel multiple shooting training strategy in the latent space.\nThese modifications have led to a significant increase in its performance in\nboth reconstruction and forecast tasks, as demonstrated by our evaluation of\nsimulated and empirical data. Specifically, GOKU-UI outperformed all baseline\nmodels on synthetic datasets even with a training set 16-fold smaller,\nunderscoring its remarkable data efficiency. 
Furthermore, when applied to\nempirical human brain data, while incorporating stochastic Stuart-Landau\noscillators into its dynamical core, our proposed enhancements markedly\nincreased the model's effectiveness in capturing complex brain dynamics. This\naugmented version not only surpassed all baseline methods in the reconstruction\ntask, but also demonstrated lower prediction error of future brain activity up\nto 15 seconds ahead. By training GOKU-UI on resting state fMRI data, we encoded\nwhole-brain dynamics into a latent representation, learning a low-dimensional\ndynamical system model that could offer insights into brain functionality and\nopen avenues for practical applications such as the classification of mental\nstates or psychiatric conditions. Ultimately, our research provides further\nimpetus for the field of Scientific Machine Learning, showcasing the potential\nfor advancements when established scientific insights are interwoven with\nmodern machine learning.\n","authors":["Germán Abrevaya","Mahta Ramezanian-Panahi","Jean-Christophe Gagnon-Audet","Pablo Polosecki","Irina Rish","Silvina Ponce Dawson","Guillermo Cecchi","Guillaume Dumas"],"pdf_url":"https://arxiv.org/pdf/2307.05735v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2205.11110v2","updated":"2023-09-14T16:08:10Z","published":"2022-05-23T07:58:50Z","title":"Meta-Learning Regrasping Strategies for Physical-Agnostic Objects","summary":" Grasping inhomogeneous objects in real-world applications remains a\nchallenging task due to the unknown physical properties such as mass\ndistribution and coefficient of friction. In this study, we propose a\nmeta-learning algorithm called ConDex, which incorporates Conditional Neural\nProcesses (CNP) with DexNet-2.0 to autonomously discern the underlying physical\nproperties of objects using depth images. ConDex efficiently acquires physical\nembeddings from limited trials, enabling precise grasping point estimation.\nFurthermore, ConDex is capable of updating the predicted grasping quality\niteratively from new trials in an online fashion. To the best of our knowledge,\nwe are the first who generate two object datasets focusing on inhomogeneous\nphysical properties with varying mass distributions and friction coefficients.\nExtensive evaluations in simulation demonstrate ConDex's superior performance\nover DexNet-2.0 and existing meta-learning-based grasping pipelines.\nFurthermore, ConDex shows robust generalization to previously unseen real-world\nobjects despite training solely in the simulation. The synthetic and real-world\ndatasets will be published as well.\n","authors":["Ning Gao","Jingyu Zhang","Ruijie Chen","Ngo Anh Vien","Hanna Ziesche","Gerhard Neumann"],"pdf_url":"https://arxiv.org/pdf/2205.11110v2.pdf","comment":"Accepted as spotlight in ICRA 2022 Workshop: Scaling Robot Learning"},{"id":"http://arxiv.org/abs/2304.01029v2","updated":"2023-09-14T16:05:46Z","published":"2023-04-03T14:28:29Z","title":"Domain Generalization for Crop Segmentation with Knowledge Distillation","summary":" In recent years, precision agriculture has gradually oriented farming closer\nto automation processes to support all the activities related to field\nmanagement. Service robotics plays a predominant role in this evolution by\ndeploying autonomous agents that can navigate fields while performing tasks\nwithout human intervention, such as monitoring, spraying, and harvesting. 
To\nexecute these precise actions, mobile robots need a real-time perception system\nthat understands their surroundings and identifies their targets in the wild.\nGeneralizing to new crops and environmental conditions is critical for\npractical applications, as labeled samples are rarely available. In this paper,\nwe investigate the problem of crop segmentation and propose a novel approach to\nenhance domain generalization using knowledge distillation. In the proposed\nframework, we transfer knowledge from an ensemble of models individually\ntrained on source domains to a student model that can adapt to unseen target\ndomains. To evaluate the proposed method, we present a synthetic multi-domain\ndataset for crop segmentation containing plants of variegate shapes and\ncovering different terrain styles, weather conditions, and light scenarios for\nmore than 50,000 samples. We demonstrate significant improvements in\nperformance over state-of-the-art methods and superior sim-to-real\ngeneralization. Our approach provides a promising solution for domain\ngeneralization in crop segmentation and has the potential to enhance a wide\nvariety of precision agriculture applications.\n","authors":["Simone Angarano","Mauro Martini","Alessandro Navone","Marcello Chiaberge"],"pdf_url":"https://arxiv.org/pdf/2304.01029v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2303.08447v2","updated":"2023-09-14T16:01:06Z","published":"2023-03-15T08:42:48Z","title":"MAHTM: A Multi-Agent Framework for Hierarchical Transactive Microgrids","summary":" Integrating variable renewable energy into the grid has posed challenges to\nsystem operators in achieving optimal trade-offs among energy availability,\ncost affordability, and pollution controllability. This paper proposes a\nmulti-agent reinforcement learning framework for managing energy transactions\nin microgrids. The framework addresses the challenges above: it seeks to\noptimize the usage of available resources by minimizing the carbon footprint\nwhile benefiting all stakeholders. The proposed architecture consists of three\nlayers of agents, each pursuing different objectives. The first layer,\ncomprised of prosumers and consumers, minimizes the total energy cost. The\nother two layers control the energy price to decrease the carbon impact while\nbalancing the consumption and production of both renewable and conventional\nenergy. This framework also takes into account fluctuations in energy demand\nand supply.\n","authors":["Nicolas Cuadrado","Roberto Gutierrez","Yongli Zhu","Martin Takac"],"pdf_url":"https://arxiv.org/pdf/2303.08447v2.pdf","comment":"ICLR 2023 Workshop: Tackling Climate Change with Machine Learning"},{"id":"http://arxiv.org/abs/2309.07813v1","updated":"2023-09-14T15:59:23Z","published":"2023-09-14T15:59:23Z","title":"Directed Scattering for Knowledge Graph-based Cellular Signaling\n Analysis","summary":" Directed graphs are a natural model for many phenomena, in particular\nscientific knowledge graphs such as molecular interaction or chemical reaction\nnetworks that define cellular signaling relationships. In these situations,\nsource nodes typically have distinct biophysical properties from sinks. Due to\ntheir ordered and unidirectional relationships, many such networks also have\nhierarchical and multiscale structure. 
However, the majority of methods\nperforming node- and edge-level tasks in machine learning do not take these\nproperties into account, and thus have not been leveraged effectively for\nscientific tasks such as cellular signaling network inference. We propose a new\nframework called Directed Scattering Autoencoder (DSAE) which uses a directed\nversion of a geometric scattering transform, combined with the non-linear\ndimensionality reduction properties of an autoencoder and the geometric\nproperties of the hyperbolic space to learn latent hierarchies. We show this\nmethod outperforms numerous others on tasks such as embedding directed graphs\nand learning cellular signaling networks.\n","authors":["Aarthi Venkat","Joyce Chew","Ferran Cardoso Rodriguez","Christopher J. Tape","Michael Perlmutter","Smita Krishnaswamy"],"pdf_url":"https://arxiv.org/pdf/2309.07813v1.pdf","comment":"5 pages, 3 figures"},{"id":"http://arxiv.org/abs/2309.07812v1","updated":"2023-09-14T15:59:16Z","published":"2023-09-14T15:59:16Z","title":"Text Classification of Cancer Clinical Trial Eligibility Criteria","summary":" Automatic identification of clinical trials for which a patient is eligible\nis complicated by the fact that trial eligibility is stated in natural\nlanguage. A potential solution to this problem is to employ text classification\nmethods for common types of eligibility criteria. In this study, we focus on\nseven common exclusion criteria in cancer trials: prior malignancy, human\nimmunodeficiency virus, hepatitis B, hepatitis C, psychiatric illness,\ndrug/substance abuse, and autoimmune illness. Our dataset consists of 764 phase\nIII cancer trials with these exclusions annotated at the trial level. We\nexperiment with common transformer models as well as a new pre-trained clinical\ntrial BERT model. Our results demonstrate the feasibility of automatically\nclassifying common exclusion criteria. Additionally, we demonstrate the value\nof a pre-trained language model specifically for clinical trials, which yields\nthe highest average performance across all criteria.\n","authors":["Yumeng Yang","Soumya Jayaraj","Ethan B Ludmir","Kirk Roberts"],"pdf_url":"https://arxiv.org/pdf/2309.07812v1.pdf","comment":"AMIA Annual Symposium Proceedings 2023"},{"id":"http://arxiv.org/abs/2309.07809v1","updated":"2023-09-14T15:55:58Z","published":"2023-09-14T15:55:58Z","title":"Communication Efficient Private Federated Learning Using Dithering","summary":" The task of preserving privacy while ensuring efficient communication is a\nfundamental challenge in federated learning. In this work, we tackle this\nchallenge in the trusted aggregator model, and propose a solution that achieves\nboth objectives simultaneously. We show that employing a quantization scheme\nbased on subtractive dithering at the clients can effectively replicate the\nnormal noise addition process at the aggregator. This implies that we can\nguarantee the same level of differential privacy against other clients while\nsubstantially reducing the amount of communication required, as opposed to\ntransmitting full precision gradients and using central noise addition. 
We also\nexperimentally demonstrate that the accuracy of our proposed approach matches\nthat of the full precision gradient method.\n","authors":["Burak Hasircioglu","Deniz Gunduz"],"pdf_url":"https://arxiv.org/pdf/2309.07809v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.07808v1","updated":"2023-09-14T15:54:56Z","published":"2023-09-14T15:54:56Z","title":"What Matters to Enhance Traffic Rule Compliance of Imitation Learning\n for Automated Driving","summary":" More research attention has recently been given to end-to-end autonomous\ndriving technologies where the entire driving pipeline is replaced with a\nsingle neural network because of its simpler structure and faster inference\ntime. Despite this appealing approach largely reducing the components in\ndriving pipeline, its simplicity also leads to interpretability problems and\nsafety issues arXiv:2003.06404. The trained policy is not always compliant with\nthe traffic rules and it is also hard to discover the reason for the\nmisbehavior because of the lack of intermediate outputs. Meanwhile, Sensors are\nalso critical to autonomous driving's security and feasibility to perceive the\nsurrounding environment under complex driving scenarios. In this paper, we\nproposed P-CSG, a novel penalty-based imitation learning approach with cross\nsemantics generation sensor fusion technologies to increase the overall\nperformance of End-to-End Autonomous Driving. We conducted an assessment of our\nmodel's performance using the Town 05 Long benchmark, achieving an impressive\ndriving score improvement of over 15%. Furthermore, we conducted robustness\nevaluations against adversarial attacks like FGSM and Dot attacks, revealing a\nsubstantial increase in robustness compared to baseline models.More detailed\ninformation, such as code-based resources, ablation studies and videos can be\nfound at https://hk-zh.github.io/p-csg-plus.\n","authors":["Hongkuan Zhou","Aifen Sui","Wei Cao","Letian Shi"],"pdf_url":"https://arxiv.org/pdf/2309.07808v1.pdf","comment":"8 pages, 2 figures"},{"id":"http://arxiv.org/abs/2212.04953v2","updated":"2023-09-14T15:42:54Z","published":"2022-12-09T16:03:34Z","title":"TargetCall: Eliminating the Wasted Computation in Basecalling via\n Pre-Basecalling Filtering","summary":" Basecalling is an essential step in nanopore sequencing analysis where the\nraw signals of nanopore sequencers are converted into nucleotide sequences,\ni.e., reads. State-of-the-art basecallers employ complex deep learning models\nto achieve high basecalling accuracy. This makes basecalling\ncomputationally-inefficient and memory-hungry; bottlenecking the entire genome\nanalysis pipeline. However, for many applications, the majority of reads do no\nmatch the reference genome of interest (i.e., target reference) and thus are\ndiscarded in later steps in the genomics pipeline, wasting the basecalling\ncomputation. To overcome this issue, we propose TargetCall, the first\npre-basecalling filter to eliminate the wasted computation in basecalling.\nTargetCall's key idea is to discard reads that will not match the target\nreference (i.e., off-target reads) prior to basecalling. TargetCall consists of\ntwo main components: (1) LightCall, a lightweight neural network basecaller\nthat produces noisy reads; and (2) Similarity Check, which labels each of these\nnoisy reads as on-target or off-target by matching them to the target\nreference. TargetCall aims to filter out all off-target reads before\nbasecalling. 
The highly-accurate but slow basecalling is performed only on the\nraw signals whose noisy reads are labeled as on-target. Our thorough\nexperimental evaluations using both real and simulated data show that\nTargetCall 1) improves the end-to-end basecalling performance while maintaining\nhigh sensitivity in keeping on-target reads, 2) maintains high accuracy in\ndownstream analysis, 3) precisely filters out up to 94.71% of off-target reads,\nand 4) achieves better performance, throughput, sensitivity, precision, and\ngenerality compared to prior works. We open-source TargetCall at\nhttps://github.com/CMU-SAFARI/TargetCall\n","authors":["Meryem Banu Cavlak","Gagandeep Singh","Mohammed Alser","Can Firtina","Joël Lindegger","Mohammad Sadrosadati","Nika Mansouri Ghiasi","Can Alkan","Onur Mutlu"],"pdf_url":"https://arxiv.org/pdf/2212.04953v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.07794v1","updated":"2023-09-14T15:30:59Z","published":"2023-09-14T15:30:59Z","title":"Improving Multimodal Classification of Social Media Posts by Leveraging\n Image-Text Auxiliary tasks","summary":" Effectively leveraging multimodal information from social media posts is\nessential to various downstream tasks such as sentiment analysis, sarcasm\ndetection and hate speech classification. However, combining text and image\ninformation is challenging because of the idiosyncratic cross-modal semantics\nwith hidden or complementary information present in matching image-text pairs.\nIn this work, we aim to directly model this by proposing the use of two\nauxiliary losses jointly with the main task when fine-tuning any pre-trained\nmultimodal model. Image-Text Contrastive (ITC) brings image-text\nrepresentations of a post closer together and separates them from different\nposts, capturing underlying dependencies. Image-Text Matching (ITM) facilitates\nthe understanding of semantic correspondence between images and text by\npenalizing unrelated pairs. We combine these objectives with five multimodal\nmodels, demonstrating consistent improvements across four popular social media\ndatasets. Furthermore, through detailed analysis, we shed light on the specific\nscenarios and cases where each auxiliary task proves to be most effective.\n","authors":["Danae Sánchez Villegas","Daniel Preoţiuc-Pietro","Nikolaos Aletras"],"pdf_url":"https://arxiv.org/pdf/2309.07794v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2302.07260v5","updated":"2023-09-14T15:10:01Z","published":"2023-02-14T18:55:21Z","title":"Scalable Bayesian optimization with high-dimensional outputs using\n randomized prior networks","summary":" Several fundamental problems in science and engineering consist of global\noptimization tasks involving unknown high-dimensional (black-box) functions\nthat map a set of controllable variables to the outcomes of an expensive\nexperiment. Bayesian Optimization (BO) techniques are known to be effective in\ntackling global optimization problems using a relatively small number objective\nfunction evaluations, but their performance suffers when dealing with\nhigh-dimensional outputs. 
To overcome the major challenge of dimensionality,\nhere we propose a deep learning framework for BO and sequential decision making\nbased on bootstrapped ensembles of neural architectures with randomized priors.\nUsing appropriate architecture choices, we show that the proposed framework can\napproximate functional relationships between design variables and quantities of\ninterest, even in cases where the latter take values in high-dimensional vector\nspaces or even infinite-dimensional function spaces. In the context of BO, we\naugmented the proposed probabilistic surrogates with re-parameterized Monte\nCarlo approximations of multiple-point (parallel) acquisition functions, as\nwell as methodological extensions for accommodating black-box constraints and\nmulti-fidelity information sources. We test the proposed framework against\nstate-of-the-art methods for BO and demonstrate superior performance across\nseveral challenging tasks with high-dimensional outputs, including a\nconstrained multi-fidelity optimization task involving shape optimization of\nrotor blades in turbo-machinery.\n","authors":["Mohamed Aziz Bhouri","Michael Joly","Robert Yu","Soumalya Sarkar","Paris Perdikaris"],"pdf_url":"https://arxiv.org/pdf/2302.07260v5.pdf","comment":"23 pages, 8 figures"},{"id":"http://arxiv.org/abs/2309.07778v1","updated":"2023-09-14T15:09:35Z","published":"2023-09-14T15:09:35Z","title":"Virchow: A Million-Slide Digital Pathology Foundation Model","summary":" Computational pathology uses artificial intelligence to enable precision\nmedicine and decision support systems through the analysis of whole slide\nimages. It has the potential to revolutionize the diagnosis and treatment of\ncancer. However, a major challenge to this objective is that for many specific\ncomputational pathology tasks the amount of data is inadequate for development.\nTo address this challenge, we created Virchow, a 632 million parameter deep\nneural network foundation model for computational pathology. Using\nself-supervised learning, Virchow is trained on 1.5 million hematoxylin and\neosin stained whole slide images from diverse tissue groups, which is orders of\nmagnitude more data than previous works. When evaluated on downstream tasks\nincluding tile-level pan-cancer detection and subtyping and slide-level\nbiomarker prediction, Virchow outperforms state-of-the-art systems both on\ninternal datasets drawn from the same population as the pretraining data as\nwell as external public datasets. Virchow achieves 93% balanced accuracy for\npancancer tile classification, and AUCs of 0.983 for colon microsatellite\ninstability status prediction and 0.967 for breast CDH1 status prediction. The\ngains in performance highlight the importance of pretraining on massive\npathology image datasets, suggesting pretraining on even larger datasets could\ncontinue improving performance for many high-impact applications where limited\namounts of training data are available, such as drug outcome prediction.\n","authors":["Eugene Vorontsov","Alican Bozkurt","Adam Casson","George Shaikovski","Michal Zelechowski","Siqi Liu","Philippe Mathieu","Alexander van Eck","Donghun Lee","Julian Viret","Eric Robert","Yi Kan Wang","Jeremy D. Kun","Matthew C. H. Le","Jan Bernhard","Ran A. Godrich","Gerard Oakley","Ewan Millar","Matthew Hanna","Juan Retamero","William A. Moye","Razik Yousfi","Christopher Kanan","David Klimstra","Brandon Rothrock","Thomas J. 
Fuchs"],"pdf_url":"https://arxiv.org/pdf/2309.07778v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.07770v1","updated":"2023-09-14T14:59:58Z","published":"2023-09-14T14:59:58Z","title":"Variational Quantum Linear Solver enhanced Quantum Support Vector\n Machine","summary":" Quantum Support Vector Machines (QSVM) play a vital role in using quantum\nresources for supervised machine learning tasks, such as classification.\nHowever, current methods are strongly limited in terms of scalability on Noisy\nIntermediate Scale Quantum (NISQ) devices. In this work, we propose a novel\napproach called the Variational Quantum Linear Solver (VQLS) enhanced QSVM.\nThis is built upon our idea of utilizing the variational quantum linear solver\nto solve system of linear equations of a least squares-SVM on a NISQ device.\nThe implementation of our approach is evaluated by an extensive series of\nnumerical experiments with the Iris dataset, which consists of three distinct\niris plant species. Based on this, we explore the practicality and\neffectiveness of our algorithm by constructing a classifier capable of\nclassification in a feature space ranging from one to seven dimensions.\nFurthermore, by strategically exploiting both classical and quantum computing\nfor various subroutines of our algorithm, we effectively mitigate practical\nchallenges associated with the implementation. These include significant\nimprovement in the trainability of the variational ansatz and notable\nreductions in run-time for cost calculations. Based on the numerical\nexperiments, our approach exhibits the capability of identifying a separating\nhyperplane in an 8-dimensional feature space. Moreover, it consistently\ndemonstrated strong performance across various instances with the same dataset.\n","authors":["Jianming Yi","Kalyani Suresh","Ali Moghiseh","Norbert Wehn"],"pdf_url":"https://arxiv.org/pdf/2309.07770v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2208.13049v4","updated":"2023-09-14T14:54:04Z","published":"2022-08-27T16:19:26Z","title":"TrojViT: Trojan Insertion in Vision Transformers","summary":" Vision Transformers (ViTs) have demonstrated the state-of-the-art performance\nin various vision-related tasks. The success of ViTs motivates adversaries to\nperform backdoor attacks on ViTs. Although the vulnerability of traditional\nCNNs to backdoor attacks is well-known, backdoor attacks on ViTs are\nseldom-studied. Compared to CNNs capturing pixel-wise local features by\nconvolutions, ViTs extract global context information through patches and\nattentions. Na\\\"ively transplanting CNN-specific backdoor attacks to ViTs\nyields only a low clean data accuracy and a low attack success rate. In this\npaper, we propose a stealth and practical ViT-specific backdoor attack\n$TrojViT$. Rather than an area-wise trigger used by CNN-specific backdoor\nattacks, TrojViT generates a patch-wise trigger designed to build a Trojan\ncomposed of some vulnerable bits on the parameters of a ViT stored in DRAM\nmemory through patch salience ranking and attention-target loss. TrojViT\nfurther uses minimum-tuned parameter update to reduce the bit number of the\nTrojan. Once the attacker inserts the Trojan into the ViT model by flipping the\nvulnerable bits, the ViT model still produces normal inference accuracy with\nbenign inputs. But when the attacker embeds a trigger into an input, the ViT\nmodel is forced to classify the input to a predefined target class. 
We show\nthat flipping only few vulnerable bits identified by TrojViT on a ViT model\nusing the well-known RowHammer can transform the model into a backdoored one.\nWe perform extensive experiments of multiple datasets on various ViT models.\nTrojViT can classify $99.64\\%$ of test images to a target class by flipping\n$345$ bits on a ViT for ImageNet.Our codes are available at\nhttps://github.com/mxzheng/TrojViT\n","authors":["Mengxin Zheng","Qian Lou","Lei Jiang"],"pdf_url":"https://arxiv.org/pdf/2208.13049v4.pdf","comment":"10 pages, 4 figures, 11 tables"},{"id":"http://arxiv.org/abs/2309.07760v1","updated":"2023-09-14T14:48:01Z","published":"2023-09-14T14:48:01Z","title":"PRE: Vision-Language Prompt Learning with Reparameterization Encoder","summary":" Large pre-trained vision-language models such as CLIP have demonstrated great\npotential in zero-shot transferability to downstream tasks. However, to attain\noptimal performance, the manual selection of prompts is necessary to improve\nalignment between the downstream image distribution and the textual class\ndescriptions. This manual prompt engineering is the major challenge for\ndeploying such models in practice since it requires domain expertise and is\nextremely time-consuming. To avoid non-trivial prompt engineering, recent work\nContext Optimization (CoOp) introduced the concept of prompt learning to the\nvision domain using learnable textual tokens. While CoOp can achieve\nsubstantial improvements over manual prompts, its learned context is worse\ngeneralizable to wider unseen classes within the same dataset. In this work, we\npresent Prompt Learning with Reparameterization Encoder (PRE) - a simple and\nefficient method that enhances the generalization ability of the learnable\nprompt to unseen classes while maintaining the capacity to learn Base classes.\nInstead of directly optimizing the prompts, PRE employs a prompt encoder to\nreparameterize the input prompt embeddings, enhancing the exploration of\ntask-specific knowledge from few-shot samples. Experiments and extensive\nablation studies on 8 benchmarks demonstrate that our approach is an efficient\nmethod for prompt learning. Specifically, PRE achieves a notable enhancement of\n5.60% in average accuracy on New classes and 3% in Harmonic mean compared to\nCoOp in the 16-shot setting, all achieved within a good training time.\n","authors":["Anh Pham Thi Minh"],"pdf_url":"https://arxiv.org/pdf/2309.07760v1.pdf","comment":"8 pages excluding References and Appendix"},{"id":"http://arxiv.org/abs/2208.00085v3","updated":"2023-09-14T14:45:30Z","published":"2022-07-29T21:56:59Z","title":"Machine Learning and Computer Vision Techniques in Continuous Beehive\n Monitoring Applications: A survey","summary":" Wide use and availability of the machine learning and computer vision\ntechniques allows development of relatively complex monitoring systems in many\ndomains. Besides the traditional industrial domain, new application appears\nalso in biology and agriculture, where we could speak about the detection of\ninfections, parasites and weeds, but also about automated monitoring and early\nwarning systems. This is also connected with the introduction of the easily\naccessible hardware and development kits such as Arduino, or RaspberryPi\nfamily. In this paper, we survey 50 existing papers focusing on the methods of\nautomated beehive monitoring methods using the computer vision techniques,\nparticularly on the pollen and Varroa mite detection together with the bee\ntraffic monitoring. 
Such systems could also be used for the monitoring of the\nhoneybee colonies and for the inspection of their health state, which could\nidentify potentially dangerous states before the situation is critical, or to\nbetter plan periodic bee colony inspections and therefore save significant\ncosts. Later, we also include analysis of the research trends in this\napplication field and we outline the possible direction of the new\nexplorations. Our paper is aimed also at veterinary and apidology professionals\nand experts, who might not be familiar with machine learning to introduce them\nto its possibilities, therefore each family of applications is opened by a\nbrief theoretical introduction and motivation related to its base method. We\nhope that this paper will inspire other scientists to use machine learning\ntechniques for other applications in beehive monitoring.\n","authors":["Simon Bilik","Tomas Zemcik","Lukas Kratochvila","Dominik Ricanek","Milos Richter","Sebastian Zambanini","Karel Horak"],"pdf_url":"https://arxiv.org/pdf/2208.00085v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.07742v1","updated":"2023-09-14T14:26:20Z","published":"2023-09-14T14:26:20Z","title":"Interpretability is in the Mind of the Beholder: A Causal Framework for\n Human-interpretable Representation Learning","summary":" Focus in Explainable AI is shifting from explanations defined in terms of\nlow-level elements, such as input features, to explanations encoded in terms of\ninterpretable concepts learned from data. How to reliably acquire such concepts\nis, however, still fundamentally unclear. An agreed-upon notion of concept\ninterpretability is missing, with the result that concepts used by both\npost-hoc explainers and concept-based neural networks are acquired through a\nvariety of mutually incompatible strategies. Critically, most of these neglect\nthe human side of the problem: a representation is understandable only insofar\nas it can be understood by the human at the receiving end. The key challenge in\nHuman-interpretable Representation Learning (HRL) is how to model and\noperationalize this human element. In this work, we propose a mathematical\nframework for acquiring interpretable representations suitable for both\npost-hoc explainers and concept-based neural networks. Our formalization of HRL\nbuilds on recent advances in causal representation learning and explicitly\nmodels a human stakeholder as an external observer. This allows us to derive a\nprincipled notion of alignment between the machine representation and the\nvocabulary of concepts understood by the human. In doing so, we link alignment\nand interpretability through a simple and intuitive name transfer game, and\nclarify the relationship between alignment and a well-known property of\nrepresentations, namely disentanglment. We also show that alignment is linked\nto the issue of undesirable correlations among concepts, also known as concept\nleakage, and to content-style separation, all through a general\ninformation-theoretic reformulation of these properties. 
Our conceptualization\naims to bridge the gap between the human and algorithmic sides of\ninterpretability and establish a stepping stone for new research on\nhuman-interpretable representations.\n","authors":["Emanuele Marconato","Andrea Passerini","Stefano Teso"],"pdf_url":"https://arxiv.org/pdf/2309.07742v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.00923v2","updated":"2023-09-14T14:05:02Z","published":"2023-09-02T12:07:21Z","title":"GBE-MLZSL: A Group Bi-Enhancement Framework for Multi-Label Zero-Shot\n Learning","summary":" This paper investigates a challenging problem of zero-shot learning in the\nmulti-label scenario (MLZSL), wherein, the model is trained to recognize\nmultiple unseen classes within a sample (e.g., an image) based on seen classes\nand auxiliary knowledge, e.g., semantic information. Existing methods usually\nresort to analyzing the relationship of various seen classes residing in a\nsample from the dimension of spatial or semantic characteristics, and transfer\nthe learned model to unseen ones. But they ignore the effective integration of\nlocal and global features. That is, in the process of inferring unseen classes,\nglobal features represent the principal direction of the image in the feature\nspace, while local features should maintain uniqueness within a certain range.\nThis integrated neglect will make the model lose its grasp of the main\ncomponents of the image. Relying only on the local existence of seen classes\nduring the inference stage introduces unavoidable bias. In this paper, we\npropose a novel and effective group bi-enhancement framework for MLZSL, dubbed\nGBE-MLZSL, to fully make use of such properties and enable a more accurate and\nrobust visual-semantic projection. Specifically, we split the feature maps into\nseveral feature groups, of which each feature group can be trained\nindependently with the Local Information Distinguishing Module (LID) to ensure\nuniqueness. Meanwhile, a Global Enhancement Module (GEM) is designed to\npreserve the principal direction. Besides, a static graph structure is designed\nto construct the correlation of local features. Experiments on large-scale\nMLZSL benchmark datasets NUS-WIDE and Open-Images-v4 demonstrate that the\nproposed GBE-MLZSL outperforms other state-of-the-art methods with large\nmargins.\n","authors":["Ziming Liu","Jingcai Guo","Xiaocheng Lu","Song Guo","Peiran Dong","Jiewei Zhang"],"pdf_url":"https://arxiv.org/pdf/2309.00923v2.pdf","comment":"11 pages, 8 figures"},{"id":"http://arxiv.org/abs/2309.07716v1","updated":"2023-09-14T13:48:16Z","published":"2023-09-14T13:48:16Z","title":"Understanding Vector-Valued Neural Networks and Their Relationship with\n Real and Hypercomplex-Valued Neural Networks","summary":" Despite the many successful applications of deep learning models for\nmultidimensional signal and image processing, most traditional neural networks\nprocess data represented by (multidimensional) arrays of real numbers. The\nintercorrelation between feature channels is usually expected to be learned\nfrom the training data, requiring numerous parameters and careful training. In\ncontrast, vector-valued neural networks are conceived to process arrays of\nvectors and naturally consider the intercorrelation between feature channels.\nConsequently, they usually have fewer parameters and often undergo more robust\ntraining than traditional neural networks. This paper aims to present a broad\nframework for vector-valued neural networks, referred to as V-nets. 
In this\ncontext, hypercomplex-valued neural networks are regarded as vector-valued\nmodels with additional algebraic properties. Furthermore, this paper explains\nthe relationship between vector-valued and traditional neural networks.\nPrecisely, a vector-valued neural network can be obtained by placing\nrestrictions on a real-valued model to consider the intercorrelation between\nfeature channels. Finally, we show how V-nets, including hypercomplex-valued\nneural networks, can be implemented in current deep-learning libraries as\nreal-valued networks.\n","authors":["Marcos Eduardo Valle"],"pdf_url":"https://arxiv.org/pdf/2309.07716v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.07708v1","updated":"2023-09-14T13:42:27Z","published":"2023-09-14T13:42:27Z","title":"Market-GAN: Adding Control to Financial Market Data Generation with\n Semantic Context","summary":" Financial simulators play an important role in enhancing forecasting\naccuracy, managing risks, and fostering strategic financial decision-making.\nDespite the development of financial market simulation methodologies, existing\nframeworks often struggle with adapting to specialized simulation context. We\npinpoint the challenges as i) current financial datasets do not contain context\nlabels; ii) current techniques are not designed to generate financial data with\ncontext as control, which demands greater precision compared to other\nmodalities; iii) the inherent difficulties in generating context-aligned,\nhigh-fidelity data given the non-stationary, noisy nature of financial data. To\naddress these challenges, our contributions are: i) we proposed the Contextual\nMarket Dataset with market dynamics, stock ticker, and history state as\ncontext, leveraging a market dynamics modeling method that combines linear\nregression and Dynamic Time Warping clustering to extract market dynamics; ii)\nwe present Market-GAN, a novel architecture incorporating a Generative\nAdversarial Networks (GAN) for the controllable generation with context, an\nautoencoder for learning low-dimension features, and supervisors for knowledge\ntransfer; iii) we introduce a two-stage training scheme to ensure that\nMarket-GAN captures the intrinsic market distribution with multiple objectives.\nIn the pertaining stage, with the use of the autoencoder and supervisors, we\nprepare the generator with a better initialization for the adversarial training\nstage. We propose a set of holistic evaluation metrics that consider alignment,\nfidelity, data usability on downstream tasks, and market facts. We evaluate\nMarket-GAN with the Dow Jones Industrial Average data from 2000 to 2023 and\nshowcase superior performance in comparison to 4 state-of-the-art time-series\ngenerative models.\n","authors":["Haochong Xia","Shuo Sun","Xinrun Wang","Bo An"],"pdf_url":"https://arxiv.org/pdf/2309.07708v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2302.10848v2","updated":"2023-09-14T13:29:40Z","published":"2023-02-21T17:59:10Z","title":"Deep reinforced learning heuristic tested on spin-glass ground states:\n The larger picture","summary":" In Changjun Fan et al. 
[Nature Communications\nhttps://doi.org/10.1038/s41467-023-36363-w (2023)], the authors present a deep\nreinforced learning approach to augment combinatorial optimization heuristics.\nIn particular, they present results for several spin glass ground state\nproblems, for which instances on non-planar networks are generally NP-hard, in\ncomparison with several Monte Carlo based methods, such as simulated annealing\n(SA) or parallel tempering (PT). Indeed, those results demonstrate that the\nreinforced learning improves the results over those obtained with SA or PT, or\nat least allows for reduced runtimes for the heuristics before results of\ncomparable quality have been obtained relative to those other methods. To\nfacilitate the conclusion that their method is ''superior'', the authors pursue\ntwo basic strategies: (1) A commercial GUROBI solver is called on to procure a\nsample of exact ground states as a testbed to compare with, and (2) a\nhead-to-head comparison between the heuristics is given for a sample of larger\ninstances where exact ground states are hard to ascertain. Here, we put these\nstudies into a larger context, showing that the claimed superiority is at best\nmarginal for smaller samples and becomes essentially irrelevant with respect to\nany sensible approximation of true ground states in the larger samples. For\nexample, this method becomes irrelevant as a means to determine stiffness\nexponents $\\theta$ in $d>2$, as mentioned by the authors, where the problem is\nnot only NP-hard but requires the subtraction of two almost equal ground-state\nenergies and systemic errors in each of $\\approx 1\\%$ found here are\nunacceptable. This larger picture on the method arises from a straightforward\nfinite-size corrections study over the spin glass ensembles the authors employ,\nusing data that has been available for decades.\n","authors":["Stefan Boettcher"],"pdf_url":"https://arxiv.org/pdf/2302.10848v2.pdf","comment":"5 pages, 2 figures, comment on arXiv:2109.14411, related information\n can be found at https://physics.emory.edu/faculty/boettcher/"},{"id":"http://arxiv.org/abs/2309.07703v1","updated":"2023-09-14T13:25:42Z","published":"2023-09-14T13:25:42Z","title":"Causal Entropy and Information Gain for Measuring Causal Control","summary":" Artificial intelligence models and methods commonly lack causal\ninterpretability. Despite the advancements in interpretable machine learning\n(IML) methods, they frequently assign importance to features which lack causal\ninfluence on the outcome variable. Selecting causally relevant features among\nthose identified as relevant by these methods, or even before model training,\nwould offer a solution. Feature selection methods utilizing information\ntheoretical quantities have been successful in identifying statistically\nrelevant features. However, the information theoretical quantities they are\nbased on do not incorporate causality, rendering them unsuitable for such\nscenarios. To address this challenge, this article proposes information\ntheoretical quantities that incorporate the causal structure of the system,\nwhich can be used to evaluate causal importance of features for some given\noutcome variable. Specifically, we introduce causal versions of entropy and\nmutual information, termed causal entropy and causal information gain, which\nare designed to assess how much control a feature provides over the outcome\nvariable. 
These newly defined quantities capture changes in the entropy of a\nvariable resulting from interventions on other variables. Fundamental results\nconnecting these quantities to the existence of causal effects are derived. The\nuse of causal information gain in feature selection is demonstrated,\nhighlighting its superiority over standard mutual information in revealing\nwhich features provide control over a chosen outcome variable. Our\ninvestigation paves the way for the development of methods with improved\ninterpretability in domains involving causation.\n","authors":["Francisco Nunes Ferreira Quialheiro Simoes","Mehdi Dastani","Thijs van Ommen"],"pdf_url":"https://arxiv.org/pdf/2309.07703v1.pdf","comment":"16 pages. Accepted at the third XI-ML workshop of ECAI 2023. To\n appear in the Springer CCIS book series"},{"id":"http://arxiv.org/abs/2206.03420v3","updated":"2023-09-14T13:15:56Z","published":"2022-06-07T16:12:17Z","title":"An Adaptive Federated Relevance Framework for Spatial Temporal Graph\n Learning","summary":" Spatial-temporal data contains rich information and has been widely studied\nin recent years due to the rapid development of relevant applications in many\nfields. For instance, medical institutions often use electrodes attached to\ndifferent parts of a patient to analyse the electorencephal data rich with\nspatial and temporal features for health assessment and disease diagnosis.\nExisting research has mainly used deep learning techniques such as\nconvolutional neural network (CNN) or recurrent neural network (RNN) to extract\nhidden spatial-temporal features. Yet, it is challenging to incorporate both\ninter-dependencies spatial information and dynamic temporal changes\nsimultaneously. In reality, for a model that leverages these spatial-temporal\nfeatures to fulfil complex prediction tasks, it often requires a colossal\namount of training data in order to obtain satisfactory model performance.\nConsidering the above-mentioned challenges, we propose an adaptive federated\nrelevance framework, namely FedRel, for spatial-temporal graph learning in this\npaper. After transforming the raw spatial-temporal data into high quality\nfeatures, the core Dynamic Inter-Intra Graph (DIIG) module in the framework is\nable to use these features to generate the spatial-temporal graphs capable of\ncapturing the hidden topological and long-term temporal correlation information\nin these graphs. To improve the model generalization ability and performance\nwhile preserving the local data privacy, we also design a relevance-driven\nfederated learning module in our framework to leverage diverse data\ndistributions from different participants with attentive aggregations of their\nmodels.\n","authors":["Tiehua Zhang","Yuze Liu","Zhishu Shen","Rui Xu","Xin Chen","Xiaowei Huang","Xi Zheng"],"pdf_url":"https://arxiv.org/pdf/2206.03420v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.07694v1","updated":"2023-09-14T13:14:51Z","published":"2023-09-14T13:14:51Z","title":"Tree of Uncertain Thoughts Reasoning for Large Language Models","summary":" While the recently introduced Tree of Thoughts (ToT) has heralded\nadvancements in allowing Large Language Models (LLMs) to reason through\nforesight and backtracking for global decision-making, it has overlooked the\ninherent local uncertainties in intermediate decision points or \"thoughts\".\nThese local uncertainties, intrinsic to LLMs given their potential for diverse\nresponses, remain a significant concern in the reasoning process. 
Addressing\nthis pivotal gap, we introduce the Tree of Uncertain Thoughts (TouT) - a\nreasoning framework tailored for LLMs. Our TouT effectively leverages Monte\nCarlo Dropout to quantify uncertainty scores associated with LLMs' diverse\nlocal responses at these intermediate steps. By marrying this local uncertainty\nquantification with global search algorithms, TouT enhances the model's\nprecision in response generation. We substantiate our approach with rigorous\nexperiments on two demanding planning tasks: Game of 24 and Mini Crosswords.\nThe empirical evidence underscores TouT's superiority over both ToT and\nchain-of-thought prompting methods.\n","authors":["Shentong Mo","Miao Xin"],"pdf_url":"https://arxiv.org/pdf/2309.07694v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2203.16874v2","updated":"2023-09-14T13:08:52Z","published":"2022-03-31T08:06:29Z","title":"An Optimal Control Method to Compute the Most Likely Transition Path for\n Stochastic Dynamical Systems with Jumps","summary":" Many complex real world phenomena exhibit abrupt, intermittent or jumping\nbehaviors, which are more suitable to be described by stochastic differential\nequations under non-Gaussian L\\'evy noise. Among these complex phenomena, the\nmost likely transition paths between metastable states are important since\nthese rare events may have a high impact in certain scenarios. Based on the\nlarge deviation principle, the most likely transition path could be treated as\nthe minimizer of the rate function upon paths that connect two points. One of\nthe challenges to calculate the most likely transition path for stochastic\ndynamical systems under non-Gaussian L\\'evy noise is that the associated rate\nfunction can not be explicitly expressed by paths. For this reason, we\nformulate an optimal control problem to obtain the optimal state as the most\nlikely transition path. We then develop a neural network method to solve this\nissue. Several experiments are investigated for both Gaussian and non-Gaussian\ncases.\n","authors":["Wei Wei","Ting Gao","Jinqiao Duan","Xiaoli Chen"],"pdf_url":"https://arxiv.org/pdf/2203.16874v2.pdf","comment":"17 pages, 12 figures"},{"id":"http://arxiv.org/abs/2309.07690v1","updated":"2023-09-14T13:07:36Z","published":"2023-09-14T13:07:36Z","title":"A DenseNet-based method for decoding auditory spatial attention with EEG","summary":" Auditory spatial attention detection (ASAD) aims to decode the attended\nspatial location with EEG in a multiple-speaker setting. ASAD methods are\ninspired by the brain lateralization of cortical neural responses during the\nprocessing of auditory spatial attention, and show promising performance for\nthe task of auditory attention decoding (AAD) with neural recordings. In the\nprevious ASAD methods, the spatial distribution of EEG electrodes is not fully\nexploited, which may limit the performance of these methods. In the present\nwork, by transforming the original EEG channels into a two-dimensional (2D)\nspatial topological map, the EEG data is transformed into a three-dimensional\n(3D) arrangement containing spatial-temporal information. And then a 3D deep\nconvolutional neural network (DenseNet-3D) is used to extract temporal and\nspatial features of the neural representation for the attended locations. 
The\nresults show that the proposed method achieves higher decoding accuracy than\nthe state-of-the-art (SOTA) method (94.4% compared to XANet's 90.6%) with\n1-second decision window for the widely used KULeuven (KUL) dataset, and the\ncode to implement our work is available on Github:\n https://github.com/xuxiran/ASAD_DenseNet\n","authors":["Xiran Xu","Bo Wang","Yujie Yan","Xihong Wu","Jing Chen"],"pdf_url":"https://arxiv.org/pdf/2309.07690v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.07684v1","updated":"2023-09-14T12:58:40Z","published":"2023-09-14T12:58:40Z","title":"deepFDEnet: A Novel Neural Network Architecture for Solving Fractional\n Differential Equations","summary":" The primary goal of this research is to propose a novel architecture for a\ndeep neural network that can solve fractional differential equations\naccurately. A Gaussian integration rule and a $L_1$ discretization technique\nare used in the proposed design. In each equation, a deep neural network is\nused to approximate the unknown function. Three forms of fractional\ndifferential equations have been examined to highlight the method's\nversatility: a fractional ordinary differential equation, a fractional order\nintegrodifferential equation, and a fractional order partial differential\nequation. The results show that the proposed architecture solves different\nforms of fractional differential equations with excellent precision.\n","authors":["Ali Nosrati Firoozsalari","Hassan Dana Mazraeh","Alireza Afzal Aghaei","Kourosh Parand"],"pdf_url":"https://arxiv.org/pdf/2309.07684v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.07679v1","updated":"2023-09-14T12:45:20Z","published":"2023-09-14T12:45:20Z","title":"Benchmarking machine learning models for quantum state classification","summary":" Quantum computing is a growing field where the information is processed by\ntwo-levels quantum states known as qubits. Current physical realizations of\nqubits require a careful calibration, composed by different experiments, due to\nnoise and decoherence phenomena. Among the different characterization\nexperiments, a crucial step is to develop a model to classify the measured\nstate by discriminating the ground state from the excited state. In this\nproceedings we benchmark multiple classification techniques applied to real\nquantum devices.\n","authors":["Edoardo Pedicillo","Andrea Pasquale","Stefano Carrazza"],"pdf_url":"https://arxiv.org/pdf/2309.07679v1.pdf","comment":"9 pages, 3 figures, CHEP2023 proceedings"},{"id":"http://arxiv.org/abs/2309.07675v1","updated":"2023-09-14T12:39:26Z","published":"2023-09-14T12:39:26Z","title":"Goal Space Abstraction in Hierarchical Reinforcement Learning via\n Set-Based Reachability Analysis","summary":" Open-ended learning benefits immensely from the use of symbolic methods for\ngoal representation as they offer ways to structure knowledge for efficient and\ntransferable learning. However, the existing Hierarchical Reinforcement\nLearning (HRL) approaches relying on symbolic reasoning are often limited as\nthey require a manual goal representation. The challenge in autonomously\ndiscovering a symbolic goal representation is that it must preserve critical\ninformation, such as the environment dynamics. In this paper, we propose a\ndevelopmental mechanism for goal discovery via an emergent representation that\nabstracts (i.e., groups together) sets of environment states that have similar\nroles in the task. 
We introduce a Feudal HRL algorithm that concurrently learns\nboth the goal representation and a hierarchical policy. The algorithm uses\nsymbolic reachability analysis for neural networks to approximate the\ntransition relation among sets of states and to refine the goal representation.\nWe evaluate our approach on complex navigation tasks, showing the learned\nrepresentation is interpretable, transferrable and results in data efficient\nlearning.\n","authors":["Mehdi Zadem","Sergio Mover","Sao Mai Nguyen"],"pdf_url":"https://arxiv.org/pdf/2309.07675v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2110.12539v3","updated":"2023-09-14T12:34:51Z","published":"2021-10-24T22:15:01Z","title":"Discrete Acoustic Space for an Efficient Sampling in Neural\n Text-To-Speech","summary":" We present a Split Vector Quantized Variational Autoencoder (SVQ-VAE)\narchitecture using a split vector quantizer for NTTS, as an enhancement to the\nwell-known Variational Autoencoder (VAE) and Vector Quantized Variational\nAutoencoder (VQ-VAE) architectures. Compared to these previous architectures,\nour proposed model retains the benefits of using an utterance-level bottleneck,\nwhile keeping significant representation power and a discretized latent space\nsmall enough for efficient prediction from text. We train the model on\nrecordings in the expressive task-oriented dialogues domain and show that\nSVQ-VAE achieves a statistically significant improvement in naturalness over\nthe VAE and VQ-VAE models. Furthermore, we demonstrate that the SVQ-VAE latent\nacoustic space is predictable from text, reducing the gap between the standard\nconstant vector synthesis and vocoded recordings by 32%.\n","authors":["Marek Strong","Jonas Rohnke","Antonio Bonafonte","Mateusz Łajszczak","Trevor Wood"],"pdf_url":"https://arxiv.org/pdf/2110.12539v3.pdf","comment":"5 pages, 5 figures, accepted at IberSPEECH 2022"},{"id":"http://arxiv.org/abs/2309.07672v1","updated":"2023-09-14T12:34:42Z","published":"2023-09-14T12:34:42Z","title":"Physics-constrained robust learning of open-form PDEs from limited and\n noisy data","summary":" Unveiling the underlying governing equations of nonlinear dynamic systems\nremains a significant challenge, especially when encountering noisy\nobservations and no prior knowledge available. This study proposes R-DISCOVER,\na framework designed to robustly uncover open-form partial differential\nequations (PDEs) from limited and noisy data. The framework operates through\ntwo alternating update processes: discovering and embedding. The discovering\nphase employs symbolic representation and a reinforcement learning (RL)-guided\nhybrid PDE generator to efficiently produce diverse open-form PDEs with tree\nstructures. A neural network-based predictive model fits the system response\nand serves as the reward evaluator for the generated PDEs. PDEs with superior\nfits are utilized to iteratively optimize the generator via the RL method and\nthe best-performing PDE is selected by a parameter-free stability metric. The\nembedding phase integrates the initially identified PDE from the discovering\nprocess as a physical constraint into the predictive model for robust training.\nThe traversal of PDE trees automates the construction of the computational\ngraph and the embedding process without human intervention. 
Numerical\nexperiments demonstrate our framework's capability to uncover governing\nequations from nonlinear dynamic systems with limited and highly noisy data and\noutperform other physics-informed neural network-based discovery methods. This\nwork opens new potential for exploring real-world systems with limited\nunderstanding.\n","authors":["Mengge Du","Longfeng Nie","Siyu Lou","Yuntian Chenc","Dongxiao Zhang"],"pdf_url":"https://arxiv.org/pdf/2309.07672v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.07670v1","updated":"2023-09-14T12:34:22Z","published":"2023-09-14T12:34:22Z","title":"Federated Dataset Dictionary Learning for Multi-Source Domain Adaptation","summary":" In this article, we propose an approach for federated domain adaptation, a\nsetting where distributional shift exists among clients and some have unlabeled\ndata. The proposed framework, FedDaDiL, tackles the resulting challenge through\ndictionary learning of empirical distributions. In our setting, clients'\ndistributions represent particular domains, and FedDaDiL collectively trains a\nfederated dictionary of empirical distributions. In particular, we build upon\nthe Dataset Dictionary Learning framework by designing collaborative\ncommunication protocols and aggregation operations. The chosen protocols keep\nclients' data private, thus enhancing overall privacy compared to its\ncentralized counterpart. We empirically demonstrate that our approach\nsuccessfully generates labeled data on the target domain with extensive\nexperiments on (i) Caltech-Office, (ii) TEP, and (iii) CWRU benchmarks.\nFurthermore, we compare our method to its centralized counterpart and other\nbenchmarks in federated domain adaptation.\n","authors":["Fabiola Espinosa Castellon","Eduardo Fernandes Montesuma","Fred Ngolè Mboula","Aurélien Mayoue","Antoine Souloumiac","Cédric Gouy-Pallier"],"pdf_url":"https://arxiv.org/pdf/2309.07670v1.pdf","comment":"7 pages,2 figures"},{"id":"http://arxiv.org/abs/2309.07666v1","updated":"2023-09-14T12:29:41Z","published":"2023-09-14T12:29:41Z","title":"Multi-Source Domain Adaptation meets Dataset Distillation through\n Dataset Dictionary Learning","summary":" In this paper, we consider the intersection of two problems in machine\nlearning: Multi-Source Domain Adaptation (MSDA) and Dataset Distillation (DD).\nOn the one hand, the first considers adapting multiple heterogeneous labeled\nsource domains to an unlabeled target domain. On the other hand, the second\nattacks the problem of synthesizing a small summary containing all the\ninformation about the datasets. We thus consider a new problem called MSDA-DD.\nTo solve it, we adapt previous works in the MSDA literature, such as\nWasserstein Barycenter Transport and Dataset Dictionary Learning, as well as DD\nmethod Distribution Matching. 
We thoroughly experiment with this novel problem\non four benchmarks (Caltech-Office 10, Tennessee-Eastman Process, Continuous\nStirred Tank Reactor, and Case Western Reserve University), where we show that,\neven with as little as 1 sample per class, one achieves state-of-the-art\nadaptation performance.\n","authors":["Eduardo Fernandes Montesuma","Fred Ngolè Mboula","Antoine Souloumiac"],"pdf_url":"https://arxiv.org/pdf/2309.07666v1.pdf","comment":"7 pages,4 figures"},{"id":"http://arxiv.org/abs/2309.07663v1","updated":"2023-09-14T12:27:17Z","published":"2023-09-14T12:27:17Z","title":"Dataset Size Dependence of Rate-Distortion Curve and Threshold of\n Posterior Collapse in Linear VAE","summary":" In the Variational Autoencoder (VAE), the variational posterior often aligns\nclosely with the prior, which is known as posterior collapse and hinders the\nquality of representation learning. To mitigate this problem, an adjustable\nhyperparameter beta has been introduced in the VAE. This paper presents a\nclosed-form expression to assess the relationship between the beta in VAE, the\ndataset size, the posterior collapse, and the rate-distortion curve by\nanalyzing a minimal VAE in a high-dimensional limit. These results clarify that\na long plateau in the generalization error emerges with a relatively larger\nbeta. As the beta increases, the length of the plateau extends and then becomes\ninfinite beyond a certain beta threshold. This implies that the choice of beta,\nunlike the usual regularization parameters, can induce posterior collapse\nregardless of the dataset size. Thus, beta is a risky parameter that requires\ncareful tuning. Furthermore, considering the dataset-size dependence on the\nrate-distortion curve, a relatively large dataset is required to obtain a\nrate-distortion curve with high rates. Extensive numerical experiments support\nour analysis.\n","authors":["Yuma Ichikawa","Koji Hukushima"],"pdf_url":"https://arxiv.org/pdf/2309.07663v1.pdf","comment":"16 pages, 3 figures"},{"id":"http://arxiv.org/abs/2304.12876v2","updated":"2023-09-14T12:13:59Z","published":"2023-04-25T14:48:58Z","title":"Evaluation of Parameter-based Attacks against Embedded Neural Networks\n with Laser Injection","summary":" Upcoming certification actions related to the security of machine learning\n(ML) based systems raise major evaluation challenges that are amplified by the\nlarge-scale deployment of models in many hardware platforms. Until recently,\nmost of research works focused on API-based attacks that consider a ML model as\na pure algorithmic abstraction. However, new implementation-based threats have\nbeen revealed, emphasizing the urgency to propose both practical and\nsimulation-based methods to properly evaluate the robustness of models. A major\nconcern is parameter-based attacks (such as the Bit-Flip Attack, BFA) that\nhighlight the lack of robustness of typical deep neural network models when\nconfronted by accurate and optimal alterations of their internal parameters\nstored in memory. Setting in a security testing purpose, this work practically\nreports, for the first time, a successful variant of the BFA on a 32-bit\nCortex-M microcontroller using laser fault injection. It is a standard fault\ninjection means for security evaluation, that enables to inject spatially and\ntemporally accurate faults. 
To avoid unrealistic brute-force strategies, we\nshow how simulations help selecting the most sensitive set of bits from the\nparameters taking into account the laser fault model.\n","authors":["Mathieu Dumont","Kevin Hector","Pierre-Alain Moellic","Jean-Max Dutertre","Simon Pontié"],"pdf_url":"https://arxiv.org/pdf/2304.12876v2.pdf","comment":"Accepted at 42nd International Conference on Computer Safety,\n Reliability and Security, SafeComp 2023"},{"id":"http://arxiv.org/abs/2306.15679v2","updated":"2023-09-14T12:07:40Z","published":"2023-06-19T15:35:19Z","title":"Generating Parametric BRDFs from Natural Language Descriptions","summary":" Artistic authoring of 3D environments is a laborious enterprise that also\nrequires skilled content creators. There have been impressive improvements in\nusing machine learning to address different aspects of generating 3D content,\nsuch as generating meshes, arranging geometry, synthesizing textures, etc. In\nthis paper we develop a model to generate Bidirectional Reflectance\nDistribution Functions (BRDFs) from descriptive textual prompts. BRDFs are four\ndimensional probability distributions that characterize the interaction of\nlight with surface materials. They are either represented parametrically, or by\ntabulating the probability density associated with every pair of incident and\noutgoing angles. The former lends itself to artistic editing while the latter\nis used when measuring the appearance of real materials. Numerous works have\nfocused on hypothesizing BRDF models from images of materials. We learn a\nmapping from textual descriptions of materials to parametric BRDFs. Our model\nis first trained using a semi-supervised approach before being tuned via an\nunsupervised scheme. Although our model is general, in this paper we\nspecifically generate parameters for MDL materials, conditioned on natural\nlanguage descriptions, within NVIDIA's Omniverse platform. This enables use\ncases such as real-time text prompts to change materials of objects in 3D\nenvironments such as \"dull plastic\" or \"shiny iron\". Since the output of our\nmodel is a parametric BRDF, rather than an image of the material, it may be\nused to render materials using any shape under arbitrarily specified viewing\nand lighting conditions.\n","authors":["Sean Memery","Osmar Cedron","Kartic Subr"],"pdf_url":"https://arxiv.org/pdf/2306.15679v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2009.01726v2","updated":"2023-09-14T11:51:06Z","published":"2020-09-03T15:04:27Z","title":"Survival Estimation for Missing not at Random Censoring Indicators based\n on Copula Models","summary":" In the presence of right-censored data with covariates, the conditional\nKaplan-Meier estimator (also known as the Beran estimator) consistently\nestimates the conditional survival function of the random follow-up for the\nevent of interest. However, a necessary condition is the unambiguous knowledge\nof whether each individual is censored or not, which may be incomplete in\npractice. We therefore propose a study of the Beran estimator when the\ncensoring indicators are generic random variables and discuss necessary\nconditions for the efficiency of the Beran estimator. From this, we provide a\nnew estimator for the conditional survival function with missing not at random\n(MNAR) censoring indicators based on a conditional copula model for the\nmissingness mechanism. 
In addition to the theoretical results, we illustrate\nhow the estimators work for small samples through a simulation study and show\ntheir practical applicability by analyzing synthetic and real data.\n","authors":["Mikael Escobar-Bach","Olivier Goudet"],"pdf_url":"https://arxiv.org/pdf/2009.01726v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.07610v1","updated":"2023-09-14T11:18:26Z","published":"2023-09-14T11:18:26Z","title":"Feature Engineering in Learning-to-Rank for Community Question Answering\n Task","summary":" Community question answering (CQA) forums are Internet-based platforms where\nusers ask questions about a topic and other expert users try to provide\nsolutions. Many CQA forums such as Quora, Stackoverflow, Yahoo!Answer,\nStackExchange exist with a lot of user-generated data. These data are leveraged\nin automated CQA ranking systems where similar questions (and answers) are\npresented in response to the query of the user. In this work, we empirically\ninvestigate a few aspects of this domain. Firstly, in addition to traditional\nfeatures like TF-IDF, BM25 etc., we introduce a BERT-based feature that\ncaptures the semantic similarity between the question and answer. Secondly,\nmost of the existing research works have focused on features extracted only\nfrom the question part; features extracted from answers have not been explored\nextensively. We combine both types of features in a linear fashion. Thirdly,\nusing our proposed concepts, we conduct an empirical investigation with\ndifferent rank-learning algorithms, some of which have not been used so far in\nCQA domain. On three standard CQA datasets, our proposed framework achieves\nstate-of-the-art performance. We also analyze importance of the features we use\nin our investigation. This work is expected to guide the practitioners to\nselect a better set of features for the CQA retrieval task.\n","authors":["Nafis Sajid","Md Rashidul Hasan","Muhammad Ibrahim"],"pdf_url":"https://arxiv.org/pdf/2309.07610v1.pdf","comment":"20 pages"},{"id":"http://arxiv.org/abs/2309.07609v1","updated":"2023-09-14T11:17:43Z","published":"2023-09-14T11:17:43Z","title":"Learning Quasi-Static 3D Models of Markerless Deformable Linear Objects\n for Bimanual Robotic Manipulation","summary":" The robotic manipulation of Deformable Linear Objects (DLOs) is a vital and\nchallenging task that is important in many practical applications. Classical\nmodel-based approaches to this problem require an accurate model to capture how\nrobot motions affect the deformation of the DLO. Nowadays, data-driven models\noffer the best tradeoff between quality and computation time. This paper\nanalyzes several learning-based 3D models of the DLO and proposes a new one\nbased on the Transformer architecture that achieves superior accuracy, even on\nthe DLOs of different lengths, thanks to the proposed scaling method. Moreover,\nwe introduce a data augmentation technique, which improves the prediction\nperformance of almost all considered DLO data-driven models. Thanks to this\ntechnique, even a simple Multilayer Perceptron (MLP) achieves close to\nstate-of-the-art performance while being significantly faster to evaluate. 
In\nthe experiments, we compare the performance of the learning-based 3D models of\nthe DLO on several challenging datasets quantitatively and demonstrate their\napplicability in the task of shaping a DLO.\n","authors":["Piotr Kicki","Michał Bidziński","Krzysztof Walas"],"pdf_url":"https://arxiv.org/pdf/2309.07609v1.pdf","comment":"Under review for IEEE Robotics and Automation Letters"},{"id":"http://arxiv.org/abs/2309.07602v1","updated":"2023-09-14T11:07:10Z","published":"2023-09-14T11:07:10Z","title":"Turning Dross Into Gold Loss: is BERT4Rec really better than SASRec?","summary":" Recently sequential recommendations and next-item prediction task has become\nincreasingly popular in the field of recommender systems. Currently, two\nstate-of-the-art baselines are Transformer-based models SASRec and BERT4Rec.\nOver the past few years, there have been quite a few publications comparing\nthese two algorithms and proposing new state-of-the-art models. In most of the\npublications, BERT4Rec achieves better performance than SASRec. But BERT4Rec\nuses cross-entropy over softmax for all items, while SASRec uses negative\nsampling and calculates binary cross-entropy loss for one positive and one\nnegative item. In our work, we show that if both models are trained with the\nsame loss, which is used by BERT4Rec, then SASRec will significantly outperform\nBERT4Rec both in terms of quality and training speed. In addition, we show that\nSASRec could be effectively trained with negative sampling and still outperform\nBERT4Rec, but the number of negative examples should be much larger than one.\n","authors":["Anton Klenitskiy","Alexey Vasilev"],"pdf_url":"https://arxiv.org/pdf/2309.07602v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.07601v1","updated":"2023-09-14T11:06:51Z","published":"2023-09-14T11:06:51Z","title":"Detecting Misinformation with LLM-Predicted Credibility Signals and Weak\n Supervision","summary":" Credibility signals represent a wide range of heuristics that are typically\nused by journalists and fact-checkers to assess the veracity of online content.\nAutomating the task of credibility signal extraction, however, is very\nchallenging as it requires high-accuracy signal-specific extractors to be\ntrained, while there are currently no sufficiently large datasets annotated\nwith all credibility signals. This paper investigates whether large language\nmodels (LLMs) can be prompted effectively with a set of 18 credibility signals\nto produce weak labels for each signal. We then aggregate these potentially\nnoisy labels using weak supervision in order to predict content veracity. We\ndemonstrate that our approach, which combines zero-shot LLM credibility signal\nlabeling and weak supervision, outperforms state-of-the-art classifiers on two\nmisinformation datasets without using any ground-truth labels for training. We\nalso analyse the contribution of the individual credibility signals towards\npredicting content veracity, which provides new valuable insights into their\nrole in misinformation detection.\n","authors":["João A. 
Leite","Olesya Razuvayevskaya","Kalina Bontcheva","Carolina Scarton"],"pdf_url":"https://arxiv.org/pdf/2309.07601v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.07593v1","updated":"2023-09-14T10:53:36Z","published":"2023-09-14T10:53:36Z","title":"Statistically Valid Variable Importance Assessment through Conditional\n Permutations","summary":" Variable importance assessment has become a crucial step in machine-learning\napplications when using complex learners, such as deep neural networks, on\nlarge-scale data. Removal-based importance assessment is currently the\nreference approach, particularly when statistical guarantees are sought to\njustify variable inclusion. It is often implemented with variable permutation\nschemes. On the flip side, these approaches risk misidentifying unimportant\nvariables as important in the presence of correlations among covariates. Here\nwe develop a systematic approach for studying Conditional Permutation\nImportance (CPI) that is model agnostic and computationally lean, as well as\nreusable benchmarks of state-of-the-art variable importance estimators. We show\ntheoretically and empirically that $\\textit{CPI}$ overcomes the limitations of\nstandard permutation importance by providing accurate type-I error control.\nWhen used with a deep neural network, $\\textit{CPI}$ consistently showed top\naccuracy across benchmarks. An empirical benchmark on real-world data analysis\nin a large-scale medical dataset showed that $\\textit{CPI}$ provides a more\nparsimonious selection of statistically significant variables. Our results\nsuggest that $\\textit{CPI}$ can be readily used as drop-in replacement for\npermutation-based methods.\n","authors":["Ahmad Chamma","Denis A. Engemann","Bertrand Thirion"],"pdf_url":"https://arxiv.org/pdf/2309.07593v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2304.07097v2","updated":"2023-09-14T10:43:14Z","published":"2023-04-14T12:36:43Z","title":"Interpretable Weighted Siamese Network to Predict the Time to Onset of\n Alzheimer's Disease from MRI Images","summary":" Alzheimer's Disease (AD) is a progressive disease preceded by Mild Cognitive\nImpairment (MCI). Early detection of AD is crucial for making treatment\ndecisions. However, most of the literature on computer-assisted detection of AD\nfocuses on classifying brain images into one of three major categories:\nhealthy, MCI, and AD; or categorizing MCI patients into (1) progressive: those\nwho progress from MCI to AD at a future examination time, and (2) stable: those\nwho stay as MCI and never progress to AD. This misses the opportunity to\naccurately identify the trajectory of progressive MCI patients. In this paper,\nwe revisit the brain image classification task for AD identification and\nre-frame it as an ordinal classification task to predict how close a patient is\nto the severe AD stage. To this end, we select progressive MCI patients from\nthe Alzheimer's Disease Neuroimaging Initiative (ADNI) dataset and construct an\nordinal dataset with a prediction target that indicates the time to progression\nto AD. We train a Siamese network model to predict the time to onset of AD\nbased on MRI brain images. We also propose a Weighted variety of Siamese\nnetwork and compare its performance to a baseline model. Our evaluations show\nthat incorporating a weighting factor to Siamese networks brings considerable\nperformance gain at predicting how close input brain MRI images are to\nprogressing to AD. 
Moreover, we complement our results with an interpretation\nof the learned embedding space of the Siamese networks using a model\nexplainability technique.\n","authors":["Misgina Tsighe Hagos","Niamh Belton","Ronan P. Killeen","Kathleen M. Curran","Brian Mac Namee"],"pdf_url":"https://arxiv.org/pdf/2304.07097v2.pdf","comment":"Accepted at the Specialist Group on Artificial Intelligence, SGAI\n 2023, conference"},{"id":"http://arxiv.org/abs/2309.07579v1","updated":"2023-09-14T10:23:43Z","published":"2023-09-14T10:23:43Z","title":"Structure-Preserving Transformers for Sequences of SPD Matrices","summary":" In recent years, Transformer-based auto-attention mechanisms have been\nsuccessfully applied to the analysis of a variety of context-reliant data\ntypes, from texts to images and beyond, including data from non-Euclidean\ngeometries. In this paper, we present such a mechanism, designed to classify\nsequences of Symmetric Positive Definite matrices while preserving their\nRiemannian geometry throughout the analysis. We apply our method to automatic\nsleep staging on timeseries of EEG-derived covariance matrices from a standard\ndataset, obtaining high levels of stage-wise performance.\n","authors":["Mathieu Seraphim","Alexis Lechervy","Florian Yger","Luc Brun","Olivier Etard"],"pdf_url":"https://arxiv.org/pdf/2309.07579v1.pdf","comment":"Submitted to the ICASSP 2024 Conference"},{"id":"http://arxiv.org/abs/2309.07578v1","updated":"2023-09-14T10:22:33Z","published":"2023-09-14T10:22:33Z","title":"Equivariant Data Augmentation for Generalization in Offline\n Reinforcement Learning","summary":" We present a novel approach to address the challenge of generalization in\noffline reinforcement learning (RL), where the agent learns from a fixed\ndataset without any additional interaction with the environment. Specifically,\nwe aim to improve the agent's ability to generalize to out-of-distribution\ngoals. To achieve this, we propose to learn a dynamics model and check if it is\nequivariant with respect to a fixed type of transformation, namely translations\nin the state space. We then use an entropy regularizer to increase the\nequivariant set and augment the dataset with the resulting transformed samples.\nFinally, we learn a new policy offline based on the augmented dataset, with an\noff-the-shelf offline RL algorithm. Our experimental results demonstrate that\nour approach can greatly improve the test performance of the policy on the\nconsidered environments.\n","authors":["Cristina Pinneri","Sarah Bechtle","Markus Wulfmeier","Arunkumar Byravan","Jingwei Zhang","William F. Whitney","Martin Riedmiller"],"pdf_url":"https://arxiv.org/pdf/2309.07578v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.07550v1","updated":"2023-09-14T09:26:03Z","published":"2023-09-14T09:26:03Z","title":"Naturalistic Robot Arm Trajectory Generation via Representation Learning","summary":" The integration of manipulator robots in household environments suggests a\nneed for more predictable and human-like robot motion. This holds especially\ntrue for wheelchair-mounted assistive robots that can support the independence\nof people with paralysis. One method of generating naturalistic motion\ntrajectories is via the imitation of human demonstrators. This paper explores a\nself-supervised imitation learning method using an autoregressive\nspatio-temporal graph neural network for an assistive drinking task. 
We address\nlearning from diverse human motion trajectory data that were captured via\nwearable IMU sensors on a human arm as the action-free task demonstrations.\nObserved arm motion data from several participants is used to generate natural\nand functional drinking motion trajectories for a UR5e robot arm.\n","authors":["Jayjun Lee","Adam J. Spiers"],"pdf_url":"https://arxiv.org/pdf/2309.07550v1.pdf","comment":"4 pages, 3 figures"},{"id":"http://arxiv.org/abs/2309.00855v3","updated":"2023-09-14T09:24:28Z","published":"2023-09-02T08:01:32Z","title":"DoRA: Domain-Based Self-Supervised Learning Framework for Low-Resource\n Real Estate Appraisal","summary":" The marketplace system connecting demands and supplies has been explored to\ndevelop unbiased decision-making in valuing properties. Real estate appraisal\nserves as one of the high-cost property valuation tasks for financial\ninstitutions since it requires domain experts to appraise the estimation based\non the corresponding knowledge and the judgment of the market. Existing\nautomated valuation models reducing the subjectivity of domain experts require\na large number of transactions for effective evaluation, which is predominantly\nlimited to not only the labeling efforts of transactions but also the\ngeneralizability of new developing and rural areas. To learn representations\nfrom unlabeled real estate sets, existing self-supervised learning (SSL) for\ntabular data neglects various important features, and fails to incorporate\ndomain knowledge. In this paper, we propose DoRA, a Domain-based\nself-supervised learning framework for low-resource Real estate Appraisal. DoRA\nis pre-trained with an intra-sample geographic prediction as the pretext task\nbased on the metadata of the real estate for equipping the real estate\nrepresentations with prior domain knowledge. Furthermore, inter-sample\ncontrastive learning is employed to generalize the representations to be robust\nfor limited transactions of downstream tasks. Our benchmark results on three\nproperty types of real-world transactions show that DoRA significantly\noutperforms the SSL baselines for tabular data, the graph-based methods, and\nthe supervised approaches in the few-shot scenarios by at least 7.6% for MAPE,\n11.59% for MAE, and 3.34% for HR10%. We expect DoRA to be useful to other\nfinancial practitioners with similar marketplace applications who need general\nmodels for properties that are newly built and have limited records. The source\ncode is available at https://github.com/wwweiwei/DoRA.\n","authors":["Wei-Wei Du","Wei-Yao Wang","Wen-Chih Peng"],"pdf_url":"https://arxiv.org/pdf/2309.00855v3.pdf","comment":"Accepted by CIKM 2023"},{"id":"http://arxiv.org/abs/2309.07548v1","updated":"2023-09-14T09:20:21Z","published":"2023-09-14T09:20:21Z","title":"Proximal Bellman mappings for reinforcement learning and their\n application to robust adaptive filtering","summary":" This paper aims at the algorithmic/theoretical core of reinforcement learning\n(RL) by introducing the novel class of proximal Bellman mappings. These\nmappings are defined in reproducing kernel Hilbert spaces (RKHSs), to benefit\nfrom the rich approximation properties and inner product of RKHSs, they are\nshown to belong to the powerful Hilbertian family of (firmly) nonexpansive\nmappings, regardless of the values of their discount factors, and possess ample\ndegrees of design freedom to even reproduce attributes of the classical Bellman\nmappings and to pave the way for novel RL designs. 
An approximate\npolicy-iteration scheme is built on the proposed class of mappings to solve the\nproblem of selecting online, at every time instance, the \"optimal\" exponent $p$\nin a $p$-norm loss to combat outliers in linear adaptive filtering, without\ntraining data and any knowledge on the statistical properties of the outliers.\nNumerical tests on synthetic data showcase the superior performance of the\nproposed framework over several non-RL and kernel-based RL schemes.\n","authors":["Yuki Akiyama","Konstantinos Slavakis"],"pdf_url":"https://arxiv.org/pdf/2309.07548v1.pdf","comment":"arXiv admin note: text overlap with arXiv:2210.11755"},{"id":"http://arxiv.org/abs/2309.07544v1","updated":"2023-09-14T09:15:34Z","published":"2023-09-14T09:15:34Z","title":"VerilogEval: Evaluating Large Language Models for Verilog Code\n Generation","summary":" The increasing popularity of large language models (LLMs) has paved the way\nfor their application in diverse domains. This paper proposes a benchmarking\nframework tailored specifically for evaluating LLM performance in the context\nof Verilog code generation for hardware design and verification. We present a\ncomprehensive evaluation dataset consisting of 156 problems from the Verilog\ninstructional website HDLBits. The evaluation set consists of a diverse set of\nVerilog code generation tasks, ranging from simple combinational circuits to\ncomplex finite state machines. The Verilog code completions can be\nautomatically tested for functional correctness by comparing the transient\nsimulation outputs of the generated design with a golden solution. We also\ndemonstrate that the Verilog code generation capability of pretrained language\nmodels could be improved with supervised fine-tuning by bootstrapping with LLM\ngenerated synthetic problem-code pairs.\n","authors":["Mingjie Liu","Nathaniel Pinckney","Brucek Khailany","Haoxing Ren"],"pdf_url":"https://arxiv.org/pdf/2309.07544v1.pdf","comment":"ICCAD 2023 Invited Paper"},{"id":"http://arxiv.org/abs/2309.07030v2","updated":"2023-09-14T09:07:44Z","published":"2023-09-13T15:36:39Z","title":"Optimal transport distances for directed, weighted graphs: a case study\n with cell-cell communication networks","summary":" Comparing graphs by means of optimal transport has recently gained\nsignificant attention, as the distances induced by optimal transport provide\nboth a principled metric between graphs as well as an interpretable description\nof the associated changes between graphs in terms of a transport plan. As the\nlack of symmetry introduces challenges in the typically considered\nformulations, optimal transport distances for graphs have mostly been developed\nfor undirected graphs. Here, we propose two distance measures to compare\ndirected graphs based on variants of optimal transport: (i) an earth movers\ndistance (Wasserstein) and (ii) a Gromov-Wasserstein (GW) distance. We evaluate\nthese two distances and discuss their relative performance for both simulated\ngraph data and real-world directed cell-cell communication graphs, inferred\nfrom single-cell RNA-seq data.\n","authors":["James S. Nagai","Ivan G. Costa","Michael T. 
Schaub"],"pdf_url":"https://arxiv.org/pdf/2309.07030v2.pdf","comment":"5 pages, 1 figure"},{"id":"http://arxiv.org/abs/2309.07530v1","updated":"2023-09-14T08:56:31Z","published":"2023-09-14T08:56:31Z","title":"Adaptive approximation of monotone functions","summary":" We study the classical problem of approximating a non-decreasing function $f:\n\\mathcal{X} \\to \\mathcal{Y}$ in $L^p(\\mu)$ norm by sequentially querying its\nvalues, for known compact real intervals $\\mathcal{X}$, $\\mathcal{Y}$ and a\nknown probability measure $\\mu$ on $\\cX$. For any function~$f$ we characterize\nthe minimum number of evaluations of $f$ that algorithms need to guarantee an\napproximation $\\hat{f}$ with an $L^p(\\mu)$ error below $\\epsilon$ after\nstopping. Unlike worst-case results that hold uniformly over all $f$, our\ncomplexity measure is dependent on each specific function $f$. To address this\nproblem, we introduce GreedyBox, a generalization of an algorithm originally\nproposed by Novak (1992) for numerical integration. We prove that GreedyBox\nachieves an optimal sample complexity for any function $f$, up to logarithmic\nfactors. Additionally, we uncover results regarding piecewise-smooth functions.\nPerhaps as expected, the $L^p(\\mu)$ error of GreedyBox decreases much faster\nfor piecewise-$C^2$ functions than predicted by the algorithm (without any\nknowledge on the smoothness of $f$). A simple modification even achieves\noptimal minimax approximation rates for such functions, which we compute\nexplicitly. In particular, our findings highlight multiple performance gaps\nbetween adaptive and non-adaptive algorithms, smooth and piecewise-smooth\nfunctions, as well as monotone or non-monotone functions. Finally, we provide\nnumerical experiments to support our theoretical results.\n","authors":["Pierre Gaillard","Sébastien Gerchinovitz","Étienne de Montbrun"],"pdf_url":"https://arxiv.org/pdf/2309.07530v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.07526v1","updated":"2023-09-14T08:49:35Z","published":"2023-09-14T08:49:35Z","title":"Learning Beyond Similarities: Incorporating Dissimilarities between\n Positive Pairs in Self-Supervised Time Series Learning","summary":" By identifying similarities between successive inputs, Self-Supervised\nLearning (SSL) methods for time series analysis have demonstrated their\neffectiveness in encoding the inherent static characteristics of temporal data.\nHowever, an exclusive emphasis on similarities might result in representations\nthat overlook the dynamic attributes critical for modeling cardiovascular\ndiseases within a confined subject cohort. Introducing Distilled Encoding\nBeyond Similarities (DEBS), this paper pioneers an SSL approach that transcends\nmere similarities by integrating dissimilarities among positive pairs. The\nframework is applied to electrocardiogram (ECG) signals, leading to a notable\nenhancement of +10\\% in the detection accuracy of Atrial Fibrillation (AFib)\nacross diverse subjects. DEBS underscores the potential of attaining a more\nrefined representation by encoding the dynamic characteristics of time series\ndata, tapping into dissimilarities during the optimization process. 
Broadly,\nthe strategy delineated in this study holds the promise of unearthing novel\navenues for advancing SSL methodologies tailored to temporal data.\n","authors":["Adrian Atienza","Jakob Bardram","Sadasivan Puthusserypady"],"pdf_url":"https://arxiv.org/pdf/2309.07526v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.05969v2","updated":"2023-09-14T08:42:33Z","published":"2023-08-11T07:07:21Z","title":"Learning nonparametric DAGs with incremental information via high-order\n HSIC","summary":" Score-based methods for learning Bayesian networks (BN) aim to maximize the\nglobal score functions. However, if local variables have direct and indirect\ndependence simultaneously, the global optimization on score functions misses\nedges between variables with an indirect dependence relationship, whose scores\nare smaller than those with a direct dependence relationship. In this paper, we\npresent an identifiability condition based on a determined subset of parents to\nidentify the underlying DAG. Using this identifiability condition, we develop a\ntwo-phase algorithm, namely the optimal-tuning (OT) algorithm, to locally amend the\nglobal optimization. In the optimal phase, an optimization problem based on the\nfirst-order Hilbert-Schmidt independence criterion (HSIC) gives an estimated\nskeleton as the initial determined parents subset. In the tuning phase, the\nskeleton is locally tuned by deletion, addition and DAG-formalization\nstrategies using the theoretically proven incremental properties of high-order\nHSIC. Numerical experiments for different synthetic datasets and real-world\ndatasets show that the OT algorithm outperforms existing methods. In particular, in the\nSigmoid Mix model with a graph size of ${\\rm\\bf d=40}$, the\nstructural intervention distance (SID) of the OT algorithm is 329.7 smaller than\nthe one obtained by CAM, which indicates that the graph estimated by the OT\nalgorithm misses fewer edges compared with CAM. Source code of the OT algorithm\nis available at https://github.com/YafeiannWang/optimal-tune-algorithm.\n","authors":["Yafei Wang","Jianguo Liu"],"pdf_url":"https://arxiv.org/pdf/2308.05969v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2306.07937v2","updated":"2023-09-14T08:23:11Z","published":"2023-05-31T07:36:45Z","title":"Gibbs-Duhem-Informed Neural Networks for Binary Activity Coefficient\n Prediction","summary":" We propose Gibbs-Duhem-informed neural networks for the prediction of binary\nactivity coefficients at varying compositions. That is, we include the\nGibbs-Duhem equation explicitly in the loss function for training neural\nnetworks, which is straightforward in standard machine learning (ML) frameworks\nenabling automatic differentiation. In contrast to recent hybrid ML approaches,\nour approach does not rely on embedding a specific thermodynamic model inside\nthe neural network and corresponding prediction limitations. Rather,\nGibbs-Duhem consistency serves as regularization, with the flexibility of ML\nmodels being preserved. Our results show increased thermodynamic consistency\nand generalization capabilities for activity coefficient predictions by\nGibbs-Duhem-informed graph neural networks and matrix completion methods. We\nalso find that the model architecture, particularly the activation function,\ncan have a strong influence on the prediction quality. The approach can be\neasily extended to account for other thermodynamic consistency conditions.\n","authors":["Jan G. Rittig","Kobi C. Felton","Alexei A. 
Lapkin","Alexander Mitsos"],"pdf_url":"https://arxiv.org/pdf/2306.07937v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2202.03609v5","updated":"2023-09-14T08:15:55Z","published":"2022-02-08T02:49:09Z","title":"PolicyCleanse: Backdoor Detection and Mitigation in Reinforcement\n Learning","summary":" While real-world applications of reinforcement learning are becoming popular,\nthe security and robustness of RL systems are worthy of more attention and\nexploration. In particular, recent works have revealed that, in a multi-agent\nRL environment, backdoor trigger actions can be injected into a victim agent\n(a.k.a. Trojan agent), which can result in a catastrophic failure as soon as it\nsees the backdoor trigger action. To ensure the security of RL agents against\nmalicious backdoors, in this work, we propose the problem of Backdoor Detection\nin a multi-agent competitive reinforcement learning system, with the objective\nof detecting Trojan agents as well as the corresponding potential trigger\nactions, and further trying to mitigate their Trojan behavior. In order to\nsolve this problem, we propose PolicyCleanse that is based on the property that\nthe activated Trojan agents accumulated rewards degrade noticeably after\nseveral timesteps. Along with PolicyCleanse, we also design a machine\nunlearning-based approach that can effectively mitigate the detected backdoor.\nExtensive experiments demonstrate that the proposed methods can accurately\ndetect Trojan agents, and outperform existing backdoor mitigation baseline\napproaches by at least 3% in winning rate across various types of agents and\nenvironments.\n","authors":["Junfeng Guo","Ang Li","Cong Liu"],"pdf_url":"https://arxiv.org/pdf/2202.03609v5.pdf","comment":"Accepted by ICCV 2023"},{"id":"http://arxiv.org/abs/2210.05845v5","updated":"2023-09-14T07:55:59Z","published":"2022-10-12T00:35:45Z","title":"ConSpec: honing in on critical steps for rapid learning and\n generalization in RL","summary":" In real life, success is often contingent upon multiple critical steps that\nare distant in time from each other and from the final reward. These critical\nsteps are challenging to identify with traditional reinforcement learning (RL)\nmethods that rely on the Bellman equation for credit assignment. Here, we\npresent a new RL algorithm that uses offline contrastive learning to hone in on\ncritical steps. This algorithm, which we call contrastive introspection\n(ConSpec), can be added to any existing RL algorithm. ConSpec learns a set of\nprototypes for the critical steps in a task by a novel contrastive loss and\ndelivers an intrinsic reward when the current state matches one of these\nprototypes. The prototypes in ConSpec provide two key benefits for credit\nassignment: (1) They enable rapid identification of all the critical steps. (2)\nThey do so in a readily interpretable manner, enabling out-of-distribution\ngeneralization when sensory features are altered. Distinct from other\ncontemporary RL approaches to credit assignment, ConSpec takes advantage of the\nfact that it is easier to retrospectively identify the small set of steps that\nsuccess is contingent upon than it is to prospectively predict reward at every\nstep taken in the environment. 
Altogether, ConSpec improves learning in a\ndiverse set of RL tasks, including both those with explicit, discrete critical\nsteps and those with complex, continuous critical steps.\n","authors":["Chen Sun","Wannan Yang","Thomas Jiralerspong","Dane Malenfant","Benjamin Alsbury-Nealy","Yoshua Bengio","Blake Richards"],"pdf_url":"https://arxiv.org/pdf/2210.05845v5.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.07486v1","updated":"2023-09-14T07:53:52Z","published":"2023-09-14T07:53:52Z","title":"Massively-Parallel Heat Map Sorting and Applications To Explainable\n Clustering","summary":" Given a set of points labeled with $k$ labels, we introduce the heat map\nsorting problem as reordering and merging the points and dimensions while\npreserving the clusters (labels). A cluster is preserved if it remains\nconnected, i.e., if it is not split into several clusters and no two clusters\nare merged.\n We prove the problem is NP-hard and we give a fixed-parameter algorithm with\na constant number of rounds in the massively parallel computation model, where\neach machine has a sublinear memory and the total memory of the machines is\nlinear. We give an approximation algorithm for a NP-hard special case of the\nproblem. We empirically compare our algorithm with k-means and density-based\nclustering (DBSCAN) using a dimensionality reduction via locality-sensitive\nhashing on several directed and undirected graphs of email and computer\nnetworks.\n","authors":["Sepideh Aghamolaei","Mohammad Ghodsi"],"pdf_url":"https://arxiv.org/pdf/2309.07486v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.07481v1","updated":"2023-09-14T07:40:10Z","published":"2023-09-14T07:40:10Z","title":"Improved Auto-Encoding using Deterministic Projected Belief Networks","summary":" In this paper, we exploit the unique properties of a deterministic projected\nbelief network (D-PBN) to take full advantage of trainable compound activation\nfunctions (TCAs). A D-PBN is a type of auto-encoder that operates by \"backing\nup\" through a feed-forward neural network. TCAs are activation functions with\ncomplex monotonic-increasing shapes that change the distribution of the data so\nthat the linear transformation that follows is more effective. Because a D-PBN\noperates by \"backing up\", the TCAs are inverted in the reconstruction process,\nrestoring the original distribution of the data, thus taking advantage of a\ngiven TCA in both analysis and reconstruction. In this paper, we show that a\nD-PBN auto-encoder with TCAs can significantly out-perform standard\nauto-encoders including variational auto-encoders.\n","authors":["Paul M Baggenstoss"],"pdf_url":"https://arxiv.org/pdf/2309.07481v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2304.03778v3","updated":"2023-09-14T07:36:15Z","published":"2023-04-06T19:56:47Z","title":"Conformal Regression in Calorie Prediction for Team Jumbo-Visma","summary":" UCI WorldTour races, the premier men's elite road cycling tour, are grueling\nevents that put physical fitness and endurance of riders to the test. The\ncoaches of Team Jumbo-Visma have long been responsible for predicting the\nenergy needs of each rider of the Dutch team for every race on the calendar.\nThose must be estimated to ensure riders have the energy and resources\nnecessary to maintain a high level of performance throughout a race. This task,\nhowever, is both time-consuming and challenging, as it requires precise\nestimates of race speed and power output. 
Traditionally, the approach to\npredicting energy needs has relied on judgement and experience of coaches, but\nthis method has its limitations and often leads to inaccurate predictions. In\nthis paper, we propose a new, more effective approach to predicting energy\nneeds for cycling races. By predicting the speed and power with regression\nmodels, we provide the coaches with calorie needs estimates for each individual\nrider per stage instantly. In addition, we compare methods to quantify\nuncertainty using conformal prediction. The empirical analysis of the\njackknife+, jackknife-minmax, jackknife-minmax-after-bootstrap, CV+, CV-minmax,\nconformalized quantile regression, and inductive conformal prediction methods\nin conformal prediction reveals that all methods achieve valid prediction\nintervals. All but minmax-based methods also produce sufficiently narrow\nprediction intervals for decision-making. Furthermore, methods computing\nprediction intervals of fixed size produce tighter intervals for low\nsignificance values. Among the methods computing intervals of varying length\nacross the input space, inductive conformal prediction computes narrower\nprediction intervals at larger significance level.\n","authors":["Kristian van Kuijk","Mark Dirksen","Christof Seiler"],"pdf_url":"https://arxiv.org/pdf/2304.03778v3.pdf","comment":"11 pages, 5 figures"},{"id":"http://arxiv.org/abs/2309.07478v1","updated":"2023-09-14T07:35:14Z","published":"2023-09-14T07:35:14Z","title":"Direct Text to Speech Translation System using Acoustic Units","summary":" This paper proposes a direct text to speech translation system using discrete\nacoustic units. This framework employs text in different source languages as\ninput to generate speech in the target language without the need for text\ntranscriptions in this language. Motivated by the success of acoustic units in\nprevious works for direct speech to speech translation systems, we use the same\npipeline to extract the acoustic units using a speech encoder combined with a\nclustering algorithm. Once units are obtained, an encoder-decoder architecture\nis trained to predict them. Then a vocoder generates speech from units. Our\napproach for direct text to speech translation was tested on the new CVSS\ncorpus with two different text mBART models employed as initialisation. The\nsystems presented report competitive performance for most of the language pairs\nevaluated. Besides, results show a remarkable improvement when initialising our\nproposed architecture with a model pre-trained with more languages.\n","authors":["Victoria Mingote","Pablo Gimeno","Luis Vicente","Sameer Khurana","Antoine Laurent","Jarod Duret"],"pdf_url":"https://arxiv.org/pdf/2309.07478v1.pdf","comment":"5 pages, 4 figures"},{"id":"http://arxiv.org/abs/2112.07611v3","updated":"2023-09-14T07:16:08Z","published":"2021-12-14T18:03:43Z","title":"Speeding up Learning Quantum States through Group Equivariant\n Convolutional Quantum Ansätze","summary":" We develop a theoretical framework for $S_n$-equivariant convolutional\nquantum circuits with SU$(d)$-symmetry, building on and significantly\ngeneralizing Jordan's Permutational Quantum Computing (PQC) formalism based on\nSchur-Weyl duality connecting both SU$(d)$ and $S_n$ actions on qudits. In\nparticular, we utilize the Okounkov-Vershik approach to prove Harrow's\nstatement (Ph.D. 
Thesis 2005 p.160) on the equivalence between\n$\\operatorname{SU}(d)$ and $S_n$ irrep bases and to establish the\n$S_n$-equivariant Convolutional Quantum Alternating Ans\\\"atze ($S_n$-CQA) using\nYoung-Jucys-Murphy (YJM) elements. We prove that $S_n$-CQA is able to generate\nany unitary in any given $S_n$ irrep sector, which may serve as a universal\nmodel for a wide array of quantum machine learning problems with the presence\nof SU($d$) symmetry. Our method provides another way to prove the universality\nof Quantum Approximate Optimization Algorithm (QAOA) and verifies that 4-local\nSU($d$) symmetric unitaries are sufficient to build generic SU($d$) symmetric\nquantum circuits up to relative phase factors. We present numerical simulations\nto showcase the effectiveness of the ans\\\"atze to find the ground state energy\nof the $J_1$--$J_2$ antiferromagnetic Heisenberg model on the rectangular and\nKagome lattices. Our work provides the first application of the celebrated\nOkounkov-Vershik's $S_n$ representation theory to quantum physics and machine\nlearning, from which to propose quantum variational ans\\\"atze that strongly\nsuggests to be classically intractable tailored towards a specific optimization\nproblem.\n","authors":["Han Zheng","Zimu Li","Junyu Liu","Sergii Strelchuk","Risi Kondor"],"pdf_url":"https://arxiv.org/pdf/2112.07611v3.pdf","comment":"15 pages, 11 figures"},{"id":"http://arxiv.org/abs/2210.00305v3","updated":"2023-09-14T07:06:03Z","published":"2022-10-01T16:01:53Z","title":"LambdaKG: A Library for Pre-trained Language Model-Based Knowledge Graph\n Embeddings","summary":" Knowledge Graphs (KGs) often have two characteristics: heterogeneous graph\nstructure and text-rich entity/relation information. Text-based KG embeddings\ncan represent entities by encoding descriptions with pre-trained language\nmodels, but no open-sourced library is specifically designed for KGs with PLMs\nat present. In this paper, we present LambdaKG, a library for KGE that equips\nwith many pre-trained language models (e.g., BERT, BART, T5, GPT-3), and\nsupports various tasks (e.g., knowledge graph completion, question answering,\nrecommendation, and knowledge probing). LambdaKG is publicly open-sourced at\nhttps://github.com/zjunlp/PromptKG/tree/main/lambdaKG, with a demo video at\nhttp://deepke.zjukg.cn/lambdakg.mp4 and long-term maintenance.\n","authors":["Xin Xie","Zhoubo Li","Xiaohan Wang","Zekun Xi","Ningyu Zhang"],"pdf_url":"https://arxiv.org/pdf/2210.00305v3.pdf","comment":"AACL 2023 System Demonstrations, the project website is\n https://zjunlp.github.io/project/promptkg/"},{"id":"http://arxiv.org/abs/2309.07461v1","updated":"2023-09-14T06:41:45Z","published":"2023-09-14T06:41:45Z","title":"Detecting Unknown Attacks in IoT Environments: An Open Set Classifier\n for Enhanced Network Intrusion Detection","summary":" The widespread integration of Internet of Things (IoT) devices across all\nfacets of life has ushered in an era of interconnectedness, creating new\navenues for cybersecurity challenges and underscoring the need for robust\nintrusion detection systems. However, traditional security systems are designed\nwith a closed-world perspective and often face challenges in dealing with the\never-evolving threat landscape, where new and unfamiliar attacks are constantly\nemerging. In this paper, we introduce a framework aimed at mitigating the open\nset recognition (OSR) problem in the realm of Network Intrusion Detection\nSystems (NIDS) tailored for IoT environments. 
Our framework capitalizes on\nimage-based representations of packet-level data, extracting spatial and\ntemporal patterns from network traffic. Additionally, we integrate stacking and\nsub-clustering techniques, enabling the identification of unknown attacks by\neffectively modeling the complex and diverse nature of benign behavior. The\nempirical results prominently underscore the framework's efficacy, boasting an\nimpressive 88\\% detection rate for previously unseen attacks when compared\nagainst existing approaches and recent advancements. Future work will perform\nextensive experimentation across various openness levels and attack scenarios,\nfurther strengthening the adaptability and performance of our proposed solution\nin safeguarding IoT environments.\n","authors":["Yasir Ali Farrukh","Syed Wali","Irfan Khan","Nathaniel D. Bastian"],"pdf_url":"https://arxiv.org/pdf/2309.07461v1.pdf","comment":"6 Pages, 5 figures"},{"id":"http://arxiv.org/abs/2309.04824v2","updated":"2023-09-14T06:33:59Z","published":"2023-09-09T15:36:28Z","title":"Correcting sampling biases via importance reweighting for spatial\n modeling","summary":" In machine learning models, the estimation of errors is often complex due to\ndistribution bias, particularly in spatial data such as those found in\nenvironmental studies. We introduce an approach based on the ideas of\nimportance sampling to obtain an unbiased estimate of the target error. By\ntaking into account the difference between the desired error and the available data, our\nmethod reweights errors at each sample point and neutralizes the shift.\nAn importance sampling technique and kernel density estimation were used for\nreweighting. We validate the effectiveness of our approach using artificial\ndata that resemble real-world spatial datasets. Our findings demonstrate the\nadvantages of the proposed approach for the estimation of the target error,\noffering a solution to a distribution shift problem. The overall error of\npredictions dropped from 7% to just 2%, and it becomes smaller for larger samples.\n","authors":["Boris Prokhorov","Diana Koldasbayeva","Alexey Zaytsev"],"pdf_url":"https://arxiv.org/pdf/2309.04824v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.07453v1","updated":"2023-09-14T06:25:39Z","published":"2023-09-14T06:25:39Z","title":"SC-MAD: Mixtures of Higher-order Networks for Data Augmentation","summary":" The myriad complex systems with multiway interactions motivate the extension\nof graph-based pairwise connections to higher-order relations. In particular,\nthe simplicial complex has inspired generalizations of graph neural networks\n(GNNs) to simplicial complex-based models. Learning on such systems requires\nlarge amounts of data, which can be expensive or impossible to obtain. We\npropose data augmentation of simplicial complexes through both linear and\nnonlinear mixup mechanisms that return mixtures of existing labeled samples. In\naddition to traditional pairwise mixup, we present a convex clustering mixup\napproach for a data-driven relationship among several simplicial complexes. We\ntheoretically demonstrate that the resultant synthetic simplicial complexes\ninterpolate among existing data with respect to homomorphism densities. 
Our\nmethod is demonstrated on both synthetic and real-world datasets for simplicial\ncomplex classification.\n","authors":["Madeline Navarro","Santiago Segarra"],"pdf_url":"https://arxiv.org/pdf/2309.07453v1.pdf","comment":"5 pages, 1 figure, 1 table"},{"id":"http://arxiv.org/abs/2309.07452v1","updated":"2023-09-14T06:24:33Z","published":"2023-09-14T06:24:33Z","title":"Is Solving Graph Neural Tangent Kernel Equivalent to Training Graph\n Neural Network?","summary":" A rising trend in theoretical deep learning is to understand why deep\nlearning works through Neural Tangent Kernel (NTK) [jgh18], a kernel method\nthat is equivalent to using gradient descent to train a multi-layer\ninfinitely-wide neural network. NTK is a major step forward in the theoretical\ndeep learning because it allows researchers to use traditional mathematical\ntools to analyze properties of deep neural networks and to explain various\nneural network techniques from a theoretical view. A natural extension of NTK\non graph learning is \\textit{Graph Neural Tangent Kernel (GNTK)}, and\nresearchers have already provide GNTK formulation for graph-level regression\nand show empirically that this kernel method can achieve similar accuracy as\nGNNs on various bioinformatics datasets [dhs+19]. The remaining question now is\nwhether solving GNTK regression is equivalent to training an infinite-wide\nmulti-layer GNN using gradient descent. In this paper, we provide three new\ntheoretical results. First, we formally prove this equivalence for graph-level\nregression. Second, we present the first GNTK formulation for node-level\nregression. Finally, we prove the equivalence for node-level regression.\n","authors":["Lianke Qin","Zhao Song","Baocheng Sun"],"pdf_url":"https://arxiv.org/pdf/2309.07452v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.07450v1","updated":"2023-09-14T06:22:48Z","published":"2023-09-14T06:22:48Z","title":"TensorFlow Chaotic Prediction and Blow Up","summary":" Predicting the dynamics of chaotic systems is one of the most challenging\ntasks for neural networks, and machine learning in general. Here we aim to\npredict the spatiotemporal chaotic dynamics of a high-dimensional non-linear\nsystem. In our attempt we use the TensorFlow library, representing the state of\nthe art for deep neural networks training and prediction. While our results are\nencouraging, and show that the dynamics of the considered system can be\npredicted for short time, we also indirectly discovered an unexpected and\nundesirable behavior of the TensorFlow library. More specifically, the longer\nterm prediction of the system's chaotic behavior quickly deteriorates and blows\nup due to the nondeterministic behavior of the TensorFlow library. Here we\nprovide numerical evidence of the short time prediction ability, and of the\nlonger term predictability blow up.\n","authors":["M. Andrecut"],"pdf_url":"https://arxiv.org/pdf/2309.07450v1.pdf","comment":"10 pages, 3 figures"},{"id":"http://arxiv.org/abs/2105.06031v2","updated":"2023-09-14T06:09:09Z","published":"2021-05-13T01:40:20Z","title":"Joint Community Detection and Rotational Synchronization via\n Semidefinite Programming","summary":" In the presence of heterogeneous data, where randomly rotated objects fall\ninto multiple underlying categories, it is challenging to simultaneously\nclassify them into clusters and synchronize them based on pairwise relations.\nThis gives rise to the joint problem of community detection and\nsynchronization. 
We propose a series of semidefinite relaxations, and prove\ntheir exact recovery when extending the celebrated stochastic block model to\nthis new setting where both rotations and cluster identities are to be\ndetermined. Numerical experiments demonstrate the efficacy of our proposed\nalgorithms and confirm our theoretical result which indicates a sharp phase\ntransition for exact recovery.\n","authors":["Yifeng Fan","Yuehaw Khoo","Zhizhen Zhao"],"pdf_url":"https://arxiv.org/pdf/2105.06031v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2212.09597v7","updated":"2023-09-14T05:36:15Z","published":"2022-12-19T16:32:42Z","title":"Reasoning with Language Model Prompting: A Survey","summary":" Reasoning, as an essential ability for complex problem-solving, can provide\nback-end support for various real-world applications, such as medical\ndiagnosis, negotiation, etc. This paper provides a comprehensive survey of\ncutting-edge research on reasoning with language model prompting. We introduce\nresearch works with comparisons and summaries and provide systematic resources\nto help beginners. We also discuss the potential reasons for emerging such\nreasoning abilities and highlight future research directions. Resources are\navailable at https://github.com/zjunlp/Prompt4ReasoningPapers (updated\nperiodically).\n","authors":["Shuofei Qiao","Yixin Ou","Ningyu Zhang","Xiang Chen","Yunzhi Yao","Shumin Deng","Chuanqi Tan","Fei Huang","Huajun Chen"],"pdf_url":"https://arxiv.org/pdf/2212.09597v7.pdf","comment":"ACL 2023, 24 pages, add references of theoretical analysis"},{"id":"http://arxiv.org/abs/1906.00331v9","updated":"2023-09-14T04:59:49Z","published":"2019-06-02T03:03:45Z","title":"On Gradient Descent Ascent for Nonconvex-Concave Minimax Problems","summary":" We consider nonconvex-concave minimax problems, $\\min_{\\mathbf{x}}\n\\max_{\\mathbf{y} \\in \\mathcal{Y}} f(\\mathbf{x}, \\mathbf{y})$, where $f$ is\nnonconvex in $\\mathbf{x}$ but concave in $\\mathbf{y}$ and $\\mathcal{Y}$ is a\nconvex and bounded set. One of the most popular algorithms for solving this\nproblem is the celebrated gradient descent ascent (GDA) algorithm, which has\nbeen widely used in machine learning, control theory and economics. Despite the\nextensive convergence results for the convex-concave setting, GDA with equal\nstepsize can converge to limit cycles or even diverge in a general setting. In\nthis paper, we present the complexity results on two-time-scale GDA for solving\nnonconvex-concave minimax problems, showing that the algorithm can find a\nstationary point of the function $\\Phi(\\cdot) := \\max_{\\mathbf{y} \\in\n\\mathcal{Y}} f(\\cdot, \\mathbf{y})$ efficiently. To the best our knowledge, this\nis the first nonasymptotic analysis for two-time-scale GDA in this setting,\nshedding light on its superior practical performance in training generative\nadversarial networks (GANs) and other real applications.\n","authors":["Tianyi Lin","Chi Jin","Michael I. Jordan"],"pdf_url":"https://arxiv.org/pdf/1906.00331v9.pdf","comment":"Accepted by ICML 2020; Fix an error in Proposition 4.11 and 4.12 by\n modifying Definition 4.10 and some typos"},{"id":"http://arxiv.org/abs/2304.14131v2","updated":"2023-09-14T04:38:21Z","published":"2023-04-27T12:26:04Z","title":"TempEE: Temporal-Spatial Parallel Transformer for Radar Echo\n Extrapolation Beyond Auto-Regression","summary":" Meteorological radar reflectivity data (i.e. radar echo) significantly\ninfluences precipitation prediction. 
It can facilitate accurate and expeditious\nforecasting of short-term heavy rainfall bypassing the need for complex\nNumerical Weather Prediction (NWP) models. In comparison to conventional\nmodels, Deep Learning (DL)-based radar echo extrapolation algorithms exhibit\nhigher effectiveness and efficiency. Nevertheless, the development of reliable\nand generalized echo extrapolation algorithm is impeded by three primary\nchallenges: cumulative error spreading, imprecise representation of sparsely\ndistributed echoes, and inaccurate description of non-stationary motion\nprocesses. To tackle these challenges, this paper proposes a novel radar echo\nextrapolation algorithm called Temporal-Spatial Parallel Transformer, referred\nto as TempEE. TempEE avoids using auto-regression and instead employs a\none-step forward strategy to prevent cumulative error spreading during the\nextrapolation process. Additionally, we propose the incorporation of a\nMulti-level Temporal-Spatial Attention mechanism to improve the algorithm's\ncapability of capturing both global and local information while emphasizing\ntask-related regions, including sparse echo representations, in an efficient\nmanner. Furthermore, the algorithm extracts spatio-temporal representations\nfrom continuous echo images using a parallel encoder to model the\nnon-stationary motion process for echo extrapolation. The superiority of our\nTempEE has been demonstrated in the context of the classic radar echo\nextrapolation task, utilizing a real-world dataset. Extensive experiments have\nfurther validated the efficacy and indispensability of various components\nwithin TempEE.\n","authors":["Shengchao Chen","Ting Shu","Huan Zhao","Guo Zhong","Xunlai Chen"],"pdf_url":"https://arxiv.org/pdf/2304.14131v2.pdf","comment":"Have been accepted by IEEE Transactions on Geoscience and Remote\n Sensing, see https://ieeexplore.ieee.org/document/10238744"},{"id":"http://arxiv.org/abs/2309.07418v1","updated":"2023-09-14T04:23:40Z","published":"2023-09-14T04:23:40Z","title":"A Fast Optimization View: Reformulating Single Layer Attention in LLM\n Based on Tensor and SVM Trick, and Solving It in Matrix Multiplication Time","summary":" Large language models (LLMs) have played a pivotal role in revolutionizing\nvarious facets of our daily existence. Solving attention regression is a\nfundamental task in optimizing LLMs. In this work, we focus on giving a\nprovable guarantee for the one-layer attention network objective function\n$L(X,Y) = \\sum_{j_0 = 1}^n \\sum_{i_0 = 1}^d ( \\langle \\langle \\exp(\n\\mathsf{A}_{j_0} x ) , {\\bf 1}_n \\rangle^{-1} \\exp( \\mathsf{A}_{j_0} x ), A_{3}\nY_{*,i_0} \\rangle - b_{j_0,i_0} )^2$. Here $\\mathsf{A} \\in \\mathbb{R}^{n^2\n\\times d^2}$ is Kronecker product between $A_1 \\in \\mathbb{R}^{n \\times d}$ and\n$A_2 \\in \\mathbb{R}^{n \\times d}$. $A_3$ is a matrix in $\\mathbb{R}^{n \\times\nd}$, $\\mathsf{A}_{j_0} \\in \\mathbb{R}^{n \\times d^2}$ is the $j_0$-th block of\n$\\mathsf{A}$. The $X, Y \\in \\mathbb{R}^{d \\times d}$ are variables we want to\nlearn. $B \\in \\mathbb{R}^{n \\times d}$ and $b_{j_0,i_0} \\in \\mathbb{R}$ is one\nentry at $j_0$-th row and $i_0$-th column of $B$, $Y_{*,i_0} \\in \\mathbb{R}^d$\nis the $i_0$-column vector of $Y$, and $x \\in \\mathbb{R}^{d^2}$ is the\nvectorization of $X$.\n In a multi-layer LLM network, the matrix $B \\in \\mathbb{R}^{n \\times d}$ can\nbe viewed as the output of a layer, and $A_1= A_2 = A_3 \\in \\mathbb{R}^{n\n\\times d}$ can be viewed as the input of a layer. 
The matrix version of $x$ can\nbe viewed as $QK^\\top$ and $Y$ can be viewed as $V$. We provide an iterative\ngreedy algorithm to train loss function $L(X,Y)$ up $\\epsilon$ that runs in\n$\\widetilde{O}( ({\\cal T}_{\\mathrm{mat}}(n,n,d) + {\\cal\nT}_{\\mathrm{mat}}(n,d,d) + d^{2\\omega}) \\log(1/\\epsilon) )$ time. Here ${\\cal\nT}_{\\mathrm{mat}}(a,b,c)$ denotes the time of multiplying $a \\times b$ matrix\nanother $b \\times c$ matrix, and $\\omega\\approx 2.37$ denotes the exponent of\nmatrix multiplication.\n","authors":["Yeqi Gao","Zhao Song","Weixin Wang","Junze Yin"],"pdf_url":"https://arxiv.org/pdf/2309.07418v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.07200v2","updated":"2023-09-14T03:55:02Z","published":"2023-08-14T15:10:29Z","title":"Neural Categorical Priors for Physics-Based Character Control","summary":" Recent advances in learning reusable motion priors have demonstrated their\neffectiveness in generating naturalistic behaviors. In this paper, we propose a\nnew learning framework in this paradigm for controlling physics-based\ncharacters with significantly improved motion quality and diversity over\nexisting state-of-the-art methods. The proposed method uses reinforcement\nlearning (RL) to initially track and imitate life-like movements from\nunstructured motion clips using the discrete information bottleneck, as adopted\nin the Vector Quantized Variational AutoEncoder (VQ-VAE). This structure\ncompresses the most relevant information from the motion clips into a compact\nyet informative latent space, i.e., a discrete space over vector quantized\ncodes. By sampling codes in the space from a trained categorical prior\ndistribution, high-quality life-like behaviors can be generated, similar to the\nusage of VQ-VAE in computer vision. Although this prior distribution can be\ntrained with the supervision of the encoder's output, it follows the original\nmotion clip distribution in the dataset and could lead to imbalanced behaviors\nin our setting. To address the issue, we further propose a technique named\nprior shifting to adjust the prior distribution using curiosity-driven RL. The\noutcome distribution is demonstrated to offer sufficient behavioral diversity\nand significantly facilitates upper-level policy learning for downstream tasks.\nWe conduct comprehensive experiments using humanoid characters on two\nchallenging downstream tasks, sword-shield striking and two-player boxing game.\nOur results demonstrate that the proposed framework is capable of controlling\nthe character to perform considerably high-quality movements in terms of\nbehavioral strategies, diversity, and realism. Videos, codes, and data are\navailable at https://tencent-roboticsx.github.io/NCP/.\n","authors":["Qingxu Zhu","He Zhang","Mengting Lan","Lei Han"],"pdf_url":"https://arxiv.org/pdf/2308.07200v2.pdf","comment":"Accepted to Transactions on Graphics (Proc. ACM SIGGRAPH ASIA 2023)"},{"id":"http://arxiv.org/abs/2204.10372v2","updated":"2023-09-14T03:53:31Z","published":"2022-04-21T19:03:17Z","title":"Model-free Learning of Regions of Attraction via Recurrent Sets","summary":" We consider the problem of learning an inner approximation of the region of\nattraction (ROA) of an asymptotically stable equilibrium point without an\nexplicit model of the dynamics. Rather than leveraging approximate models with\nbounded uncertainty to find a (robust) invariant set contained in the ROA, we\npropose to learn sets that satisfy a more relaxed notion of containment known\nas recurrence. 
We define a set to be $\\tau$-recurrent (resp. $k$-recurrent) if\nevery trajectory that starts within the set, returns to it after at most $\\tau$\nseconds (resp. $k$ steps). We show that under mild assumptions a\n$\\tau$-recurrent set containing a stable equilibrium must be a subset of its\nROA. We then leverage this property to develop algorithms that compute inner\napproximations of the ROA using counter-examples of recurrence that are\nobtained by sampling finite-length trajectories. Our algorithms process samples\nsequentially, which allow them to continue being executed even after an initial\noffline training stage. We further provide an upper bound on the number of\ncounter-examples used by the algorithm, and almost sure convergence guarantees.\n","authors":["Yue Shen","Maxim Bichuch","Enrique Mallada"],"pdf_url":"https://arxiv.org/pdf/2204.10372v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.07412v1","updated":"2023-09-14T03:36:01Z","published":"2023-09-14T03:36:01Z","title":"Advancing Regular Language Reasoning in Linear Recurrent Neural Networks","summary":" In recent studies, linear recurrent neural networks (LRNNs) have achieved\nTransformer-level performance in natural language modeling and long-range\nmodeling while offering rapid parallel training and constant inference costs.\nWith the resurged interest in LRNNs, we study whether they can learn the hidden\nrules in training sequences, such as the grammatical structures of regular\nlanguage. We theoretically analyze some existing LRNNs and discover their\nlimitations on regular language. Motivated by the analysis, we propose a new\nLRNN equipped with a block-diagonal and input-dependent transition matrix.\nExperiments suggest that the proposed model is the only LRNN that can perform\nlength extrapolation on regular language tasks such as Sum, Even Pair, and\nModular Arithmetic.\n","authors":["Ting-Han Fan","Ta-Chung Chi","Alexander I. Rudnicky"],"pdf_url":"https://arxiv.org/pdf/2309.07412v1.pdf","comment":"The first two authors contributed equally to this work"},{"id":"http://arxiv.org/abs/2309.07402v1","updated":"2023-09-14T03:15:57Z","published":"2023-09-14T03:15:57Z","title":"Semi-supervised Domain Adaptation on Graphs with Contrastive Learning\n and Minimax Entropy","summary":" Label scarcity in a graph is frequently encountered in real-world\napplications due to the high cost of data labeling. To this end,\nsemi-supervised domain adaptation (SSDA) on graphs aims to leverage the\nknowledge of a labeled source graph to aid in node classification on a target\ngraph with limited labels. SSDA tasks need to overcome the domain gap between\nthe source and target graphs. However, to date, this challenging research\nproblem has yet to be formally considered by the existing approaches designed\nfor cross-graph node classification. To tackle the SSDA problem on graphs, a\nnovel method called SemiGCL is proposed, which benefits from graph contrastive\nlearning and minimax entropy training. SemiGCL generates informative node\nrepresentations by contrasting the representations learned from a graph's local\nand global views. 
Additionally, SemiGCL is adversarially optimized with the\nentropy loss of unlabeled target nodes to reduce domain divergence.\nExperimental results on benchmark datasets demonstrate that SemiGCL outperforms\nthe state-of-the-art baselines on the SSDA tasks.\n","authors":["Jiaren Xiao","Quanyu Dai","Xiao Shen","Xiaochen Xie","Jing Dai","James Lam","Ka-Wai Kwok"],"pdf_url":"https://arxiv.org/pdf/2309.07402v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2202.05928v4","updated":"2023-09-14T02:58:37Z","published":"2022-02-11T23:04:00Z","title":"Benign Overfitting without Linearity: Neural Network Classifiers Trained\n by Gradient Descent for Noisy Linear Data","summary":" Benign overfitting, the phenomenon where interpolating models generalize well\nin the presence of noisy data, was first observed in neural network models\ntrained with gradient descent. To better understand this empirical observation,\nwe consider the generalization error of two-layer neural networks trained to\ninterpolation by gradient descent on the logistic loss following random\ninitialization. We assume the data comes from well-separated class-conditional\nlog-concave distributions and allow for a constant fraction of the training\nlabels to be corrupted by an adversary. We show that in this setting, neural\nnetworks exhibit benign overfitting: they can be driven to zero training error,\nperfectly fitting any noisy training labels, and simultaneously achieve minimax\noptimal test error. In contrast to previous work on benign overfitting that\nrequire linear or kernel-based predictors, our analysis holds in a setting\nwhere both the model and learning dynamics are fundamentally nonlinear.\n","authors":["Spencer Frei","Niladri S. Chatterji","Peter L. Bartlett"],"pdf_url":"https://arxiv.org/pdf/2202.05928v4.pdf","comment":"39 pages; minor corrections"},{"id":"http://arxiv.org/abs/2309.07398v1","updated":"2023-09-14T02:57:48Z","published":"2023-09-14T02:57:48Z","title":"Semantic Adversarial Attacks via Diffusion Models","summary":" Traditional adversarial attacks concentrate on manipulating clean examples in\nthe pixel space by adding adversarial perturbations. By contrast, semantic\nadversarial attacks focus on changing semantic attributes of clean examples,\nsuch as color, context, and features, which are more feasible in the real\nworld. In this paper, we propose a framework to quickly generate a semantic\nadversarial attack by leveraging recent diffusion models since semantic\ninformation is included in the latent space of well-trained diffusion models.\nThen there are two variants of this framework: 1) the Semantic Transformation\n(ST) approach fine-tunes the latent space of the generated image and/or the\ndiffusion model itself; 2) the Latent Masking (LM) approach masks the latent\nspace with another target image and local backpropagation-based interpretation\nmethods. Additionally, the ST approach can be applied in either white-box or\nblack-box settings. Extensive experiments are conducted on CelebA-HQ and AFHQ\ndatasets, and our framework demonstrates great fidelity, generalizability, and\ntransferability compared to other baselines. Our approaches achieve\napproximately 100% attack success rate in multiple settings with the best FID\nas 36.61. 
Code is available at\nhttps://github.com/steven202/semantic_adv_via_dm.\n","authors":["Chenan Wang","Jinhao Duan","Chaowei Xiao","Edward Kim","Matthew Stamm","Kaidi Xu"],"pdf_url":"https://arxiv.org/pdf/2309.07398v1.pdf","comment":"To appear in BMVC 2023"},{"id":"http://arxiv.org/abs/2202.07626v4","updated":"2023-09-14T02:48:36Z","published":"2022-02-15T18:18:22Z","title":"Random Feature Amplification: Feature Learning and Generalization in\n Neural Networks","summary":" In this work, we provide a characterization of the feature-learning process\nin two-layer ReLU networks trained by gradient descent on the logistic loss\nfollowing random initialization. We consider data with binary labels that are\ngenerated by an XOR-like function of the input features. We permit a constant\nfraction of the training labels to be corrupted by an adversary. We show that,\nalthough linear classifiers are no better than random guessing for the\ndistribution we consider, two-layer ReLU networks trained by gradient descent\nachieve generalization error close to the label noise rate. We develop a novel\nproof technique that shows that at initialization, the vast majority of neurons\nfunction as random features that are only weakly correlated with useful\nfeatures, and the gradient descent dynamics 'amplify' these weak, random\nfeatures to strong, useful features.\n","authors":["Spencer Frei","Niladri S. Chatterji","Peter L. Bartlett"],"pdf_url":"https://arxiv.org/pdf/2202.07626v4.pdf","comment":"46 pages; JMLR camera ready revision"},{"id":"http://arxiv.org/abs/2309.06724v2","updated":"2023-09-14T02:44:57Z","published":"2023-09-13T04:57:12Z","title":"Deep Nonparametric Convexified Filtering for Computational Photography,\n Image Synthesis and Adversarial Defense","summary":" We aim to provide a general framework of for computational photography that\nrecovers the real scene from imperfect images, via the Deep Nonparametric\nConvexified Filtering (DNCF). It is consists of a nonparametric deep network to\nresemble the physical equations behind the image formation, such as denoising,\nsuper-resolution, inpainting, and flash. DNCF has no parameterization dependent\non training data, therefore has a strong generalization and robustness to\nadversarial image manipulation. During inference, we also encourage the network\nparameters to be nonnegative and create a bi-convex function on the input and\nparameters, and this adapts to second-order optimization algorithms with\ninsufficient running time, having 10X acceleration over Deep Image Prior. With\nthese tools, we empirically verify its capability to defend image\nclassification deep networks against adversary attack algorithms in real-time.\n","authors":["Jianqiao Wangni"],"pdf_url":"https://arxiv.org/pdf/2309.06724v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.07391v1","updated":"2023-09-14T02:21:53Z","published":"2023-09-14T02:21:53Z","title":"EnCodecMAE: Leveraging neural codecs for universal audio representation\n learning","summary":" The goal of universal audio representation learning is to obtain foundational\nmodels that can be used for a variety of downstream tasks involving speech,\nmusic or environmental sounds. To approach this problem, methods inspired by\nself-supervised models from NLP, like BERT, are often used and adapted to\naudio. 
These models rely on the discrete nature of text, hence adopting this\ntype of approach for audio processing requires either a change in the learning\nobjective or mapping the audio signal to a set of discrete classes. In this\nwork, we explore the use of EnCodec, a neural audio codec, to generate discrete\ntargets for learning an universal audio model based on a masked autoencoder\n(MAE). We evaluate this approach, which we call EncodecMAE, on a wide range of\naudio tasks spanning speech, music and environmental sounds, achieving\nperformances comparable or better than leading audio representation models.\n","authors":["Leonardo Pepino","Pablo Riera","Luciana Ferrer"],"pdf_url":"https://arxiv.org/pdf/2309.07391v1.pdf","comment":"Submitted to ICASSP 2024"},{"id":"http://arxiv.org/abs/2309.07383v1","updated":"2023-09-14T02:02:08Z","published":"2023-09-14T02:02:08Z","title":"Rates of Convergence in Certain Native Spaces of Approximations used in\n Reinforcement Learning","summary":" This paper studies convergence rates for some value function approximations\nthat arise in a collection of reproducing kernel Hilbert spaces (RKHS)\n$H(\\Omega)$. By casting an optimal control problem in a specific class of\nnative spaces, strong rates of convergence are derived for the operator\nequation that enables offline approximations that appear in policy iteration.\nExplicit upper bounds on error in value function approximations are derived in\nterms of power function $\\Pwr_{H,N}$ for the space of finite dimensional\napproximants $H_N$ in the native space $H(\\Omega)$. These bounds are geometric\nin nature and refine some well-known, now classical results concerning\nconvergence of approximations of value functions.\n","authors":["Ali Bouland","Shengyuan Niu","Sai Tej Paruchuri","Andrew Kurdila","John Burns","Eugenio Schuster"],"pdf_url":"https://arxiv.org/pdf/2309.07383v1.pdf","comment":"7 pages, 4 figures"},{"id":"http://arxiv.org/abs/2309.06800v2","updated":"2023-09-14T01:59:02Z","published":"2023-09-13T08:48:00Z","title":"Uncertainty-aware Traffic Prediction under Missing Data","summary":" Traffic prediction is a crucial topic because of its broad scope of\napplications in the transportation domain. Recently, various studies have\nachieved promising results. However, most studies assume the prediction\nlocations have complete or at least partial historical records and cannot be\nextended to non-historical recorded locations. In real-life scenarios, the\ndeployment of sensors could be limited due to budget limitations and\ninstallation availability, which makes most current models not applicable.\nThough few pieces of literature tried to impute traffic states at the missing\nlocations, these methods need the data simultaneously observed at the locations\nwith sensors, making them not applicable to prediction tasks. Another drawback\nis the lack of measurement of uncertainty in prediction, making prior works\nunsuitable for risk-sensitive tasks or involving decision-making. To fill the\ngap, inspired by the previous inductive graph neural network, this work\nproposed an uncertainty-aware framework with the ability to 1) extend\nprediction to missing locations with no historical records and significantly\nextend spatial coverage of prediction locations while reducing deployment of\nsensors and 2) generate probabilistic prediction with uncertainty\nquantification to help the management of risk and decision making in the\ndown-stream tasks. 
Through extensive experiments on real-life datasets, the\nresult shows our method achieved promising results on prediction tasks, and the\nuncertainty quantification gives consistent results which highly correlated\nwith the locations with and without historical data. We also show that our\nmodel could help support sensor deployment tasks in the transportation field to\nachieve higher accuracy with a limited sensor deployment budget.\n","authors":["Hao Mei","Junxian Li","Zhiming Liang","Guanjie Zheng","Bin Shi","Hua Wei"],"pdf_url":"https://arxiv.org/pdf/2309.06800v2.pdf","comment":"11 pages, 3 figures, Accepted as a short paper of IEEE International\n Conference on Data Mining (ICDM) 2023"},{"id":"http://arxiv.org/abs/2008.05558v5","updated":"2023-09-14T01:22:30Z","published":"2020-08-12T20:09:34Z","title":"On the complexity of finding a local minimizer of a quadratic function\n over a polytope","summary":" We show that unless P=NP, there cannot be a polynomial-time algorithm that\nfinds a point within Euclidean distance $c^n$ (for any constant $c \\ge 0$) of a\nlocal minimizer of an $n$-variate quadratic function over a polytope. This\nresult (even with $c=0$) answers a question of Pardalos and Vavasis that\nappeared in 1992 on a list of seven open problems in complexity theory for\nnumerical optimization. Our proof technique also implies that the problem of\ndeciding whether a quadratic function has a local minimizer over an (unbounded)\npolyhedron, and that of deciding if a quartic polynomial has a local minimizer\nare NP-hard.\n","authors":["Amir Ali Ahmadi","Jeffrey Zhang"],"pdf_url":"https://arxiv.org/pdf/2008.05558v5.pdf","comment":"9 pages"},{"id":"http://arxiv.org/abs/2309.07374v1","updated":"2023-09-14T01:18:57Z","published":"2023-09-14T01:18:57Z","title":"Beta quantile regression for robust estimation of uncertainty in the\n presence of outliers","summary":" Quantile Regression (QR) can be used to estimate aleatoric uncertainty in\ndeep neural networks and can generate prediction intervals. Quantifying\nuncertainty is particularly important in critical applications such as clinical\ndiagnosis, where a realistic assessment of uncertainty is essential in\ndetermining disease status and planning the appropriate treatment. The most\ncommon application of quantile regression models is in cases where the\nparametric likelihood cannot be specified. Although quantile regression is\nquite robust to outlier response observations, it can be sensitive to outlier\ncovariate observations (features). Outlier features can compromise the\nperformance of deep learning regression problems such as style translation,\nimage reconstruction, and deep anomaly detection, potentially leading to\nmisleading conclusions. To address this problem, we propose a robust solution\nfor quantile regression that incorporates concepts from robust divergence. We\ncompare the performance of our proposed method with (i) least trimmed quantile\nregression and (ii) robust regression based on the regularization of\ncase-specific parameters in a simple real dataset in the presence of outlier.\nThese methods have not been applied in a deep learning framework. 
We also\ndemonstrate the applicability of the proposed method by applying it to a\nmedical imaging translation task using diffusion models.\n","authors":["Haleh Akrami","Omar Zamzam","Anand Joshi","Sergul Aydore","Richard Leahy"],"pdf_url":"https://arxiv.org/pdf/2309.07374v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2210.04688v2","updated":"2023-09-14T01:15:47Z","published":"2022-10-07T07:56:17Z","title":"BAFFLE: Backdoor Attack in Offline Reinforcement Learning","summary":" A growing body of research has focused on the Reinforcement Learning (RL)\nmethods which allow the agent to learn from trial-and-error experiences\ngathered during the interaction with the environment. Recently, offline RL\nbecomes a popular RL paradigm because it saves the interactions with\nenvironments. In offline RL, data providers share large pre-collected datasets,\nand others can train high-quality agents without interacting with the\nenvironments. This paradigm has demonstrated effectiveness in critical tasks\nlike robot control, autonomous driving, etc. However, less attention is paid to\ninvestigating the security threats to the offline RL system. This paper focuses\non backdoor attacks, where some perturbations are added to the data\n(observations) such that given normal observations, the agent takes\nhigh-rewards actions, and low-reward actions on observations injected with\ntriggers. In this paper, we propose Baffle (Backdoor Attack for Offline\nReinforcement Learning), an approach that automatically implants backdoors to\nRL agents by poisoning the offline RL dataset, and evaluate how different\noffline RL algorithms react to this attack. Our experiments conducted on four\ntasks and four offline RL algorithms expose a disquieting fact: none of the\nexisting offline RL algorithms is immune to such a backdoor attack. Baffle\nmodifies $10\\%$ of the datasets for four tasks. Agents trained on the poisoned\ndatasets perform well in normal settings. However, when triggers are presented,\nthe agents' performance decreases drastically by $63.2\\%$, $53.9\\%$, $64.7\\%$,\nand $47.4\\%$ in the four tasks on average. The backdoor still persists after\nfine-tuning poisoned agents on clean datasets. We further show that the\ninserted backdoor is also hard to be detected by a popular defensive method.\nThis paper calls attention to developing more effective protection for the\nopen-source offline RL dataset.\n","authors":["Chen Gong","Zhou Yang","Yunpeng Bai","Junda He","Jieke Shi","Kecen Li","Arunesh Sinha","Bowen Xu","Xinwen Hou","David Lo","Tianhao Wang"],"pdf_url":"https://arxiv.org/pdf/2210.04688v2.pdf","comment":"18 pages, 7 figures"},{"id":"http://arxiv.org/abs/2309.07367v1","updated":"2023-09-14T01:00:05Z","published":"2023-09-14T01:00:05Z","title":"The kernel-balanced equation for deep neural networks","summary":" Deep neural networks have shown many fruitful applications in this decade. A\nnetwork can get the generalized function through training with a finite\ndataset. The degree of generalization is a realization of the proximity scale\nin the data space. Specifically, the scale is not clear if the dataset is\ncomplicated. Here we consider a network for the distribution estimation of the\ndataset. We show the estimation is unstable and the instability depends on the\ndata density and training duration. We derive the kernel-balanced equation,\nwhich gives a short phenomenological description of the solution. The equation\ntells us the reason for the instability and the mechanism of the scale. 
The\nnetwork outputs a local average of the dataset as a prediction and the scale of\naveraging is determined along the equation. The scale gradually decreases along\ntraining and finally results in instability in our case.\n","authors":["Kenichi Nakazato"],"pdf_url":"https://arxiv.org/pdf/2309.07367v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.06604v2","updated":"2023-09-14T00:49:47Z","published":"2023-09-12T21:07:23Z","title":"Hybrid Algorithm Selection and Hyperparameter Tuning on Distributed\n Machine Learning Resources: A Hierarchical Agent-based Approach","summary":" Algorithm selection and hyperparameter tuning are critical steps in both\nacademic and applied machine learning. On the other hand, these steps are\nbecoming ever increasingly delicate due to the extensive rise in the number,\ndiversity, and distributedness of machine learning resources. Multi-agent\nsystems, when applied to the design of machine learning platforms, bring about\nseveral distinctive characteristics such as scalability, flexibility, and\nrobustness, just to name a few. This paper proposes a fully automatic and\ncollaborative agent-based mechanism for selecting distributedly organized\nmachine learning algorithms and simultaneously tuning their hyperparameters.\nOur method builds upon an existing agent-based hierarchical machine-learning\nplatform and augments its query structure to support the aforementioned\nfunctionalities without being limited to specific learning, selection, and\ntuning mechanisms. We have conducted theoretical assessments, formal\nverification, and analytical study to demonstrate the correctness, resource\nutilization, and computational efficiency of our technique. According to the\nresults, our solution is totally correct and exhibits linear time and space\ncomplexity in relation to the size of available resources. To provide concrete\nexamples of how the proposed methodologies can effectively adapt and perform\nacross a range of algorithmic options and datasets, we have also conducted a\nseries of experiments using a system comprised of 24 algorithms and 9 datasets.\n","authors":["Ahmad Esmaeili","Julia T. Rayz","Eric T. Matson"],"pdf_url":"https://arxiv.org/pdf/2309.06604v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.07364v1","updated":"2023-09-14T00:40:07Z","published":"2023-09-14T00:40:07Z","title":"Hodge-Aware Contrastive Learning","summary":" Simplicial complexes prove effective in modeling data with multiway\ndependencies, such as data defined along the edges of networks or within other\nhigher-order structures. Their spectrum can be decomposed into three\ninterpretable subspaces via the Hodge decomposition, resulting foundational in\nnumerous applications. We leverage this decomposition to develop a contrastive\nself-supervised learning approach for processing simplicial data and generating\nembeddings that encapsulate specific spectral information.Specifically, we\nencode the pertinent data invariances through simplicial neural networks and\ndevise augmentations that yield positive contrastive examples with suitable\nspectral properties for downstream tasks. Additionally, we reweight the\nsignificance of negative examples in the contrastive loss, considering the\nsimilarity of their Hodge components to the anchor. By encouraging a stronger\nseparation among less similar instances, we obtain an embedding space that\nreflects the spectral properties of the data. 
The numerical results on two\nstandard edge flow classification tasks show a superior performance even when\ncompared to supervised learning techniques. Our findings underscore the\nimportance of adopting a spectral perspective for contrastive learning with\nhigher-order data.\n","authors":["Alexander Möllers","Alexander Immer","Vincent Fortuin","Elvin Isufi"],"pdf_url":"https://arxiv.org/pdf/2309.07364v1.pdf","comment":"4 pages, 2 figures"}],"Multimedia":[{"id":"http://arxiv.org/abs/2309.07773v1","updated":"2023-09-14T15:02:05Z","published":"2023-09-14T15:02:05Z","title":"Usability Evaluation of Spoken Humanoid Embodied Conversational Agents\n in Mobile Serious Games","summary":" This paper presents an empirical investigation of the extent to which spoken\nHumanoid Embodied Conversational Agents (HECAs) can foster usability in mobile\nserious game (MSG) applications. The aim of the research is to assess the\nimpact of multiple agents and illusion of humanness on the quality of the\ninteraction. The experiment investigates two styles of agent presentation: an\nagent of high human-likeness (HECA) and an agent of low human-likeness (text).\nThe purpose of the experiment is to assess whether and how agents of high\nhumanlikeness can evoke the illusion of humanness and affect usability. Agents\nof high human-likeness were designed by following the ECA design model that is\na proposed guide for ECA development. The results of the experiment with 90\nparticipants show that users prefer to interact with the HECAs. The difference\nbetween the two versions is statistically significant with a large effect size\n(d=1.01), with many of the participants justifying their choice by saying that\nthe human-like characteristics of the HECA made the version more appealing.\nThis research provides key information on the potential effect of HECAs on\nserious games, which can provide insight into the design of future mobile\nserious games.\n","authors":["Danai Korre","Judy Robertson"],"pdf_url":"https://arxiv.org/pdf/2309.07773v1.pdf","comment":"45 pages, 9 figures, 14 tables"},{"id":"http://arxiv.org/abs/2306.14565v2","updated":"2023-09-14T14:41:52Z","published":"2023-06-26T10:26:33Z","title":"Mitigating Hallucination in Large Multi-Modal Models via Robust\n Instruction Tuning","summary":" Despite the promising progress in multi-modal tasks, current large\nmulti-modal models (LMM) are prone to hallucinating inconsistent descriptions\nwith respect to the associated image and human instructions. This paper\naddresses this issue by introducing the first large and diverse visual\ninstruction tuning dataset, named Large-scale Robust Visual (LRV)-Instruction.\nOur dataset consists of 120k visual instructions generated by GPT4, covering 16\nvision-and-language tasks with open-ended instructions and answers. Unlike\nexisting studies that primarily focus on positive instruction samples, we\ndesign LRV-Instruction to include both positive and negative instructions for\nmore robust visual instruction tuning. Our negative instructions are designed\nat two semantic levels: (i) Nonexistent Element Manipulation and (ii) Existent\nElement Manipulation. To efficiently measure the hallucination generated by\nLMMs, we propose GPT4-Assisted Visual Instruction Evaluation (GAVIE), a novel\napproach to evaluate visual instruction tuning without the need for\nhuman-annotated groundtruth answers and can adapt to diverse instruction\nformats. We conduct comprehensive experiments to investigate the hallucination\nof LMMs. 
Our results demonstrate that existing LMMs exhibit significant\nhallucination when presented with our negative instructions, particularly with\nExistent Element Manipulation instructions. Moreover, by finetuning MiniGPT4 on\nLRV-Instruction, we successfully mitigate hallucination while improving\nperformance on public datasets using less training data compared to\nstate-of-the-art methods. Additionally, we observed that a balanced ratio of\npositive and negative instances in the training data leads to a more robust\nmodel. Updates of our project are available at\nhttps://fuxiaoliu.github.io/LRV/.\n","authors":["Fuxiao Liu","Kevin Lin","Linjie Li","Jianfeng Wang","Yaser Yacoob","Lijuan Wang"],"pdf_url":"https://arxiv.org/pdf/2306.14565v2.pdf","comment":"35 pages, 27 figures. Under Review"},{"id":"http://arxiv.org/abs/2309.01955v2","updated":"2023-09-14T13:10:34Z","published":"2023-09-05T05:06:48Z","title":"A Survey on Interpretable Cross-modal Reasoning","summary":" In recent years, cross-modal reasoning (CMR), the process of understanding\nand reasoning across different modalities, has emerged as a pivotal area with\napplications spanning from multimedia analysis to healthcare diagnostics. As\nthe deployment of AI systems becomes more ubiquitous, the demand for\ntransparency and comprehensibility in these systems' decision-making processes\nhas intensified. This survey delves into the realm of interpretable cross-modal\nreasoning (I-CMR), where the objective is not only to achieve high predictive\nperformance but also to provide human-understandable explanations for the\nresults. This survey presents a comprehensive overview of the typical methods\nwith a three-level taxonomy for I-CMR. Furthermore, this survey reviews the\nexisting CMR datasets with annotations for explanations. Finally, this survey\nsummarizes the challenges for I-CMR and discusses potential future directions.\nIn conclusion, this survey aims to catalyze the progress of this emerging\nresearch area by providing researchers with a panoramic and comprehensive\nperspective, illuminating the state of the art and discerning the\nopportunities. The summarized methods, datasets, and other resources are\navailable at\nhttps://github.com/ZuyiZhou/Awesome-Interpretable-Cross-modal-Reasoning.\n","authors":["Dizhan Xue","Shengsheng Qian","Zuyi Zhou","Changsheng Xu"],"pdf_url":"https://arxiv.org/pdf/2309.01955v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.07589v1","updated":"2023-09-14T10:46:55Z","published":"2023-09-14T10:46:55Z","title":"MPAI-EEV: Standardization Efforts of Artificial Intelligence based\n End-to-End Video Coding","summary":" The rapid advancement of artificial intelligence (AI) technology has led to\nthe prioritization of standardizing the processing, coding, and transmission of\nvideo using neural networks. To address this priority area, the Moving Picture,\nAudio, and Data Coding by Artificial Intelligence (MPAI) group is developing a\nsuite of standards called MPAI-EEV for \"end-to-end optimized neural video\ncoding.\" The aim of this AI-based video standard project is to compress the\nnumber of bits required to represent high-fidelity video data by utilizing\ndata-trained neural coding technologies. This approach is not constrained by\nhow data coding has traditionally been applied in the context of a hybrid\nframework. This paper presents an overview of recent and ongoing\nstandardization efforts in this area and highlights the key technologies and\ndesign philosophy of EEV. 
It also provides a comparison and report on some\nprimary efforts such as the coding efficiency of the reference model.\nAdditionally, it discusses emerging activities such as learned\nUnmanned-Aerial-Vehicles (UAVs) video coding which are currently planned, under\ndevelopment, or in the exploration phase. With a focus on UAV video signals,\nthis paper addresses the current status of these preliminary efforts. It also\nindicates development timelines, summarizes the main technical details, and\nprovides pointers to further points of reference. The exploration experiment\nshows that the EEV model performs better than the state-of-the-art video coding\nstandard H.266/VVC in terms of perceptual evaluation metric.\n","authors":["Chuanmin Jia","Feng Ye","Fanke Dong","Kai Lin","Leonardo Chiariglione","Siwei Ma","Huifang Sun","Wen Gao"],"pdf_url":"https://arxiv.org/pdf/2309.07589v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.06745v2","updated":"2023-09-14T07:13:24Z","published":"2023-09-13T06:31:35Z","title":"VEATIC: Video-based Emotion and Affect Tracking in Context Dataset","summary":" Human affect recognition has been a significant topic in psychophysics and\ncomputer vision. However, the currently published datasets have many\nlimitations. For example, most datasets contain frames that contain only\ninformation about facial expressions. Due to the limitations of previous\ndatasets, it is very hard to either understand the mechanisms for affect\nrecognition of humans or generalize well on common cases for computer vision\nmodels trained on those datasets. In this work, we introduce a brand new large\ndataset, the Video-based Emotion and Affect Tracking in Context Dataset\n(VEATIC), that can conquer the limitations of the previous datasets. VEATIC has\n124 video clips from Hollywood movies, documentaries, and home videos with\ncontinuous valence and arousal ratings of each frame via real-time annotation.\nAlong with the dataset, we propose a new computer vision task to infer the\naffect of the selected character via both context and character information in\neach video frame. Additionally, we propose a simple model to benchmark this new\ncomputer vision task. We also compare the performance of the pretrained model\nusing our dataset with other similar datasets. Experiments show the competing\nresults of our pretrained model via VEATIC, indicating the generalizability of\nVEATIC. Our dataset is available at https://veatic.github.io.\n","authors":["Zhihang Ren","Jefferson Ortega","Yifan Wang","Zhimin Chen","Yunhui Guo","Stella X. Yu","David Whitney"],"pdf_url":"https://arxiv.org/pdf/2309.06745v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2303.09858v3","updated":"2023-09-14T03:37:24Z","published":"2023-03-17T09:37:41Z","title":"Preventing Unauthorized AI Over-Analysis by Medical Image Adversarial\n Watermarking","summary":" The advancement of deep learning has facilitated the integration of\nArtificial Intelligence (AI) into clinical practices, particularly in\ncomputer-aided diagnosis. Given the pivotal role of medical images in various\ndiagnostic procedures, it becomes imperative to ensure the responsible and\nsecure utilization of AI techniques. 
However, the unauthorized utilization of\nAI for image analysis raises significant concerns regarding patient privacy and\npotential infringement on the proprietary rights of data custodians.\nConsequently, the development of pragmatic and cost-effective strategies that\nsafeguard patient privacy and uphold medical image copyrights emerges as a\ncritical necessity. In direct response to this pressing demand, we present a\npioneering solution named Medical Image Adversarial watermarking (MIAD-MARK).\nOur approach introduces watermarks that strategically mislead unauthorized AI\ndiagnostic models, inducing erroneous predictions without compromising the\nintegrity of the visual content. Importantly, our method integrates an\nauthorization protocol tailored for legitimate users, enabling the removal of\nthe MIAD-MARK through encryption-generated keys. Through extensive experiments,\nwe validate the efficacy of MIAD-MARK across three prominent medical image\ndatasets. The empirical outcomes demonstrate the substantial impact of our\napproach, notably reducing the accuracy of standard AI diagnostic models to a\nmere 8.57% under white box conditions and 45.83% in the more challenging black\nbox scenario. Additionally, our solution effectively mitigates unauthorized\nexploitation of medical images even in the presence of sophisticated watermark\nremoval networks. Notably, those AI diagnosis networks exhibit a meager average\naccuracy of 38.59% when applied to images protected by MIAD-MARK, underscoring\nthe robustness of our safeguarding mechanism.\n","authors":["Xingxing Wei","Bangzheng Pu","Shiji Zhao","Chen Chi","Huazhu Fu"],"pdf_url":"https://arxiv.org/pdf/2303.09858v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.07376v1","updated":"2023-09-14T01:39:40Z","published":"2023-09-14T01:39:40Z","title":"VCD: A Video Conferencing Dataset for Video Compression","summary":" Commonly used datasets for evaluating video codecs are all very high quality\nand not representative of video typically used in video conferencing scenarios.\nWe present the Video Conferencing Dataset (VCD) for evaluating video codecs for\nreal-time communication, the first such dataset focused on video conferencing.\nVCD includes a wide variety of camera qualities and spatial and temporal\ninformation. It includes both desktop and mobile scenarios and two types of\nvideo background processing. We report the compression efficiency of H.264,\nH.265, H.266, and AV1 in low-delay settings on VCD and compare it with the\nnon-video conferencing datasets UVC, MLC-JVC, and HEVC. The results show the\nsource quality and the scenarios have a significant effect on the compression\nefficiency of all the codecs. VCD enables the evaluation and tuning of codecs\nfor this important scenario. 
The VCD is publicly available as an open-source\ndataset at https://github.com/microsoft/VCD.\n","authors":["Babak Naderi","Ross Cutler","Nabakumar Singh Khongbantabam","Yasaman Hosseinkashi"],"pdf_url":"https://arxiv.org/pdf/2309.07376v1.pdf","comment":null}]}} \ No newline at end of file diff --git a/favicon.ico b/favicon.ico new file mode 100644 index 0000000000000000000000000000000000000000..7f5166c7afa0cda370aafaf91ba8d66cdeff74e5 GIT binary patch literal 15086 zcmeHO33yaRwyq9g34stc4G>5efuJa|8H50X3WB1d4n)8Z5JVvktB7$yKtv!Kb`lau zdP!$Nc9B&OSrq|+r=TLJ;K~kR2@vF;|J<9Kba%RAH(}4!8Q=syhFvI(6#Q zsX{4}Dx;b;Q+$T2oQ6t8Dy7213w{SH^#k7p^C{m4`z!S>3p8dKR#E*)@?QIEpg)}c zg{r6iyXi3T|KFt>>TsDWWe%JEGeI;m)t_ z#K0v35xKK9{I2==#4YqlUA%3Xh?MkH#4d|P#d8&Xs_&s!ylR8}jrLpHV^;bsWZSYa zH!TUx_B8jZuJHA{>W61Pj6tR~6PcNrNKa44y2nHqn=Z;*^b+JZFPz5nh)ES%qX=#z&q(>zlfC!?&@YSrplEEcre-mb8` zuXubZFMZU1X@6u`GYT;qc;sp5g4h(J-Ri$pM?zXcp{_ZWm%PAnhXUA$>^4n&yX>&2hmV*Ry0tB zDc2*Jr;1N^DfmP%o?7)3+HY@<`rp+@r%jzP%QCA_hD^#Z(KZo(JM=fG&)C8vq&l}j zJwF&~<0nw3PebMBv;9AHx_+HHM>2k2y$bcquTUQ>g6h^CsupeGV1<^;S|Zt!?1a7Z z#?J81^LLBW9d_fL={n^r&=JYsxAQ)IUfZ*@bwK9T6E8jvRhMmd?FO}(eVmu4AjvwHnm*c+SZaI_{G2dio}E$k=@I4b)RlA>{TDjCf@P9^3fXP4&AYX4kyP&|*&u zL_Z&m!0N<4WeU`#OS-PO!zb88j|}~hyr;2|GQa;071I}ibplcL)3QG6j4NJuzfD_B zC=*%^D*iPcyDJ`}Kd)TT$K}u=sD1mO_U@%EJplFd&rlaHy4N$2?^n)?N2!-$3x0El zpcL=UvTj#WH?}W2Bm5luU4G~0LM>eiHE+x9X!aoTcDDYxjsAu_3X1y$Bwy|VK(IpqYlX$aVwJaeLK?Nm$>FoH>BT1SA+z+ zU=vI)(6%2wtjj0%MZP(b^zV%uI__S*bQz3sFxr#nAAdmI&v6?@p6=F4Uu9a$c0#M_ zYms0O{AbSSdL;Y>%a9?ohv$p;r=yM;d1*uX{*gzhC!9-CPjph+lrr*t+6=DYwBtv~ zyR_+Lw$R}Ly?yB);gOO8)vups_cUD>9g)5^F#gq3Fz(vLe!d?nI$Cc_+LU_I&i?)M zIch^KE+zVltlyC|I^G%I@#DH)3(vSXsLPkV$8N|bbwuZ+4Uu2klyA~U=gyHYb#inm z@gHOTMt<~hZ2FpM@D?7T<1$AF4AA)*-@JVaMzK}WhO}jjts%pUoNtelKmDVdPWxH2 zUPby@A3Nh09x~3tJ0qiLUVDpO%84zIy3&TL?uk4TCquO+|J<8Kuls0W!BE?_7q^=R zR>LM4G8ykZJsq(+)^#i|_{CpsPVA>jf&X*X4WnPYb(?4W24BGk+NDH$2J`E zf`8mZuHWQ;fpoJ;{E)k7hv&^N8NXnQkB3QdfAN5etrc7{H)-GHn^uNpi|M>0e#!TL zUf<(*vuE`rpX~9(?-?@G**>`PqAzOd-d)F5zrMZ>JLy9R#yRq)UYk01*0I&Dt@{+N_~~bu_)WvlvL5G&ri-7^ z->MF^<`&@J!8Sr^LszWytV3LjOg($**cvs`*CSW_T%%0(R| z7fJph+p5D@i1_zn8yvAoUifk!XnKY+uH-m5_PtS7-tn7OM)r*E%1GHa#zNgmoALcE z#4q!>Kk2^KMLx2D%Xms3%l^vv?dd6HJdB}Qx1PEh0yX;DK9ZoqOLJHETi*8l?M+!q*#o zC6zI-cj$nK1`W|ZyFL7`-F)mM@LV85j)p*DxN;%mZkENXqy&csb zsD_E}O+J!}{T|s1i|rJAB99{pc8M?U{DQvC+q9AQaBsmro~7V_${$@ebz$tS zD0VAxxY*^fnl6;TP##r}S4F+U^$_|)D9TLDq~u-ur`m{J5sTMs zuVdouiO8@OoM~_@pI-BHF}c0L>wncBZML_?6w4IMOrMFU?NJb4z)d@OeCL^Ns63wI}D zBv}ESDELfpe#mbj`F_{^oZh>Z^W}G7ZeV`Y?soZMN5fs)bg~^4FI2?vWy3K&V>+50 zU`*>4B=PI|ujbte-Xj>la6GD=q@VASBzQbE(Mou3Mu)rQ&jISBtLywuzSE(YL* zRWHzQDO&Hv*NR_EzfbO-(Ql3ZI2xH2{XUVbt8zbPa`t3YJ<0PO*Cc+F#GZd3NgVut zNOB${@er3J{oZq9PuOfW&DRq@LnzyljUgWmDclQ1?vKPg+dRz&)b3@`ACx*V>tj&< zQh{G_jjc<}=nbyZ(YJA*)|_Wd4{~4Ik$L+3y{kb@W`7DE$|U3Y$hJq3Zlm8!pKa`- zv1q-oHJX3j0-ZkZ?7)E-Ue)=q) z1McS8?eE9wOY)5BEYBN$`Hivk&!HwUHvc$vb{y}ht!;-idz!|3=!&7Jc7pgyNTLgZ zL&}654a0gZHLP z#fT3_pz0|%<5&U~4Z|;Ccy3ZZ)RDUXWm zuW1r%$T_63N0z;(oDoczz`f8=o@319zR0eVoPCet-frwzJsvA%1%sSn_Upy_AU<-J z`Se6X?t8y0>UX)zFl-nU@1AMxy7s@^n~*a*Gj(PbecRHl3 z)RAxUQWpboZ1o5#BsAxUFp)gfC+3&x=I=7_IiV()^N6pLIfV24e(_kM$kW2W1{+TyObG z%18SuI2`rMz##ABo2)s!CwhC^NWA&#Ye>jRK*XU4w{bZGSA|Oz&~HsYEykEkKg?pY zXufJvMiN@@Z_T@=EZKu=$l!rI`}>%4?++b|p?{Y+CP#nP?M%$mP|pP*d{r3U&v{>q zi@lfm9`4_JKPsK8gz6`1fO`u_9Kqm!&w+bjW;{vaQU+P+dv)2-r6{&=nx(CzzLj}D zrv-UL^G+<+sG)xM6 z?TJFFEmf0C{C}YwOAfki>*iPzQOx^KY$zohMj$PFW zCBJEte(!S2disF0r>^Ov+jX9@#tC1!$1G3B{B^EFJGawD(vNmcn=A5eJ=^~ChPFSD z`yJZd2HtPb^0MEMZ)+A3XVDr_*beuF%KQ@h`>O7b$(~E@NRv#Gm$SfJ`dkb8=(^#^ 
zpZemTUj}!jMxoMTuBTTxc8|>VvOmu-_V5+=E%E4@&Z01YYUJsU+fLnv2}>u?uI6Cm+L8KM zAc2-+N1Mj9EDb0eKE% zBWGqVy3+V)V + + + + MyArxiv + + + + + + + + + + + + + + + +
+
+
+
+ MyArxiv +
+
+ +
+ +
+
+
+ +
+
+ +
+
+
+ + Computation and Language 65 + +
+
+
+ + ☆ MMICL: Empowering Vision-language Model with Multi-Modal In-Context + Learning + + +
+          Starting from the resurgence of deep learning, vision-language models (VLMs)
+benefiting from large language models (LLMs) have never been so popular.
+However, while LLMs can utilize extensive background knowledge and task
+information with in-context learning, most VLMs still struggle with
+understanding complex multi-modal prompts with multiple images. The issue can be
+traced back to the architectural design of VLMs or pre-training data.
+Specifically, the current VLMs primarily emphasize utilizing multi-modal data
+with a single image, rather than multi-modal prompts with interleaved
+multiple images and text. Even though some newly proposed VLMs could handle
+user prompts with multiple images, pre-training data does not provide more
+sophisticated multi-modal prompts than interleaved image and text crawled from
+the web. We propose MMICL to address the issue by considering both the model
+and data perspectives. We introduce a well-designed architecture capable of
+seamlessly integrating visual and textual context in an interleaved manner and
+the MIC dataset to reduce the gap between the training data and the complex user
+prompts in real-world applications, including: 1) multi-modal context with
+interleaved images and text, 2) textual references for each image, and 3)
+multi-image data with spatial, logical, or temporal relationships. Our
+experiments confirm that MMICL achieves new state-of-the-art zero-shot and
+few-shot performance on a wide range of general vision-language tasks,
+especially for complex reasoning benchmarks including MME and MMBench. Our
+analysis demonstrates that MMICL effectively deals with the challenge of
+complex multi-modal prompt understanding. The experiments on ScienceQA-IMG also
+show that MMICL successfully alleviates the issue of language bias in VLMs,
+which we believe is the reason behind the advanced performance of MMICL.
+
+
+ comment: Code, dataset, checkpoints, and demos are available at + \href{https://github.com/HaozheZhao/MIC}{https://github.com/HaozheZhao/MIC} +
+
+
+
+
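The MIC data format sketched in the MMICL entry above (interleaved images and text, plus an explicit textual reference for each image) can be pictured with a small, self-contained sketch. Everything here is an assumption for illustration: the `[IMG n]` placeholder tokens, the `ImageRef` record, and the `build_interleaved_prompt` helper are hypothetical and are not MMICL's actual prompt format or API.

```python
# Hypothetical sketch: assemble an interleaved multi-image prompt in which each
# image carries a textual reference that later text can point back to.
from dataclasses import dataclass
from typing import List, Union

@dataclass
class ImageRef:
    path: str    # where the image lives (placeholder)
    ref_id: int  # index used for the textual reference token

def build_interleaved_prompt(segments: List[Union[str, ImageRef]]) -> str:
    """Render text segments and image placeholders in their original interleaved order."""
    parts = []
    for seg in segments:
        if isinstance(seg, ImageRef):
            # The textual reference lets later text refer back to "image 1", "image 2", ...
            parts.append(f"image {seg.ref_id} [IMG{seg.ref_id}]")
        else:
            parts.append(seg)
    return " ".join(parts)

prompt = build_interleaved_prompt([
    "Compare the two charts.",
    ImageRef("q1_revenue.png", 1),
    "shows first-quarter revenue, while",
    ImageRef("q2_revenue.png", 2),
    "shows the second quarter. Which quarter grew faster?",
])
print(prompt)
```

A real pipeline would replace the placeholder tokens with the model's visual embeddings; the point of the sketch is only the interleaved ordering and the per-image references.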
+ + ☆ Ambiguity-Aware In-Context Learning with Large Language Models + + +
+          In-context learning (ICL), i.e. showing LLMs only a few task-specific
+demonstrations, has led to downstream gains with no task-specific fine-tuning
+required. However, LLMs are sensitive to the choice of prompts, and therefore a
+crucial research question is how to select good demonstrations for ICL. One
+effective strategy is leveraging semantic similarity between the ICL
+demonstrations and test inputs by using a text retriever, which, however, is
+sub-optimal as it does not consider the LLM's existing knowledge about that
+task. From prior work (Min et al., 2022), we already know that labels paired
+with the demonstrations bias the model predictions. This leads us to the
+hypothesis that considering the LLM's existing knowledge about the task,
+especially with respect to the output label space, can help in a better
+demonstration selection strategy. Through extensive experimentation on three
+text classification tasks, we find that it is beneficial to not only choose
+semantically similar ICL demonstrations but also to choose those demonstrations
+that help resolve the inherent label ambiguity surrounding the test example.
+Interestingly, we find that including demonstrations that the LLM previously
+misclassified and that also fall on the test example's decision boundary brings
+the most performance gain.
+
+
+ comment: 13 pages in total +
+
+
+
+
+ + ☆ Safety-Tuned LLaMAs: Lessons From Improving the Safety of Large Language + Models that Follow Instructions + + +
+ Training large language models to follow instructions makes them perform +better on a wide range of tasks, generally becoming more helpful. However, a +perfectly helpful model will follow even the most malicious instructions and +readily generate harmful content. In this paper, we raise concerns over the +safety of models that only emphasize helpfulness, not safety, in their +instruction-tuning. We show that several popular instruction-tuned models are +highly unsafe. Moreover, we show that adding just 3% safety examples (a few +hundred demonstrations) in the training set when fine-tuning a model like LLaMA +can substantially improve their safety. Our safety-tuning does not make models +significantly less capable or helpful as measured by standard benchmarks. +However, we do find a behavior of exaggerated safety, where too much +safety-tuning makes models refuse to respond to reasonable prompts that +superficially resemble unsafe ones. Our study sheds light on trade-offs in +training LLMs to follow instructions and exhibit safe behavior. + +
+
+
+
+
+ + ☆ Agents: An Open-source Framework for Autonomous Language Agents + + +
+ Recent advances on large language models (LLMs) enable researchers and +developers to build autonomous language agents that can automatically solve +various tasks and interact with environments, humans, and other agents using +natural language interfaces. We consider language agents as a promising +direction towards artificial general intelligence and release Agents, an +open-source library with the goal of opening up these advances to a wider +non-specialist audience. Agents is carefully engineered to support important +features including planning, memory, tool usage, multi-agent communication, and +fine-grained symbolic control. Agents is user-friendly as it enables +non-specialists to build, customize, test, tune, and deploy state-of-the-art +autonomous language agents without much coding. The library is also +research-friendly as its modularized design makes it easily extensible for +researchers. Agents is available at https://github.com/aiwaves-cn/agents. + +
+
+ comment: Code available at https://github.com/aiwaves-cn/agents +
+
+
+
+
+ + ☆ The Rise and Potential of Large Language Model Based Agents: A Survey + + +
+ For a long time, humanity has pursued artificial intelligence (AI) equivalent +to or surpassing the human level, with AI agents considered a promising vehicle +for this pursuit. AI agents are artificial entities that sense their +environment, make decisions, and take actions. Many efforts have been made to +develop intelligent AI agents since the mid-20th century. However, these +efforts have mainly focused on advancement in algorithms or training strategies +to enhance specific capabilities or performance on particular tasks. Actually, +what the community lacks is a sufficiently general and powerful model to serve +as a starting point for designing AI agents that can adapt to diverse +scenarios. Due to the versatile and remarkable capabilities they demonstrate, +large language models (LLMs) are regarded as potential sparks for Artificial +General Intelligence (AGI), offering hope for building general AI agents. Many +research efforts have leveraged LLMs as the foundation to build AI agents and +have achieved significant progress. We start by tracing the concept of agents +from its philosophical origins to its development in AI, and explain why LLMs +are suitable foundations for AI agents. Building upon this, we present a +conceptual framework for LLM-based agents, comprising three main components: +brain, perception, and action, and the framework can be tailored to suit +different applications. Subsequently, we explore the extensive applications of +LLM-based agents in three aspects: single-agent scenarios, multi-agent +scenarios, and human-agent cooperation. Following this, we delve into agent +societies, exploring the behavior and personality of LLM-based agents, the +social phenomena that emerge when they form societies, and the insights they +offer for human society. Finally, we discuss a range of key topics and open +problems within the field. + +
+
+ comment: 86 pages, 12 figures +
+
+
+
+
+ + ☆ CiwaGAN: Articulatory information exchange + + +
+ Humans encode information into sounds by controlling articulators and decode +information from sounds using the auditory apparatus. This paper introduces +CiwaGAN, a model of human spoken language acquisition that combines +unsupervised articulatory modeling with an unsupervised model of information +exchange through the auditory modality. While prior research includes +unsupervised articulatory modeling and information exchange separately, our +model is the first to combine the two components. The paper also proposes an +improved articulatory model with more interpretable internal representations. +The proposed CiwaGAN model is the most realistic approximation of human spoken +language acquisition using deep learning. As such, it is useful for cognitively +plausible simulations of the human speech act. + +
+
+
+
+
+ + ☆ ExpertQA: Expert-Curated Questions and Attributed Answers + + +
+ As language models are adapted by a more sophisticated and diverse set of +users, the importance of guaranteeing that they provide factually correct +information supported by verifiable sources is critical across fields of study +& professions. This is especially the case for high-stakes fields, such as +medicine and law, where the risk of propagating false information is high and +can lead to undesirable societal consequences. Previous work studying +factuality and attribution has not focused on analyzing these characteristics +of language model outputs in domain-specific scenarios. In this work, we +present an evaluation study analyzing various axes of factuality and +attribution provided in responses from a few systems, by bringing domain +experts in the loop. Specifically, we first collect expert-curated questions +from 484 participants across 32 fields of study, and then ask the same experts +to evaluate generated responses to their own questions. We also ask experts to +revise answers produced by language models, which leads to ExpertQA, a +high-quality long-form QA dataset with 2177 questions spanning 32 fields, along +with verified answers and attributions for claims in the answers. + +
+
+ comment: Dataset & code is available at + https://github.com/chaitanyamalaviya/expertqa +
+
+
+
+
+ + ☆ CATfOOD: Counterfactual Augmented Training for Improving Out-of-Domain + Performance and Calibration + + +
+ In recent years, large language models (LLMs) have shown remarkable +capabilities at scale, particularly at generating text conditioned on a prompt. +In our work, we investigate the use of LLMs to augment training data of small +language models~(SLMs) with automatically generated counterfactual~(CF) +instances -- i.e. minimally altered inputs -- in order to improve +out-of-domain~(OOD) performance of SLMs in the extractive question +answering~(QA) setup. We show that, across various LLM generators, such data +augmentation consistently enhances OOD performance and improves model +calibration for both confidence-based and rationale-augmented calibrator +models. Furthermore, these performance improvements correlate with higher +diversity of CF instances in terms of their surface form and semantic content. +Finally, we show that CF augmented models which are easier to calibrate also +exhibit much lower entropy when assigning importance, indicating that +rationale-augmented calibrators prefer concise explanations. + +
+
+ comment: We make our code available at: https://github.com/UKPLab/CATfOOD +
+
+
+
+
+ + ☆ Text Classification of Cancer Clinical Trial Eligibility Criteria + + +
+ Automatic identification of clinical trials for which a patient is eligible +is complicated by the fact that trial eligibility is stated in natural +language. A potential solution to this problem is to employ text classification +methods for common types of eligibility criteria. In this study, we focus on +seven common exclusion criteria in cancer trials: prior malignancy, human +immunodeficiency virus, hepatitis B, hepatitis C, psychiatric illness, +drug/substance abuse, and autoimmune illness. Our dataset consists of 764 phase +III cancer trials with these exclusions annotated at the trial level. We +experiment with common transformer models as well as a new pre-trained clinical +trial BERT model. Our results demonstrate the feasibility of automatically +classifying common exclusion criteria. Additionally, we demonstrate the value +of a pre-trained language model specifically for clinical trials, which yields +the highest average performance across all criteria. + +
+
+ comment: AMIA Annual Symposium Proceedings 2023 +
+
+
+
+
+ + ☆ Pop Quiz! Do Pre-trained Code Models Possess Knowledge of Correct API + Names? + + +
+ Recent breakthroughs in pre-trained code models, such as CodeBERT and Codex, +have shown their superior performance in various downstream tasks. The +correctness and unambiguity of API usage among these code models are crucial +for achieving desirable program functionalities, requiring them to learn +various API fully qualified names structurally and semantically. Recent studies +reveal that even state-of-the-art pre-trained code models struggle with +suggesting the correct APIs during code generation. However, the reasons for +such poor API usage performance are barely investigated. To address this +challenge, we propose using knowledge probing as a means of interpreting code +models, which uses cloze-style tests to measure the knowledge stored in models. +Our comprehensive study examines a code model's capability of understanding API +fully qualified names from two different perspectives: API call and API import. +Specifically, we reveal that current code models struggle with understanding +API names, with pre-training strategies significantly affecting the quality of +API name learning. We demonstrate that natural language context can assist code +models in locating Python API names and in generalizing Python API name knowledge to +unseen data. Our findings provide insights into the limitations and +capabilities of current pre-trained code models, and suggest that incorporating +API structure into the pre-training process can improve automated API usage and +code representations. This work offers guidance for advancing code +intelligence practices and directions for future studies. All experiment +results, data, and source code used in this work are available at +\url{https://doi.org/10.5281/zenodo.7902072}. +
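+ [Illustrative sketch, not the paper's probe suite] Cloze-style probing of API
+ name knowledge can be pictured as masking part of a fully qualified name and
+ asking a masked code LM to fill it in; the checkpoint name below is an
+ assumption and any masked code model could be substituted.
+
+ # Sketch of a cloze-style probe for API name knowledge in a masked code LM.
+ from transformers import pipeline
+
+ fill = pipeline("fill-mask", model="microsoft/codebert-base-mlm")
+ probe = f"import numpy as np\nx = np.{fill.tokenizer.mask_token}(10)"
+ for cand in fill(probe, top_k=5):
+     print(f"{cand['token_str']:>12}  p={cand['score']:.3f}")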
+
+
+
+
+ + ☆ The Dynamical Principles of Storytelling + + +
+ When considering the opening part of 1800 short stories, we find that the +first dozen paragraphs of the average narrative follow an action principle as +defined in arXiv:2309.06600. When the order of the paragraphs is shuffled, the +average no longer exhibits this property. The findings show that there is a +preferential direction we take in semantic space when starting a story, +possibly related to a common Western storytelling tradition as implied by +Aristotle in Poetics. + +
+
+ comment: 6 pages, 4 figures, 3 tables +
+
+
+
+
+ + ☆ Improving Multimodal Classification of Social Media Posts by Leveraging + Image-Text Auxiliary tasks + + +
+ Effectively leveraging multimodal information from social media posts is +essential to various downstream tasks such as sentiment analysis, sarcasm +detection and hate speech classification. However, combining text and image +information is challenging because of the idiosyncratic cross-modal semantics +with hidden or complementary information present in matching image-text pairs. +In this work, we aim to directly model this by proposing the use of two +auxiliary losses jointly with the main task when fine-tuning any pre-trained +multimodal model. Image-Text Contrastive (ITC) brings image-text +representations of a post closer together and separates them from different +posts, capturing underlying dependencies. Image-Text Matching (ITM) facilitates +the understanding of semantic correspondence between images and text by +penalizing unrelated pairs. We combine these objectives with five multimodal +models, demonstrating consistent improvements across four popular social media +datasets. Furthermore, through detailed analysis, we shed light on the specific +scenarios and cases where each auxiliary task proves to be most effective. + +
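+ [Simplified sketch, not the authors' code] The two auxiliary objectives can be
+ written roughly as an in-batch InfoNCE loss over image/text embeddings (ITC)
+ and a binary classifier over fused pair representations (ITM); tensor shapes
+ and the fusion head below are assumptions.
+
+ # Sketch of Image-Text Contrastive (ITC) and Image-Text Matching (ITM) losses.
+ import torch
+ import torch.nn.functional as F
+
+ def itc_loss(img_emb, txt_emb, temperature=0.07):
+     """In-batch contrastive loss; img_emb, txt_emb: (B, D)."""
+     img = F.normalize(img_emb, dim=-1)
+     txt = F.normalize(txt_emb, dim=-1)
+     logits = img @ txt.t() / temperature
+     targets = torch.arange(img.size(0), device=img.device)
+     return (F.cross_entropy(logits, targets) +
+             F.cross_entropy(logits.t(), targets)) / 2
+
+ def itm_loss(pair_repr, is_match, head):
+     """Binary matching loss; pair_repr: (B, D) fused image-text features."""
+     logits = head(pair_repr).squeeze(-1)
+     return F.binary_cross_entropy_with_logits(logits, is_match.float())
+
+ # Usage sketch with random tensors standing in for encoder outputs.
+ B, D = 8, 256
+ head = torch.nn.Linear(D, 1)
+ loss = itc_loss(torch.randn(B, D), torch.randn(B, D)) + \
+        itm_loss(torch.randn(B, D), torch.randint(0, 2, (B,)), head)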
+
+
+
+
+ + ☆ Usability Evaluation of Spoken Humanoid Embodied Conversational Agents + in Mobile Serious Games + + +
+ This paper presents an empirical investigation of the extent to which spoken +Humanoid Embodied Conversational Agents (HECAs) can foster usability in mobile +serious game (MSG) applications. The aim of the research is to assess the +impact of multiple agents and illusion of humanness on the quality of the +interaction. The experiment investigates two styles of agent presentation: an +agent of high human-likeness (HECA) and an agent of low human-likeness (text). +The purpose of the experiment is to assess whether and how agents of high +humanlikeness can evoke the illusion of humanness and affect usability. Agents +of high human-likeness were designed by following the ECA design model that is +a proposed guide for ECA development. The results of the experiment with 90 +participants show that users prefer to interact with the HECAs. The difference +between the two versions is statistically significant with a large effect size +(d=1.01), with many of the participants justifying their choice by saying that +the human-like characteristics of the HECA made the version more appealing. +This research provides key information on the potential effect of HECAs on +serious games, which can provide insight into the design of future mobile +serious games. + +
+
+ comment: 45 pages, 9 figures, 14 tables +
+
+
+
+
+ + ☆ Echotune: A Modular Extractor Leveraging the Variable-Length Nature of + Speech in ASR Tasks + + +
+ The Transformer architecture has proven to be highly effective for Automatic +Speech Recognition (ASR) tasks, becoming a foundational component for a +plethora of research in the domain. Historically, many approaches have leaned +on fixed-length attention windows, which become problematic for speech samples +that vary in duration and complexity, leading to data over-smoothing and neglect +of essential long-term connectivity. Addressing this limitation, we introduce +Echo-MSA, a nimble module equipped with a variable-length attention mechanism +that accommodates a range of speech sample complexities and durations. This +module offers the flexibility to extract speech features across various +granularities, spanning from frames and phonemes to words and discourse. The +proposed design captures the variable-length nature of speech and addresses +the limitations of fixed-length attention. Our evaluation leverages a parallel +attention architecture complemented by a dynamic gating mechanism that +amalgamates traditional attention with the Echo-MSA module output. Empirical +evidence from our study reveals that integrating Echo-MSA into the primary +model's training regime significantly enhances the word error rate (WER) +performance, all while preserving the intrinsic stability of the original +model. +
+
+
+
+
+ + ☆ PROGrasp: Pragmatic Human-Robot Communication for Object Grasping + + +
+ Interactive Object Grasping (IOG) is the task of identifying and grasping the +desired object via human-robot natural language interaction. Current IOG +systems assume that a human user initially specifies the target object's +category (e.g., bottle). Inspired by pragmatics, where humans often convey +their intentions by relying on context to achieve goals, we introduce a new IOG +task, Pragmatic-IOG, and the corresponding dataset, Intention-oriented +Multi-modal Dialogue (IM-Dial). In our proposed task scenario, an +intention-oriented utterance (e.g., "I am thirsty") is initially given to the +robot. The robot should then identify the target object by interacting with a +human user. Based on the task setup, we propose a new robotic system that can +interpret the user's intention and pick up the target object, Pragmatic Object +Grasping (PROGrasp). PROGrasp performs Pragmatic-IOG by incorporating modules +for visual grounding, question asking, object grasping, and most importantly, +answer interpretation for pragmatic inference. Experimental results show that +PROGrasp is effective in offline (i.e., target object discovery) and online +(i.e., IOG with a physical robot arm) settings. + +
+
+ comment: 7 pages, 6 figures +
+
+
+
+
+ + ☆ Generative AI Text Classification using Ensemble LLM Approaches + + +
+ Large Language Models (LLMs) have shown impressive performance across a +variety of Artificial Intelligence (AI) and natural language processing tasks, +such as content creation, report generation, etc. However, unregulated malign +application of these models can create undesirable consequences such as +generation of fake news, plagiarism, etc. As a result, accurate detection of +AI-generated language can be crucial in the responsible usage of LLMs. In this +work, we explore 1) whether a certain body of text is AI generated or written +by a human, and 2) attribution of a specific language model in generating a body +of text. Texts in both English and Spanish are considered. The datasets used in +this study are provided as part of the Automated Text Identification +(AuTexTification) shared task. For each of the research objectives stated +above, we propose an ensemble neural model that generates probabilities from +different pre-trained LLMs, which are then used as features for a Traditional Machine +Learning (TML) classifier. For the first task of distinguishing +between AI- and human-generated text, our model ranked in fifth and thirteenth +place (with macro $F1$ scores of 0.733 and 0.649) for English and Spanish +texts, respectively. For the second task on model attribution, our model ranked +in first place with macro $F1$ scores of 0.625 and 0.653 for English and +Spanish texts, respectively. +
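+ [Illustrative sketch, not the submitted system] The ensemble idea reads as:
+ concatenate per-example class probabilities from several LLM classifiers into a
+ feature vector and fit a traditional classifier on top; the random arrays below
+ merely stand in for real model outputs.
+
+ # Sketch: LLM-derived probabilities as features for a classical classifier.
+ import numpy as np
+ from sklearn.linear_model import LogisticRegression
+
+ rng = np.random.default_rng(0)
+ n = 200
+ probs_model_a = rng.dirichlet([1, 1], size=n)   # P(human), P(AI) from model A
+ probs_model_b = rng.dirichlet([1, 1], size=n)   # same, from model B
+ features = np.hstack([probs_model_a, probs_model_b])
+ labels = rng.integers(0, 2, size=n)             # 0 = human, 1 = AI-generated
+
+ clf = LogisticRegression(max_iter=1000).fit(features, labels)
+ print("train accuracy:", clf.score(features, labels))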
+
+
+
+
+ + ☆ The complementary roles of non-verbal cues for Robust Pronunciation + Assessment ICASSP 2024 + + +
+ Research on pronunciation assessment systems focuses on utilizing phonetic +and phonological aspects of non-native (L2) speech, often neglecting the rich +layer of information hidden within the non-verbal cues. In this study, we +propose a novel pronunciation assessment framework, IntraVerbalPA. The +framework innovatively incorporates both fine-grained frame- and abstract +utterance-level non-verbal cues, alongside the conventional speech and phoneme +representations. Additionally, we introduce the ''Goodness of phonemic-duration'' +metric to effectively model the duration distribution within the framework. Our +results validate the effectiveness of the proposed IntraVerbalPA framework and +its individual components, yielding performance that either matches or +outperforms existing research. +
+
+ comment: 5 pages, submitted to ICASSP 2024 +
+
+
+
+
+ + ☆ Explaining Speech Classification Models via Word-Level Audio Segments + and Paralinguistic Features + + +
+ Recent advances in eXplainable AI (XAI) have provided new insights into how +models for vision, language, and tabular data operate. However, few approaches +exist for understanding speech models. Existing work focuses on a few spoken +language understanding (SLU) tasks, and explanations are difficult to interpret +for most users. We introduce a new approach to explain speech classification +models. We generate easy-to-interpret explanations via input perturbation on +two information levels. 1) Word-level explanations reveal how each word-related +audio segment impacts the outcome. 2) Paralinguistic features (e.g., prosody +and background noise) answer the counterfactual: ``What would the model +prediction be if we edited the audio signal in this way?'' We validate our +approach by explaining two state-of-the-art SLU models on two speech +classification tasks in English and Italian. Our findings demonstrate that the +explanations are faithful to the model's inner workings and plausible to +humans. Our method and findings pave the way for future research on +interpreting speech models. + +
+
+ comment: 8 pages +
+
+
+
+
+ + ☆ PerPLM: Personalized Fine-tuning of Pretrained Language Models via + Writer-specific Intermediate Learning and Prompts + + +
+ The meanings of words and phrases depend not only on where they are used +(contexts) but also on who uses them (writers). Pretrained language models +(PLMs) are powerful tools for capturing context, but they are typically +pretrained and fine-tuned for universal use across different writers. This +study aims to improve the accuracy of text understanding tasks by personalizing +the fine-tuning of PLMs for specific writers. We focus on a general setting +where only the plain text from target writers is available for +personalization. To avoid the cost of fine-tuning and storing multiple copies +of PLMs for different users, we exhaustively explore using writer-specific +prompts to personalize a unified PLM. Since the design and evaluation of these +prompts is an underdeveloped area, we introduce and compare different types of +prompts that are possible in our setting. To maximize the potential of +prompt-based personalized fine-tuning, we propose a personalized intermediate +learning method based on masked language modeling to extract task-independent traits +of writers' text. Our experiments, using multiple tasks, datasets, and PLMs, +reveal the nature of different prompts and the effectiveness of our +intermediate learning approach. +
+
+ comment: 11 pages +
+
+
+
+
+ + ☆ L1-aware Multilingual Mispronunciation Detection Framework ICASSP 2024 + + +
+ The phonological discrepancies between a speaker's native (L1) and +non-native (L2) language serve as a major factor in mispronunciation. This +paper introduces a novel multilingual MDD architecture, L1-MultiMDD, enriched +with L1-aware speech representation. An end-to-end speech encoder is trained on +the input signal and its corresponding reference phoneme sequence. First, an +attention mechanism is deployed to align the input audio with the reference +phoneme sequence. Afterwards, the L1-L2 speech embeddings are extracted from an +auxiliary model, pretrained in a multi-task setup to identify the L1 and L2 +languages, and are infused into the primary network. Finally, L1-MultiMDD is +optimized for a unified multilingual phoneme recognition task using the +connectionist temporal classification (CTC) loss for the target languages: +English, Arabic, and Mandarin. Our experiments demonstrate the effectiveness of +the proposed L1-MultiMDD framework on both seen -- L2-ARTIC, LATIC, and +AraVoiceL2v2; and unseen -- EpaDB and Speechocean762 datasets. The consistent +gains in PER and false rejection rate (FRR) across all target languages +confirm our approach's robustness, efficacy, and generalizability. +
+
+ comment: 5 pages, submitted to ICASSP 2024
+
+
+
+
+ + ☆ CoLLD: Contrastive Layer-to-layer Distillation for Compressing + Multilingual Pre-trained Speech Encoders ICASSP 2024 + + +
+ Large-scale self-supervised pre-trained speech encoders outperform +conventional approaches in speech recognition and translation tasks. Due to the +high cost of developing these large models, building new encoders for new tasks +and deploying them to on-device applications are infeasible. Prior studies +propose model compression methods to address this issue, but those works focus +on smaller models and less realistic tasks. Thus, we propose Contrastive +Layer-to-layer Distillation (CoLLD), a novel knowledge distillation method to +compress pre-trained speech encoders by leveraging masked prediction and +contrastive learning to train student models to copy the behavior of a large +teacher model. CoLLD outperforms prior methods and closes the gap between small +and large models on multilingual speech-to-text translation and recognition +benchmarks. + +
+
+ comment: Submitted to ICASSP 2024 +
+
+
+
+
+ + ☆ Tree of Uncertain Thoughts Reasoning for Large Language Models + + +
+ While the recently introduced Tree of Thoughts (ToT) has heralded +advancements in allowing Large Language Models (LLMs) to reason through +foresight and backtracking for global decision-making, it has overlooked the +inherent local uncertainties in intermediate decision points or "thoughts". +These local uncertainties, intrinsic to LLMs given their potential for diverse +responses, remain a significant concern in the reasoning process. Addressing +this pivotal gap, we introduce the Tree of Uncertain Thoughts (TouT) - a +reasoning framework tailored for LLMs. Our TouT effectively leverages Monte +Carlo Dropout to quantify uncertainty scores associated with LLMs' diverse +local responses at these intermediate steps. By marrying this local uncertainty +quantification with global search algorithms, TouT enhances the model's +precision in response generation. We substantiate our approach with rigorous +experiments on two demanding planning tasks: Game of 24 and Mini Crosswords. +The empirical evidence underscores TouT's superiority over both ToT and +chain-of-thought prompting methods. + +
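+ [Toy sketch, not the paper's implementation] The uncertainty-scoring step can
+ be illustrated with a small value network: evaluate each candidate thought
+ several times with dropout kept active and rank thoughts by mean value minus
+ predictive spread. The network and thought features below are stand-ins for an
+ LLM-based state evaluator.
+
+ # Sketch: Monte Carlo Dropout uncertainty scores for intermediate "thoughts".
+ import torch
+ import torch.nn as nn
+
+ value_net = nn.Sequential(nn.Linear(16, 32), nn.ReLU(),
+                           nn.Dropout(p=0.2), nn.Linear(32, 1))
+
+ def mc_dropout_score(thought_features, n_samples=20):
+     """Return (mean value, predictive std) with dropout active at inference."""
+     value_net.train()                      # keep dropout on
+     with torch.no_grad():
+         vals = torch.stack([value_net(thought_features)
+                             for _ in range(n_samples)])
+     return vals.mean().item(), vals.std().item()
+
+ candidates = [torch.randn(16) for _ in range(3)]
+ scored = [mc_dropout_score(c) for c in candidates]
+ best = max(range(len(candidates)), key=lambda i: scored[i][0] - scored[i][1])
+ print("chosen thought:", best, "scores:", scored)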
+
+
+
+
+ + ☆ Detecting ChatGPT: A Survey of the State of Detecting ChatGPT-Generated + Text + + +
+ While recent advancements in the capabilities and widespread accessibility of +generative language models, such as ChatGPT (OpenAI, 2022), have brought about +various benefits by generating fluent human-like text, the task of +distinguishing between human- and large language model (LLM) generated text has +emerged as a crucial problem. These models can potentially deceive by +generating artificial text that appears to be human-generated. This issue is +particularly significant in domains such as law, education, and science, where +ensuring the integrity of text is of the utmost importance. This survey +provides an overview of the current approaches employed to differentiate +between texts generated by humans and ChatGPT. We present an account of the +different datasets constructed for detecting ChatGPT-generated text, the +various methods utilized, and the qualitative analyses that have been performed on +the characteristics of human- versus ChatGPT-generated text, and finally summarize +our findings into general insights. +
+
+ comment: Published in the Proceedings of the Student Research Workshop + associated with RANLP-2023 +
+
+
+
+
+ + ☆ Assessing the nature of large language models: A caution against + anthropocentrism + + +
+ Generative AI models have garnered a large amount of public attention and +speculation with the release of OpenAI's chatbot, ChatGPT. At least two opinion +camps exist: one excited about the possibilities these models offer for fundamental +changes to human tasks, and another highly concerned about the power these models +seem to have. To address these concerns, we assessed GPT-3.5 using standard, +normed, and validated cognitive and personality measures. For this seedling +project, we developed a battery of tests that allowed us to estimate the +boundaries of some of these models' capabilities, how stable those capabilities +are over a short period of time, and how they compare to humans. + Our results indicate that GPT-3.5 is unlikely to have developed sentience, +although its ability to respond to personality inventories is interesting. It +did display large variability in both cognitive and personality measures over +repeated observations, which would not be expected if it had a human-like +personality. Variability notwithstanding, GPT-3.5 displays what in a human would +be considered poor mental health, including low self-esteem and marked +dissociation from reality despite upbeat and helpful responses. +
+
+ comment: 30 pages, 6 figures +
+
+
+
+
+ + ☆ A Conversation is Worth A Thousand Recommendations: A Survey of Holistic + Conversational Recommender Systems RecSys 2023 + + +
+ Conversational recommender systems (CRS) generate recommendations through an +interactive process. However, not all CRS approaches use human conversations as +their source of interaction data; the majority of prior CRS work simulates +interactions by exchanging entity-level information. As a result, claims of +prior CRS work do not generalise to real-world settings where conversations +take unexpected turns, or where conversational and intent understanding is not +perfect. To tackle this challenge, the research community has started to +examine holistic CRS, which are trained using conversational data collected +from real-world scenarios. Despite their emergence, such holistic approaches +are under-explored. + We present a comprehensive survey of holistic CRS methods by summarizing the +literature in a structured manner. Our survey recognises holistic CRS +approaches as having three components: 1) a backbone language model, the +optional use of 2) external knowledge, and/or 3) external guidance. We also +give a detailed analysis of CRS datasets and evaluation methods in real +application scenarios. We offer our insight as to the current challenges of +holistic CRS and possible future trends. + +
+
+ comment: Accepted by 5th KaRS Workshop @ ACM RecSys 2023, 8 pages +
+
+
+
+
+ + ☆ Aligning Speakers: Evaluating and Visualizing Text-based Diarization + Using Efficient Multiple Sequence Alignment (Extended Version) ICTAI + + +
+ This paper presents a novel evaluation approach to text-based speaker +diarization (SD), tackling the limitations of traditional metrics that do not +account for any contextual information in text. Two new metrics are proposed, +Text-based Diarization Error Rate and Diarization F1, which perform utterance- +and word-level evaluations by aligning tokens in reference and hypothesis +transcripts. Our metrics encompass more types of errors compared to existing +ones, allowing us to make a more comprehensive analysis in SD. To align tokens, +a multiple sequence alignment algorithm is introduced that supports multiple +sequences in the reference while handling high-dimensional alignment to the +hypothesis using dynamic programming. Our work is packaged into two tools, +align4d providing an API for our alignment algorithm and TranscribeView for +visualizing and evaluating SD errors, which can greatly aid in the creation of +high-quality data, fostering the advancement of dialogue systems. + +
+
+ comment: Accepted to the 35th IEEE International Conference on Tools with + Artificial Intelligence (ICTAI) 2023 +
+
+
+
+
+ + ☆ Automatic Data Visualization Generation from Chinese Natural Language + Questions + + +
+ Data visualization has emerged as an effective tool for getting insights from +massive datasets. Due to the difficulty of manipulating the programming languages +of data visualization, automatic data visualization generation from natural +languages (Text-to-Vis) is becoming increasingly popular. Despite the plethora +of research effort on English Text-to-Vis, studies have yet to be conducted +on data visualization generation from questions in Chinese. Motivated by this, +we propose a Chinese Text-to-Vis dataset in this paper and demonstrate our first +attempt to tackle this problem. Our model integrates multilingual BERT as the +encoder, boosts the cross-lingual ability, and infuses the $n$-gram information +into our word representation learning. Our experimental results show that our +dataset is challenging and deserves further research. +
+
+
+
+
+ + ☆ Incorporating Class-based Language Model for Named Entity Recognition in + Factorized Neural Transducer + + +
+ In spite of the excellent strides made by end-to-end (E2E) models in speech +recognition in recent years, named entity recognition is still challenging but +critical for semantic understanding. In order to enhance the ability to +recognize named entities in E2E models, previous studies mainly focus on +various rule-based or attention-based contextual biasing algorithms. However, +their performance might be sensitive to the biasing weight or degraded by +excessive attention to the named entity list, along with a risk of false +triggering. Inspired by the success of the class-based language model (LM) in +named entity recognition in conventional hybrid systems and the effective +decoupling of acoustic and linguistic information in the factorized neural +Transducer (FNT), we propose a novel E2E model to incorporate class-based LMs +into FNT, which is referred to as C-FNT. In C-FNT, the language model score of +named entities can be associated with the name class instead of its surface +form. The experimental results show that our proposed C-FNT achieves +significant error reduction on named entities without hurting performance in +general word recognition. +
+
+
+
+
+ + ☆ Dynamic MOdularized Reasoning for Compositional Structured Explanation + Generation + + +
+ Despite the success of neural models in solving reasoning tasks, their +compositional generalization capabilities remain unclear. In this work, we +propose a new setting of the structured explanation generation task to +facilitate compositional reasoning research. Previous works found that symbolic +methods achieve superior compositionality by using pre-defined inference rules +for iterative reasoning. But these approaches rely on brittle symbolic +transfers and are restricted to well-defined tasks. Hence, we propose a dynamic +modularized reasoning model, MORSE, to improve the compositional generalization +of neural models. MORSE factorizes the inference process into a combination of +modules, where each module represents a functional unit. Specifically, we adopt +modularized self-attention to dynamically select and route inputs to dedicated +heads, which specializes them to specific functions. We conduct experiments for +increasing lengths and shapes of reasoning trees on two benchmarks to test +MORSE's compositional generalization abilities, and find it outperforms +competitive baselines. Model ablation and deeper analyses show the +effectiveness of dynamic reasoning modules and their generalization abilities. + +
+
+
+
+
+ + ☆ Zero-shot Audio Topic Reranking using Large Language Models + + +
+ The Multimodal Video Search by Examples (MVSE) project investigates using +video clips as the query term for information retrieval, rather than the more +traditional text query. This enables far richer search modalities such as +images, speaker, content, topic, and emotion. A key element for this process is +highly rapid, flexible, search to support large archives, which in MVSE is +facilitated by representing video attributes by embeddings. This work aims to +mitigate any performance loss from this rapid archive search by examining +reranking approaches. In particular, zero-shot reranking methods using large +language models are investigated as these are applicable to any video archive +audio content. Performance is evaluated for topic-based retrieval on a publicly +available video archive, the BBC Rewind corpus. Results demonstrate that +reranking can achieve improved retrieval ranking without the need for any +task-specific training data. + +
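+ [Minimal sketch, not the project's setup] Zero-shot LLM reranking amounts to
+ rescoring the fast first-pass retrieval results with a relevance prompt; the
+ llm_relevance placeholder below stands in for a real LLM call and prompt.
+
+ # Sketch: zero-shot topical reranking of retrieved audio transcripts.
+ def llm_relevance(query: str, transcript: str) -> float:
+     """Placeholder for prompting an LLM to rate topical relevance (0-10);
+     here a trivial word-overlap score keeps the example self-contained."""
+     return float(len(set(query.lower().split()) & set(transcript.lower().split())))
+
+ def rerank(query, candidates, top_k=10):
+     """candidates: list of (doc_id, transcript) from the embedding search."""
+     scored = [(doc_id, llm_relevance(query, text)) for doc_id, text in candidates]
+     return sorted(scored, key=lambda x: x[1], reverse=True)[:top_k]
+
+ hits = [("clip1", "the prime minister discussed the economy"),
+         ("clip2", "a report on local football results")]
+ print(rerank("economy and public spending", hits, top_k=2))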
+
+
+
+
+ + ☆ Detecting Misinformation with LLM-Predicted Credibility Signals and Weak + Supervision + + +
+ Credibility signals represent a wide range of heuristics that are typically +used by journalists and fact-checkers to assess the veracity of online content. +Automating the task of credibility signal extraction, however, is very +challenging as it requires high-accuracy signal-specific extractors to be +trained, while there are currently no sufficiently large datasets annotated +with all credibility signals. This paper investigates whether large language +models (LLMs) can be prompted effectively with a set of 18 credibility signals +to produce weak labels for each signal. We then aggregate these potentially +noisy labels using weak supervision in order to predict content veracity. We +demonstrate that our approach, which combines zero-shot LLM credibility signal +labeling and weak supervision, outperforms state-of-the-art classifiers on two +misinformation datasets without using any ground-truth labels for training. We +also analyse the contribution of the individual credibility signals towards +predicting content veracity, which provides new valuable insights into their +role in misinformation detection. + +
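+ [Simplified stand-in, not the paper's label model] Each prompted credibility
+ signal yields a weak vote (credible / not credible / abstain); real systems fit
+ a weak-supervision label model over these votes, but a weighted vote is enough
+ to make the aggregation step concrete. Signal names are illustrative.
+
+ # Sketch: aggregate weak labels produced by LLM-prompted credibility signals.
+ ABSTAIN = None
+
+ def aggregate(signal_votes, weights=None):
+     """signal_votes: dict signal -> 1 (credible), 0 (not credible), or ABSTAIN."""
+     weights = weights or {s: 1.0 for s in signal_votes}
+     num = sum(weights[s] * v for s, v in signal_votes.items() if v is not ABSTAIN)
+     den = sum(weights[s] for s, v in signal_votes.items() if v is not ABSTAIN)
+     return None if den == 0 else int(num / den >= 0.5)
+
+ votes = {"evidence_cited": 1, "clickbait_title": 0,
+          "expert_quoted": 1, "emotional_language": ABSTAIN}
+ print("predicted veracity:", aggregate(votes))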
+
+
+
+
+ + ☆ C-Pack: Packaged Resources To Advance General Chinese Embedding + + +
+ We introduce C-Pack, a package of resources that significantly advance the +field of general Chinese embeddings. C-Pack includes three critical resources. +1) C-MTEB is a comprehensive benchmark for Chinese text embeddings covering 6 +tasks and 35 datasets. 2) C-MTP is a massive text embedding dataset curated +from labeled and unlabeled Chinese corpora for training embedding models. 3) +C-TEM is a family of embedding models covering multiple sizes. Our models +outperform all prior Chinese text embeddings on C-MTEB by up to +10% upon the +time of the release. We also integrate and optimize the entire suite of +training methods for C-TEM. Along with our resources on general Chinese +embedding, we release our data and models for English text embeddings. The +English models achieve state-of-the-art performance on MTEB benchmark; +meanwhile, our released English data is 2 times larger than the Chinese data. +All these resources are made publicly available at +https://github.com/FlagOpen/FlagEmbedding. + +
+
+
+
+
+ + ☆ Revisiting Supertagging for HPSG + + +
+ We present new supertaggers trained on HPSG-based treebanks. These treebanks +feature high-quality annotation based on a well-developed linguistic theory and +include diverse and challenging test datasets, beyond the usual WSJ section 23 +and Wikipedia data. HPSG supertagging has previously relied on MaxEnt-based +models. We use SVM and neural CRF- and BERT-based methods and show that both +SVM and neural supertaggers achieve considerably higher accuracy compared to +the baseline. Our fine-tuned BERT-based tagger achieves 97.26% accuracy on 1000 +sentences from WSJ23 and 93.88% on the completely out-of-domain The Cathedral +and the Bazaar (cb). We conclude that it therefore makes sense to integrate +these new supertaggers into modern HPSG parsers, and we also hope that the +diverse and difficult datasets we used here will gain more popularity in the +field. We contribute the complete dataset reformatted for token classification. +
+
+ comment: 9 pages, 0 figures +
+
+
+
+
+ + ☆ Adaptive Prompt Learning with Distilled Connective Knowledge for + Implicit Discourse Relation Recognition + + +
+ Implicit discourse relation recognition (IDRR) aims at recognizing the +discourse relation between two text segments without an explicit connective. +Recently, prompt learning has been applied to the IDRR task with great +performance improvements over various neural network-based approaches. However, +the discrete nature of the state-of-the-art prompting approach requires manual +design of templates and answers, a big hurdle for its practical application. +In this paper, we propose a continuous version of prompt learning together with +connective knowledge distillation, called AdaptPrompt, to reduce manual design +efforts via continuous prompting while further improving performance via +knowledge transfer. In particular, we design and train a few virtual tokens to +form continuous templates and automatically select the most suitable one by +gradient search in the embedding space. We also design an answer-relation +mapping rule to generate a few virtual answers as the answer space. +Furthermore, we notice the importance of annotated connectives in the training +dataset and design a teacher-student architecture for knowledge transfer. +Experiments on the up-to-date PDTB Corpus V3.0 validate our design objectives +in terms of better relation recognition performance over +state-of-the-art competitors. +
+
+
+
+
+ + ☆ DBLPLink: An Entity Linker for the DBLP Scholarly Knowledge Graph ISWC + + +
+ In this work, we present a web application named DBLPLink, which performs +entity linking over the DBLP scholarly knowledge graph. DBLPLink uses +text-to-text pre-trained language models, such as T5, to produce entity label +spans from an input text question. Entity candidates are fetched from a +database based on the labels, and an entity re-ranker sorts them based on +entity embeddings, such as TransE, DistMult and ComplEx. The results are +displayed so that users may compare and contrast the results between T5-small, +T5-base and the different KG embeddings used. The demo can be accessed at +https://ltdemos.informatik.uni-hamburg.de/dblplink/. + +
+
+ comment: Accepted at International Semantic Web Conference (ISWC) 2023 Posters + & Demo Track +
+
+
+
+
+ + ☆ Direct Text to Speech Translation System using Acoustic Units + + +
+ This paper proposes a direct text to speech translation system using discrete +acoustic units. This framework employs text in different source languages as +input to generate speech in the target language without the need for text +transcriptions in this language. Motivated by the success of acoustic units in +previous works for direct speech to speech translation systems, we use the same +pipeline to extract the acoustic units using a speech encoder combined with a +clustering algorithm. Once units are obtained, an encoder-decoder architecture +is trained to predict them. Then a vocoder generates speech from units. Our +approach for direct text to speech translation was tested on the new CVSS +corpus with two different text mBART models employed as initialisation. The +systems presented report competitive performance for most of the language pairs +evaluated. Besides, results show a remarkable improvement when initialising our +proposed architecture with a model pre-trained with more languages. + +
+
+ comment: 5 pages, 4 figures +
+
+
+
+
+ + ☆ Are Large Language Model-based Evaluators the Solution to Scaling Up + Multilingual Evaluation? + + +
+ Large Language Models (LLMs) have demonstrated impressive performance on +Natural Language Processing (NLP) tasks, such as Question Answering, +Summarization, and Classification. The use of LLMs as evaluators, that can rank +or score the output of other models (usually LLMs) has become increasingly +popular, due to the limitations of current evaluation techniques including the +lack of appropriate benchmarks, metrics, cost, and access to human annotators. +While LLMs are capable of handling approximately 100 languages, the majority of +languages beyond the top 20 lack systematic evaluation across various tasks, +metrics, and benchmarks. This creates an urgent need to scale up multilingual +evaluation to ensure a precise understanding of LLM performance across diverse +languages. LLM-based evaluators seem like the perfect solution to this problem, +as they do not require human annotators, human-created references, or +benchmarks and can theoretically be used to evaluate any language covered by +the LLM. In this paper, we investigate whether LLM-based evaluators can help +scale up multilingual evaluation. Specifically, we calibrate LLM-based +evaluation against 20k human judgments of five metrics across three +text-generation tasks in eight languages. Our findings indicate that LLM-based +evaluators may exhibit bias towards higher scores and should be used with +caution and should always be calibrated with a dataset of native speaker +judgments, particularly in low-resource and non-Latin script languages. + +
+
+
+
+
+ + ☆ SIB-200: A Simple, Inclusive, and Big Evaluation Dataset for Topic + Classification in 200+ Languages and Dialects + + +
+ Despite the progress we have recorded in the last few years in multilingual +natural language processing, evaluation is typically limited to a small set of +languages with available datasets, which excludes a large number of low-resource +languages. In this paper, we created SIB-200 -- a large-scale open-sourced +benchmark dataset for topic classification in 200 languages and dialects to +address the lack of evaluation datasets for Natural Language Understanding +(NLU). For many of the languages covered in SIB-200, this is the first publicly +available evaluation dataset for NLU. The dataset is based on the Flores-200 +machine translation corpus. We annotated the English portion of the dataset and +extended the sentence-level annotation to the remaining 203 languages covered +in the corpus. Despite the simplicity of this task, our evaluations in the +fully supervised setting, the cross-lingual transfer setting, and the large +language model prompting setting show that there is still a large gap between the +performance of high-resource and low-resource languages when multilingual +evaluation is scaled to numerous world languages. We found that languages +unseen during the pre-training of multilingual language models, +under-represented language families (like Nilotic and Atlantic-Congo), and +languages from the regions of Africa, the Americas, Oceania, and South East Asia +often have the lowest performance on our topic classification dataset. We hope +our dataset will encourage a more inclusive evaluation of multilingual language +models on a more diverse set of languages. https://github.com/dadelani/sib-200 +
+
+ comment: under submission +
+
+
+
+
+ + ☆ Clinical Text Summarization: Adapting Large Language Models Can + Outperform Human Experts + + +
+ Sifting through vast textual data and summarizing key information imposes a +substantial burden on how clinicians allocate their time. Although large +language models (LLMs) have shown immense promise in natural language +processing (NLP) tasks, their efficacy across diverse clinical summarization +tasks has not yet been rigorously examined. In this work, we employ domain +adaptation methods on eight LLMs, spanning six datasets and four distinct +summarization tasks: radiology reports, patient questions, progress notes, and +doctor-patient dialogue. Our thorough quantitative assessment reveals +trade-offs between models and adaptation methods in addition to instances where +recent advances in LLMs may not lead to improved results. Further, in a +clinical reader study with six physicians, we depict that summaries from the +best adapted LLM are preferable to human summaries in terms of completeness and +correctness. Our ensuing qualitative analysis delineates mutual challenges +faced by both LLMs and human experts. Lastly, we correlate traditional +quantitative NLP metrics with reader study scores to enhance our understanding +of how these metrics align with physician preferences. Our research marks the +first evidence of LLMs outperforming human experts in clinical text +summarization across multiple tasks. This implies that integrating LLMs into +clinical workflows could alleviate documentation burden, empowering clinicians +to focus more on personalized patient care and other irreplaceable human +aspects of medicine. + +
+
+ comment: 23 pages, 22 figures +
+
+
+
+
+ + ☆ Semantic Parsing in Limited Resource Conditions + + +
+ This thesis explores challenges in semantic parsing, specifically focusing on +scenarios with limited data and computational resources. It offers solutions +using techniques like automatic data curation, knowledge transfer, active +learning, and continual learning. + For tasks with no parallel training data, the thesis proposes generating +synthetic training examples from structured database schemas. When there is +abundant data in a source domain but limited parallel data in a target domain, +knowledge from the source is leveraged to improve parsing in the target domain. + For multilingual situations with limited data in the target languages, the +thesis introduces a method to adapt parsers using a limited human translation +budget. Active learning is applied to select source-language samples for manual +translation, maximizing parser performance in the target language. In addition, +an alternative method is also proposed to utilize machine translation services, +supplemented by human-translated data, to train a more effective parser. + When computational resources are limited, a continual learning approach is +introduced to minimize training time and computational memory. This maintains +the parser's efficiency in previously learned tasks while adapting it to new +tasks, mitigating the problem of catastrophic forgetting. + Overall, the thesis provides a comprehensive set of methods to improve +semantic parsing in resource-constrained conditions. + +
+
+ comment: PhD thesis, year of award 2023, 172 pages +
+
+
+
+
+ + ☆ ChatGPT MT: Competitive for High- (but not Low-) Resource Languages + + +
+ Large language models (LLMs) implicitly learn to perform a range of language +tasks, including machine translation (MT). Previous studies explore aspects of +LLMs' MT capabilities. However, there exist a wide variety of languages for +which recent LLM MT performance has never before been evaluated. Without +published experimental evidence on the matter, it is difficult for speakers of +the world's diverse languages to know how and whether they can use LLMs for +their languages. We present the first experimental evidence for an expansive +set of 204 languages, along with MT cost analysis, using the FLORES-200 +benchmark. Trends reveal that GPT models approach or exceed traditional MT +model performance for some high-resource languages (HRLs) but consistently lag +for low-resource languages (LRLs), under-performing traditional MT for 84.1% of +languages we covered. Our analysis reveals that a language's resource level is +the most important feature in determining ChatGPT's relative ability to +translate it, and suggests that ChatGPT is especially disadvantaged for LRLs +and African languages. + +
+
+ comment: 27 pages, 9 figures, 14 tables +
+
+
+
+
+ + ☆ PromptASR for contextualized ASR with controllable style ICASSP2024 + + +
+ Prompts are crucial to large language models as they provide context +information such as topic or logical relationships. Inspired by this, we +propose PromptASR, a framework that integrates prompts in end-to-end automatic +speech recognition (E2E ASR) systems to achieve contextualized ASR with +controllable style of transcriptions. Specifically, a dedicated text encoder +encodes the text prompts and the encodings are injected into the speech encoder +by cross-attending the features from two modalities. When using the ground +truth text from preceding utterances as content prompt, the proposed system +achieves 21.9% and 6.8% relative word error rate reductions on a book reading +dataset and an in-house dataset compared to a baseline ASR system. The system +can also take word-level biasing lists as prompt to improve recognition +accuracy on rare words. An additional style prompt can be given to the text +encoder and guide the ASR system to output different styles of transcriptions. +The code is available at icefall. + +
+
+ comment: Submitted to ICASSP2024 +
+
+
+
+
+ + ☆ CPPF: A contextual and post-processing-free model for automatic speech + recognition ICASSP2024 + + +
+ ASR systems have become increasingly widespread in recent years. However, +their textual outputs often require post-processing tasks before they can be +practically utilized. To address this issue, we draw inspiration from the +multifaceted capabilities of LLMs and Whisper, and focus on integrating +multiple ASR text processing tasks related to speech recognition into the ASR +model. This integration not only shortens the multi-stage pipeline, but also +prevents the propagation of cascading errors, resulting in direct generation of +post-processed text. In this study, we focus on ASR-related processing tasks, +including Contextual ASR and multiple ASR post processing tasks. To achieve +this objective, we introduce the CPPF model, which offers a versatile and +highly effective alternative to ASR processing. CPPF seamlessly integrates +these tasks without any significant loss in recognition performance. + +
+
+ comment: Submitted to ICASSP2024 +
+
+
+
+
+ + ☆ Advancing Regular Language Reasoning in Linear Recurrent Neural Networks + + +
+ In recent studies, linear recurrent neural networks (LRNNs) have achieved +Transformer-level performance in natural language modeling and long-range +modeling while offering rapid parallel training and constant inference costs. +With the resurged interest in LRNNs, we study whether they can learn the hidden +rules in training sequences, such as the grammatical structures of regular +language. We theoretically analyze some existing LRNNs and discover their +limitations on regular language. Motivated by the analysis, we propose a new +LRNN equipped with a block-diagonal and input-dependent transition matrix. +Experiments suggest that the proposed model is the only LRNN that can perform +length extrapolation on regular language tasks such as Sum, Even Pair, and +Modular Arithmetic. + +
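+ [Toy sketch under assumed shapes, not the authors' model] The proposed
+ transition can be pictured as a hidden state split into blocks, each updated by
+ its own input-dependent transition matrix; the tanh parameterization and
+ dimensions below are assumptions made only for illustration.
+
+ # Sketch: linear recurrence with a block-diagonal, input-dependent transition.
+ import torch
+ import torch.nn as nn
+
+ class BlockDiagLRNN(nn.Module):
+     def __init__(self, d_in, n_blocks=4, block_size=8):
+         super().__init__()
+         self.n, self.b = n_blocks, block_size
+         self.to_trans = nn.Linear(d_in, n_blocks * block_size * block_size)
+         self.to_inp = nn.Linear(d_in, n_blocks * block_size)
+
+     def forward(self, x):                       # x: (T, d_in)
+         h = x.new_zeros(self.n, self.b)
+         outs = []
+         for t in range(x.size(0)):
+             A = torch.tanh(self.to_trans(x[t])).view(self.n, self.b, self.b)
+             u = self.to_inp(x[t]).view(self.n, self.b)
+             h = torch.einsum("nij,nj->ni", A, h) + u   # per-block transition
+             outs.append(h.flatten())
+         return torch.stack(outs)                # (T, n_blocks * block_size)
+
+ print(BlockDiagLRNN(d_in=16)(torch.randn(5, 16)).shape)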
+
+ comment: The first two authors contributed equally to this work +
+
+
+
+
+ + ☆ DebCSE: Rethinking Unsupervised Contrastive Sentence Embedding Learning + in the Debiasing Perspective + + +
+ Several prior studies have suggested that word frequency biases can cause the +BERT model to learn indistinguishable sentence embeddings. Contrastive learning +schemes such as SimCSE and ConSERT have already been adopted successfully in +unsupervised sentence embedding to improve the quality of embeddings by +reducing this bias. However, these methods still introduce new biases such as +sentence length bias and false negative sample bias, which hinder the model's +ability to learn more fine-grained semantics. In this paper, we reexamine the +challenges of contrastive sentence embedding learning from a debiasing +perspective and argue that effectively eliminating the influence of various +biases is crucial for learning high-quality sentence embeddings. We argue that +these biases are introduced by the simple rules used to construct training data in +contrastive learning, and that the key to contrastive sentence embedding learning is +to mimic, in an unsupervised way, the training data distribution of supervised +machine learning. We propose a novel contrastive framework for sentence +embedding, termed DebCSE, which eliminates the impact of these biases through an +inverse propensity weighted sampling method that selects high-quality positive and +negative pairs according to both the surface and semantic similarity between +sentences. Extensive experiments on semantic textual similarity (STS) +benchmarks reveal that DebCSE significantly outperforms the latest +state-of-the-art models with an average Spearman's correlation coefficient of +80.33% on BERTbase. +
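+ [Deliberately simplified sketch, not the DebCSE implementation] The sampling
+ idea: estimate how likely a candidate pair is to be produced by the naive
+ construction rules (its "propensity") and draw pairs with inverse-propensity
+ weights. The lexical-overlap/length-ratio propensity below is an assumption
+ made only to illustrate the weighting.
+
+ # Sketch: inverse-propensity weighted sampling of sentence pairs.
+ import random
+
+ def propensity(s1: str, s2: str) -> float:
+     """Higher for pairs that naive rules would over-sample (very similar
+     surface form); a real estimator would also use semantic similarity."""
+     w1, w2 = set(s1.lower().split()), set(s2.lower().split())
+     overlap = len(w1 & w2) / max(1, len(w1 | w2))
+     len_ratio = min(len(s1), len(s2)) / max(1, max(len(s1), len(s2)))
+     return 0.5 * overlap + 0.5 * len_ratio + 1e-6
+
+ def sample_pairs(candidates, k):
+     """candidates: list of (sent_a, sent_b) pairs."""
+     weights = [1.0 / propensity(a, b) for a, b in candidates]
+     return random.choices(candidates, weights=weights, k=k)
+
+ pairs = [("a cat sat", "a cat sat down"),
+          ("a cat sat", "stock markets fell sharply")]
+ print(sample_pairs(pairs, k=1))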
+
+
+
+
+ + ☆ VDialogUE: A Unified Evaluation Benchmark for Visually-grounded Dialogue + + +
+ Visually-grounded dialog systems, which integrate multiple modes of +communication such as text and visual inputs, have become an increasingly +popular area of investigation. However, the absence of a standardized +evaluation framework poses a challenge in assessing the development of this +field. To this end, we propose \textbf{VDialogUE}, a \textbf{V}isually-grounded +\textbf{Dialog}ue benchmark for \textbf{U}nified \textbf{E}valuation. It +defines five core multi-modal dialogue tasks and covers six datasets. +Furthermore, in order to provide a comprehensive assessment of the model's +performance across all tasks, we developed a novel evaluation metric called +VDscore, which is based on the Analytic Hierarchy Process~(AHP) method. +Additionally, we present a straightforward yet efficient baseline model, named +\textbf{VISIT}~(\textbf{VIS}ually-grounded d\textbf{I}alog +\textbf{T}ransformer), to promote the advancement of general multi-modal +dialogue systems. It progressively builds its multi-modal foundation and +dialogue capability via a two-stage pre-training strategy. + We believe that the VDialogUE benchmark, along with the evaluation scripts +and our baseline models, will accelerate the development of visually-grounded +dialog systems and lead to the development of more sophisticated and effective +pre-trained models. + +
+
+
+
+
+ + ☆ An Interactive Framework for Profiling News Media Sources + + +
+ The recent rise of social media has led to the spread of large amounts of +fake and biased news, content published with the intent to sway beliefs. While +detecting and profiling the sources that spread this news is important to +maintain a healthy society, it is challenging for automated systems. + In this paper, we propose an interactive framework for news media profiling. +It combines the strengths of graph based news media profiling models, +Pre-trained Large Language Models, and human insight to characterize the social +context on social media. Experimental results show that with as little as 5 +human interactions, our framework can rapidly detect fake and biased news +media, even in the most challenging settings of emerging news events, where +test data is unseen. + +
+
+
+
+
+ + ☆ Less is More for Long Document Summary Evaluation by LLMs + + +
+ Large Language Models (LLMs) have shown promising performance in summary +evaluation tasks, yet they face challenges such as high computational costs and +the Lost-in-the-Middle problem where important information in the middle of +long documents is often overlooked. To address these issues, this paper +introduces a novel approach, Extract-then-Evaluate, which involves extracting +key sentences from a long source document and then evaluating the summary by +prompting LLMs. The results reveal that the proposed method not only +significantly reduces evaluation costs but also exhibits a higher correlation +with human evaluations. Furthermore, we provide practical recommendations for +optimal document length and sentence extraction methods, contributing to the +development of cost-effective yet more accurate methods for LLM-based text +generation evaluation. + +
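+ A minimal sketch of the Extract-then-Evaluate idea is shown below, assuming a simple
+LEAD-style sentence extraction; the prompt wording, the extraction heuristic, and the function
+name are illustrative assumptions, and the resulting prompt would then be sent to an LLM of
+choice.
+
+def extract_then_evaluate_prompt(document, summary, k=5):
+    """Build an evaluation prompt from only k extracted key sentences instead
+    of the full document (LEAD-k extraction here; other strategies exist)."""
+    sentences = [s.strip() for s in document.split(".") if s.strip()]
+    key = sentences[:k]                       # keep the first k sentences
+    source = ". ".join(key) + "."
+    return ("Source (key sentences):\n" + source + "\n\n"
+            "Summary:\n" + summary + "\n\n"
+            "Rate the summary's consistency with the source from 1 to 5.")
+
+doc = ("Sentence one. Sentence two. Sentence three. "
+       "Sentence four. Sentence five. Sentence six.")
+print(extract_then_evaluate_prompt(doc, "A short summary.", k=3))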
+
+ comment: Work in progress +
+
+
+
+
+ + ☆ Hybrid Attention-based Encoder-decoder Model for Efficient Language + Model Adaptation + + +
+ The attention-based encoder-decoder (AED) speech recognition model has been +widely successful in recent years. However, the joint optimization of the acoustic +model and language model in an end-to-end manner has created challenges for text +adaptation. In particular, adapting to new text effectively, quickly, and inexpensively +has become a primary concern for deploying AED systems in industry. To address +this issue, we propose a novel model, the hybrid attention-based +encoder-decoder (HAED) speech recognition model, which preserves the modularity +of conventional hybrid automatic speech recognition systems. Our HAED model +separates the acoustic and language models, allowing for the use of +conventional text-based language model adaptation techniques. We demonstrate +that the proposed HAED model yields a 21\% relative Word Error Rate (WER) +improvement when out-of-domain text data is used for language model adaptation, +with only a minor degradation in WER on a general test set compared with a +conventional AED model. + +
+
+
+
+
+ + ♻ ☆ Qwen-VL: A Versatile Vision-Language Model for Understanding, + Localization, Text Reading, and Beyond + + +
+ We introduce the Qwen-VL series, a set of large-scale vision-language models +(LVLMs) designed to perceive and understand both text and images. Comprising +Qwen-VL and Qwen-VL-Chat, these models exhibit remarkable performance in tasks +like image captioning, question answering, visual localization, and flexible +interaction. The evaluation covers a wide range of tasks including zero-shot +captioning, visual or document visual question answering, and grounding. We +demonstrate that Qwen-VL outperforms existing LVLMs. We present their +architecture, training, capabilities, and performance, highlighting their +contributions to advancing multimodal artificial intelligence. Code, demo and +models are available at https://github.com/QwenLM/Qwen-VL. + +
+
+ comment: Code, demo and models are available at + https://github.com/QwenLM/Qwen-VL +
+
+
+
+
+ + ♻ ☆ Mitigating Hallucination in Large Multi-Modal Models via Robust + Instruction Tuning + + +
+ Despite the promising progress in multi-modal tasks, current large +multi-modal models (LMMs) are prone to hallucinating descriptions that are inconsistent +with the associated image and human instructions. This paper +addresses this issue by introducing the first large and diverse visual +instruction tuning dataset, named Large-scale Robust Visual (LRV)-Instruction. +Our dataset consists of 120k visual instructions generated by GPT4, covering 16 +vision-and-language tasks with open-ended instructions and answers. Unlike +existing studies that primarily focus on positive instruction samples, we +design LRV-Instruction to include both positive and negative instructions for +more robust visual instruction tuning. Our negative instructions are designed +at two semantic levels: (i) Nonexistent Element Manipulation and (ii) Existent +Element Manipulation. To efficiently measure the hallucination generated by +LMMs, we propose GPT4-Assisted Visual Instruction Evaluation (GAVIE), a novel +approach for evaluating visual instruction tuning that does not require +human-annotated groundtruth answers and can adapt to diverse instruction +formats. We conduct comprehensive experiments to investigate the hallucination +of LMMs. Our results demonstrate that existing LMMs exhibit significant +hallucination when presented with our negative instructions, particularly with +Existent Element Manipulation instructions. Moreover, by finetuning MiniGPT4 on +LRV-Instruction, we successfully mitigate hallucination while improving +performance on public datasets using less training data compared to +state-of-the-art methods. Additionally, we observe that a balanced ratio of +positive and negative instances in the training data leads to a more robust +model. Updates of our project are available at +https://fuxiaoliu.github.io/LRV/. + +
+
+ comment: 35 pages, 27 figures. Under Review +
+
+
+
+
+ + ♻ ☆ Knowledge Graph Embeddings for Multi-Lingual Structured Representations + of Radiology Reports + + +
+ The way we analyse clinical texts has undergone major changes over the last +few years. The introduction of language models such as BERT led to adaptations for +the (bio)medical domain like PubMedBERT and ClinicalBERT. These models rely on +large databases of archived medical documents. While performing well in terms +of accuracy, both the lack of interpretability and limitations to transfer +across languages limit their use in clinical settings. We introduce a novel +light-weight graph-based embedding method specifically catering to radiology +reports. It takes into account the structure and composition of the report, +while also connecting medical terms in the report through the multi-lingual +SNOMED Clinical Terms knowledge base. The resulting graph embedding uncovers +the underlying relationships among clinical terms, achieving a representation +that is more understandable for clinicians and clinically more accurate, +without reliance on large pre-training datasets. We show the use of this +embedding on two tasks, namely disease classification of X-ray reports and image +classification. For disease classification, our model is competitive with its +BERT-based counterparts, while being orders of magnitude smaller in size and training +data requirements. For image classification, we show the effectiveness of the +graph embedding in leveraging cross-modal knowledge transfer and show how this +method can be used across different languages. + +
+
+
+
+
+ + ♻ ☆ Probing in Context: Toward Building Robust Classifiers via Probing Large + Language Models + + +
+ Large language models are able to learn new tasks in context, where they are +provided with instructions and a few annotated examples. However, the +effectiveness of in-context learning is dependent on the provided context, and +the performance on a downstream task can vary considerably, depending on the +instruction. Importantly, such dependency on the context can surface in +unpredictable ways, e.g., a seemingly more informative instruction might lead +to worse performance. In this paper, we propose an alternative approach, +which we term in-context probing. Similar to in-context learning, we +contextualize the representation of the input with an instruction, but instead +of decoding the output prediction, we probe the contextualized representation +to predict the label. Through a series of experiments on a diverse set of +classification tasks, we show that in-context probing is significantly more +robust to changes in instructions. We further show that probing performs +competitively with, or superior to, finetuning and can be particularly helpful for +building classifiers on top of smaller models with only a hundred training +examples. + +
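+ The sketch below illustrates the general probing idea, assuming one already has frozen
+contextualized representations of "instruction + input" (random vectors stand in for them
+here); it fits a lightweight classifier on top instead of decoding, which captures the spirit,
+though not the exact implementation, of the proposed in-context probing.
+
+import numpy as np
+from sklearn.linear_model import LogisticRegression
+
+# Stand-ins for contextualized representations, e.g. the hidden state of the
+# final token of "instruction + input" from a frozen LLM.
+rng = np.random.default_rng(0)
+train_reps = rng.normal(size=(100, 64))            # 100 examples, 64-dim
+train_labels = (train_reps[:, 0] > 0).astype(int)  # toy labels
+test_reps = rng.normal(size=(20, 64))
+
+# Keep the LLM frozen and fit a lightweight probe on its representations
+# instead of decoding the label as text.
+probe = LogisticRegression(max_iter=1000).fit(train_reps, train_labels)
+print(probe.predict(test_reps)[:5])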
+
+
+
+
+ + ♻ ☆ Stochastic LLMs do not Understand Language: Towards Symbolic, + Explainable and Ontologically Based LLMs + + +
+ In our opinion the exuberance surrounding the relative success of data-driven +large language models (LLMs) is slightly misguided, for several reasons: (i) +LLMs cannot be relied upon for factual information since for LLMs all ingested +text (factual or non-factual) was created equal; (ii) due to their subsymbolic +nature, whatever 'knowledge' these models acquire about language will always +be buried in billions of microfeatures (weights), none of which is meaningful +on its own; and (iii) LLMs will often fail to make the correct inferences in +several linguistic contexts (e.g., nominal compounds, copredication, quantifier +scope ambiguities, intensional contexts). Since we believe the relative success +of data-driven large language models (LLMs) is not a reflection on the symbolic +vs. subsymbolic debate but a reflection on applying the successful strategy of +a bottom-up reverse engineering of language at scale, we suggest in this paper +applying the effective bottom-up strategy in a symbolic setting, resulting in +symbolic, explainable, and ontologically grounded language models. + +
+
+ comment: 17 pages +
+
+
+
+
+ + ♻ ☆ Generative User-Experience Research for Developing Domain-specific + Natural Language Processing Applications + + +
+ User experience (UX) is a part of human-computer interaction (HCI) research +and focuses on increasing intuitiveness, transparency, simplicity, and trust +for system users. Most of the UX research for machine learning (ML) or natural +language processing (NLP) focuses on a data-driven methodology, i.e., it fails +to focus on users' requirements, and engages domain users mainly for usability +evaluation. Moreover, typical UX methods tailor systems towards usability +rather than first learning about user needs. The paper proposes a +methodology for integrating generative UX research into developing domain NLP +applications. Generative UX research employs domain users at the initial stages +of prototype development, i.e., ideation and concept evaluation, and at the last +stage for evaluating the change in user value. In the case study, we report the +full-cycle prototype development of a domain-specific semantic search for daily +operations in the process industry. Our case study shows that involving domain +experts increases their interest and trust in the final NLP application. +Moreover, we show that synergetic UX+NLP research efficiently considers data- +and user-driven opportunities and constraints, which can be crucial for NLP +applications in narrow domains. + +
+
+
+
+
+ + ♻ ☆ Modern Baselines for SPARQL Semantic Parsing SIGIR 2022 + + +
+ In this work, we focus on the task of generating SPARQL queries from natural +language questions, which can then be executed on Knowledge Graphs (KGs). We +assume that gold entities and relations have been provided, and the remaining +task is to arrange them in the right order, along with SPARQL vocabulary and +input tokens, to produce the correct SPARQL query. Pre-trained Language Models +(PLMs) have not been explored in depth on this task so far, so we experiment +with BART, T5 and PGNs (Pointer Generator Networks) with BERT embeddings, +looking for new baselines in the PLM era for this task, on DBpedia and Wikidata +KGs. We show that T5 requires special input tokenisation, but produces state-of-the-art +performance on the LC-QuAD 1.0 and LC-QuAD 2.0 datasets, and outperforms +task-specific models from previous works. Moreover, the methods enable semantic +parsing for questions where a part of the input needs to be copied to the +output query, thus enabling a new paradigm in KG semantic parsing. + +
+
+ comment: 5 pages, short paper, SIGIR 2022 +
+
+
+
+
+ + ♻ ☆ TIM: Teaching Large Language Models to Translate with Comparison + + +
+ Open-sourced large language models (LLMs) have demonstrated remarkable +efficacy in various tasks with instruction tuning. However, these models can +sometimes struggle with tasks that require more specialized knowledge, such as +translation. One possible reason for this deficiency is that instruction tuning +aims to generate fluent and coherent text that continues from a given +instruction without being constrained by any task-specific requirements. +Moreover, tuning smaller LLMs with lower-quality training data can be even more +challenging. To address this issue, we propose a novel framework that uses +examples in comparison to teach LLMs to translate. Our approach +involves presenting the model with examples of correct and incorrect +translations and using a preference loss to guide the model's learning. We +evaluate our method on WMT2022 test sets and show that it outperforms existing +methods. Our findings offer a new perspective on fine-tuning LLMs for +translation tasks and provide a promising solution for generating high-quality +translations. Please refer to GitHub for more details: +https://github.com/lemon0830/TIM. + +
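+ The exact preference loss used by TIM is defined in the paper; the snippet below is a generic
+hinge-style stand-in that captures the idea of scoring a correct translation above an incorrect
+one by a margin, combined with the usual likelihood objective. The numbers are placeholders.
+
+import torch
+
+def preference_loss(logp_good, logp_bad, margin=1.0):
+    """Hinge-style preference loss: the log-likelihood of the correct
+    translation should exceed that of the incorrect one by a margin."""
+    return torch.clamp(margin - (logp_good - logp_bad), min=0.0).mean()
+
+# Toy sequence log-likelihoods produced by the model for paired examples.
+logp_good = torch.tensor([-2.1, -1.7, -3.0])
+logp_bad = torch.tensor([-2.5, -1.6, -3.8])
+total = preference_loss(logp_good, logp_bad) + (-logp_good).mean()  # + usual NLL
+print(total.item())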
+
+
+
+
+ + ♻ ☆ YORC: Yoruba Reading Comprehension dataset + + +
+ In this paper, we create YORC: a new multi-choice Yoruba Reading +Comprehension dataset that is based on Yoruba high-school reading comprehension +examination. We provide baseline results by performing cross-lingual transfer +using existing English RACE dataset based on a pre-trained encoder-only model. +Additionally, we provide results by prompting large language models (LLMs) like +GPT-4. + +
+
+
+
+
+ + ♻ ☆ LambdaKG: A Library for Pre-trained Language Model-Based Knowledge Graph + Embeddings AACL 2023 + + +
+ Knowledge Graphs (KGs) often have two characteristics: heterogeneous graph +structure and text-rich entity/relation information. Text-based KG embeddings +can represent entities by encoding descriptions with pre-trained language +models, but no open-sourced library is specifically designed for KGs with PLMs +at present. In this paper, we present LambdaKG, a library for KGE that is equipped +with many pre-trained language models (e.g., BERT, BART, T5, GPT-3), and +supports various tasks (e.g., knowledge graph completion, question answering, +recommendation, and knowledge probing). LambdaKG is publicly open-sourced at +https://github.com/zjunlp/PromptKG/tree/main/lambdaKG, with a demo video at +http://deepke.zjukg.cn/lambdakg.mp4 and long-term maintenance. + +
+
+ comment: AACL 2023 System Demonstrations, the project website is + https://zjunlp.github.io/project/promptkg/ +
+
+
+
+
+ + ♻ ☆ Identical and Fraternal Twins: Fine-Grained Semantic Contrastive + Learning of Sentence Representations ECAI2023 + + +
+ Unsupervised learning of sentence representations has been significantly enhanced +by contrastive learning. This approach +clusters the augmented positive instance with the anchor instance to create a +desired embedding space. However, relying solely on the contrastive objective +can result in sub-optimal outcomes due to its inability to differentiate subtle +semantic variations between positive pairs. Specifically, common data +augmentation techniques frequently introduce semantic distortion, leading to a +semantic margin between the positive pair. Meanwhile, the InfoNCE loss function +overlooks the semantic margin and prioritizes similarity maximization between +positive pairs during training, leading to insensitive semantic +comprehension in the trained model. In this paper, we introduce a novel +Identical and Fraternal Twins of Contrastive Learning (named IFTCL) framework, +capable of simultaneously adapting to various positive pairs generated by +different augmentation techniques. We propose a \textit{Twins Loss} to preserve +the innate margin during training and promote the potential of data enhancement +in order to overcome the sub-optimal issue. We also present proof-of-concept +experiments combined with the contrastive objective to prove the validity of +the proposed Twins Loss. Furthermore, we propose a hippocampus queue mechanism +to restore and reuse the negative instances without additional calculation, +which further enhances the efficiency and performance of the IFTCL framework. We verify +the IFTCL framework on nine semantic textual similarity tasks with both English +and Chinese datasets, and the experimental results show that IFTCL outperforms +state-of-the-art methods. + +
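+ The Twins Loss itself is specified in the paper; as a loose sketch of the underlying idea,
+the code below relaxes the standard in-batch InfoNCE objective by granting each positive pair a
+per-pair similarity allowance, so semantically distorted ("fraternal") positives are not pulled
+all the way onto their anchors. All names and margin values are illustrative assumptions.
+
+import torch
+import torch.nn.functional as F
+
+def infonce_with_margin(anchor, positive, margin, temperature=0.05):
+    """InfoNCE over in-batch negatives where each positive pair receives a
+    similarity allowance `margin`, reducing the pull on distorted positives.
+    anchor, positive: (batch, dim); margin: (batch,) in [0, 1)."""
+    a = F.normalize(anchor, dim=-1)
+    p = F.normalize(positive, dim=-1)
+    sim = a @ p.t() / temperature                 # (batch, batch) similarities
+    sim.diagonal().add_(margin / temperature)     # relax the positive target
+    labels = torch.arange(a.size(0))
+    return F.cross_entropy(sim, labels)
+
+a, p = torch.randn(8, 128), torch.randn(8, 128)
+print(infonce_with_margin(a, p, margin=torch.full((8,), 0.1)).item())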
+
+ comment: This article has been accepted for publication in European Conference + on Artificial Intelligence (ECAI2023). 9 pages, 4 figures +
+
+
+
+
+ + ♻ ☆ USA: Universal Sentiment Analysis Model & Construction of Japanese + Sentiment Text Classification and Part of Speech Dataset + + +
+ Sentiment analysis is a pivotal task in the domain of natural language +processing. It encompasses both text-level sentiment polarity classification +and word-level Part of Speech (POS) sentiment polarity determination. Such +analysis challenges models to understand text holistically while also +extracting nuanced information. With the rise of Large Language Models (LLMs), +new avenues for sentiment analysis have opened. This paper proposes enhancing +performance by leveraging the Mutual Reinforcement Effect (MRE) between +individual words and the overall text. It delves into how word polarity +influences the overarching sentiment of a passage. To support our research, we +annotated four novel Sentiment Text Classification and Part of Speech (SCPOS) +datasets, building upon existing sentiment classification datasets. +Furthermore, we developed a Universal Sentiment Analysis (USA) model, with a +7-billion parameter size. Experimental results revealed that our model +surpassed the performance of gpt-3.5-turbo across all four datasets, +underscoring the significance of MRE in sentiment analysis. + +
+
+ comment: Model already Open Sourced, Dataset will release soon +
+
+
+
+
+ + ♻ ☆ Reasoning with Language Model Prompting: A Survey ACL 2023 + + +
+ Reasoning, as an essential ability for complex problem-solving, can provide +back-end support for various real-world applications, such as medical +diagnosis, negotiation, etc. This paper provides a comprehensive survey of +cutting-edge research on reasoning with language model prompting. We introduce +research works with comparisons and summaries and provide systematic resources +to help beginners. We also discuss the potential reasons why such +reasoning abilities emerge and highlight future research directions. Resources are +available at https://github.com/zjunlp/Prompt4ReasoningPapers (updated +periodically). + +
+
+ comment: ACL 2023, 24 pages, add references of theoretical analysis +
+
+
+
+
+ + ♻ ☆ MER 2023: Multi-label Learning, Modality Robustness, and Semi-Supervised + Learning + + +
+ The first Multimodal Emotion Recognition Challenge (MER 2023) was +successfully held at ACM Multimedia. The challenge focuses on system robustness +and consists of three distinct tracks: (1) MER-MULTI, where participants are +required to recognize both discrete and dimensional emotions; (2) MER-NOISE, in +which noise is added to test videos for modality robustness evaluation; (3) +MER-SEMI, which provides a large amount of unlabeled samples for +semi-supervised learning. In this paper, we introduce the motivation behind +this challenge, describe the benchmark dataset, and provide some statistics +about participants. To continue using this dataset after MER 2023, please sign +a new End User License Agreement and send it to our official email address +merchallenge.contact@gmail.com. We believe this high-quality dataset can become +a new benchmark in multimodal emotion recognition, especially for the Chinese +research community. + +
+
+
+
+
+ + ♻ ☆ COVER: A Heuristic Greedy Adversarial Attack on Prompt-based Learning in + Language Models + + +
+ Prompt-based learning has been proven to be effective for pre-trained +language models (PLMs), especially in low-resource scenarios like few-shot +settings. However, the trustworthiness of PLMs is of paramount significance, and +potential vulnerabilities have been shown in prompt-based templates that could +mislead the predictions of language models, causing serious security concerns. +In this paper, we shed light on some vulnerabilities of PLMs, by proposing +a prompt-based adversarial attack on manual templates in black-box scenarios. +First of all, we design character-level and word-level heuristic approaches to +break manual templates separately. Then we present a greedy algorithm for the +attack based on the above heuristic destructive approaches. Finally, we +evaluate our approach with the classification tasks on three variants of BERT +series models and eight datasets. Comprehensive experimental results +justify the effectiveness of our approach in terms of attack success rate and +attack speed. + +
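+ A toy sketch of a greedy destructive attack on a manual template is given below; the
+character-level edit, the scoring stand-in, and the budget are illustrative assumptions, since
+the actual heuristics and victim models are described in the paper.
+
+import random
+
+def char_perturb(word):
+    """One character-level destructive edit: delete a random character."""
+    if len(word) <= 1:
+        return word
+    i = random.randrange(len(word))
+    return word[:i] + word[i + 1:]
+
+def greedy_template_attack(template, score_fn, budget=3):
+    """Greedily apply the single edit that most reduces the victim model's
+    score on the template, for up to `budget` edits; `score_fn` stands in
+    for querying the black-box PLM."""
+    words = template.split()
+    for _ in range(budget):
+        best = None
+        for i in range(len(words)):
+            cand = words[:i] + [char_perturb(words[i])] + words[i + 1:]
+            s = score_fn(" ".join(cand))
+            if best is None or s < best[0]:
+                best = (s, cand)
+        words = best[1]
+    return " ".join(words)
+
+random.seed(0)
+toy_score = lambda prompt: len(set(prompt.split()) & {"It", "was", "[MASK]"})
+print(greedy_template_attack("It was [MASK] .", toy_score))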
+
+
+
+
+ + ♻ ☆ Overview of Robust and Multilingual Automatic Evaluation Metrics for + Open-Domain Dialogue Systems at DSTC 11 Track 4 + + +
+ The advent and fast development of neural networks have revolutionized the +research on dialogue systems and subsequently have triggered various challenges +regarding their automatic evaluation. Automatic evaluation of open-domain +dialogue systems as an open challenge has been the center of the attention of +many researchers. Despite the consistent efforts to improve automatic metrics' +correlations with human evaluation, there have been very few attempts to assess +their robustness over multiple domains and dimensions. Also, their focus is +mainly on the English language. All of these challenges prompt the development +of automatic evaluation metrics that are reliable in various domains, +dimensions, and languages. This track in the 11th Dialogue System Technology +Challenge (DSTC11) is part of the ongoing effort to promote robust and +multilingual automatic evaluation metrics. This article describes the datasets +and baselines provided to participants and discusses the submission and result +details of the two proposed subtasks. + +
+
+
+
+
+
+
+
+ + Computer Vision and Pattern Recognition 90 + +
+
+
+ + ☆ Large-Vocabulary 3D Diffusion Model with Transformer + + +
+ Creating diverse and high-quality 3D assets with an automatic generative +model is highly desirable. Despite extensive efforts on 3D generation, most +existing works focus on the generation of a single category or a few +categories. In this paper, we introduce a diffusion-based feed-forward +framework for synthesizing massive categories of real-world 3D objects with a +single generative model. Notably, there are three major challenges for this +large-vocabulary 3D generation: a) the need for expressive yet efficient 3D +representation; b) large diversity in geometry and texture across categories; +c) complexity in the appearances of real-world objects. To this end, we propose +a novel triplane-based 3D-aware Diffusion model with TransFormer, DiffTF, for +handling challenges via three aspects. 1) Considering efficiency and +robustness, we adopt a revised triplane representation and improve the fitting +speed and accuracy. 2) To handle the drastic variations in geometry and +texture, we regard the features of all 3D objects as a combination of +generalized 3D knowledge and specialized 3D features. To extract generalized 3D +knowledge from diverse categories, we propose a novel 3D-aware transformer with +shared cross-plane attention. It learns the cross-plane relations across +different planes and aggregates the generalized 3D knowledge with specialized +3D features. 3) In addition, we devise the 3D-aware encoder/decoder to enhance +the generalized 3D knowledge in the encoded triplanes for handling categories +with complex appearances. Extensive experiments on ShapeNet and OmniObject3D +(over 200 diverse real-world categories) convincingly demonstrate that a single +DiffTF model achieves state-of-the-art large-vocabulary 3D object generation +performance with large diversity, rich semantics, and high quality. + +
+
+ comment: Project page at https://ziangcao0312.github.io/difftf_pages/ +
+
+
+
+
+ + ☆ OpenIllumination: A Multi-Illumination Dataset for Inverse Rendering + Evaluation on Real Objects + + +
+ We introduce OpenIllumination, a real-world dataset containing over 108K +images of 64 objects with diverse materials, captured under 72 camera views and +a large number of different illuminations. For each image in the dataset, we +provide accurate camera parameters, illumination ground truth, and foreground +segmentation masks. Our dataset enables the quantitative evaluation of most +inverse rendering and material decomposition methods for real objects. We +examine several state-of-the-art inverse rendering methods on our dataset and +compare their performances. The dataset and code can be found on the project +page: https://oppo-us-research.github.io/OpenIllumination. + +
+
+
+
+
+ + ☆ Unified Human-Scene Interaction via Prompted Chain-of-Contacts + + +
+ Human-Scene Interaction (HSI) is a vital component of fields like embodied AI +and virtual reality. Despite advancements in motion quality and physical +plausibility, two pivotal factors, versatile interaction control and the +development of a user-friendly interface, require further exploration before +the practical application of HSI. This paper presents a unified HSI framework, +UniHSI, which supports unified control of diverse interactions through language +commands. This framework is built upon the definition of interaction as Chain +of Contacts (CoC): steps of human joint-object part pairs, which is inspired by +the strong correlation between interaction types and human-object contact +regions. Based on the definition, UniHSI comprises a Large Language Model +(LLM) Planner to translate language prompts into task plans in the form of CoC, +and a Unified Controller that turns CoC into uniform task execution. To +facilitate training and evaluation, we collect a new dataset named ScenePlan +that encompasses thousands of task plans generated by LLMs based on diverse +scenarios. Comprehensive experiments demonstrate the effectiveness of our +framework in versatile task execution and generalizability to real scanned +scenes. The project page is at https://github.com/OpenRobotLab/UniHSI . + +
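+ A Chain of Contacts can be thought of as an ordered list of joint-object-part pairs; the toy
+data structure below is only meant to make the definition concrete, and the joint names, part
+names, and relations are invented for illustration.
+
+from dataclasses import dataclass
+
+@dataclass
+class ContactStep:
+    joint: str        # human joint, e.g. "pelvis"
+    object_part: str  # object part, e.g. "chair_seat"
+    relation: str     # e.g. "touch" or "release"
+
+# A hypothetical Chain of Contacts a planner might emit for "sit on the chair":
+plan = [
+    ContactStep("pelvis", "chair_seat", "touch"),
+    ContactStep("torso", "chair_back", "touch"),
+]
+for step in plan:
+    print(step)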
+
comment: A unified Human-Scene Interaction framework that supports versatile + interactions through language commands. Project URL: + https://github.com/OpenRobotLab/UniHSI. Please ignore the header of the paper
+
+
+
+
+ + ☆ Looking at words and points with attention: a benchmark for + text-to-shape coherence ICCV 2023 + + +
+ While text-conditional 3D object generation and manipulation have seen rapid +progress, the evaluation of coherence between generated 3D shapes and input +textual descriptions lacks a clear benchmark. The reason is twofold: a) the low +quality of the textual descriptions in the only publicly available dataset of +text-shape pairs; b) the limited effectiveness of the metrics used to +quantitatively assess such coherence. In this paper, we propose a comprehensive +solution that addresses both weaknesses. Firstly, we employ large language +models to automatically refine textual descriptions associated with shapes. +Secondly, we propose a quantitative metric to assess text-to-shape coherence, +through cross-attention mechanisms. To validate our approach, we conduct a user +study and compare quantitatively our metric with existing ones. The refined +dataset, the new metric and a set of text-shape pairs validated by the user +study comprise a novel, fine-grained benchmark that we publicly release to +foster research on text-to-shape coherence of text-conditioned 3D generative +models. Benchmark available at +https://cvlab-unibo.github.io/CrossCoherence-Web/. + +
+
+ comment: ICCV 2023 Workshop "AI for 3D Content Creation", Project page: + https://cvlab-unibo.github.io/CrossCoherence-Web/, 26 pages +
+
+
+
+
+ + ☆ MMICL: Empowering Vision-language Model with Multi-Modal In-Context + Learning + + +
+ Starting from the resurgence of deep learning, vision-language models (VLMs) +benefiting from large language models (LLMs) have never been so popular. +However, while LLMs can utilize extensive background knowledge and task +information with in-context learning, most VLMs still struggle with +understanding complex multi-modal prompts with multiple images. The issue can +be traced back to the architectural design of VLMs or their pre-training data. +Specifically, current VLMs primarily emphasize utilizing multi-modal data +with a single image, rather than multi-modal prompts with interleaved +multiple images and text. Even though some newly proposed VLMs can handle +user prompts with multiple images, their pre-training data does not provide more +sophisticated multi-modal prompts than interleaved image and text crawled from +the web. We propose MMICL to address the issue by considering both the model +and data perspectives. We introduce a well-designed architecture capable of +seamlessly integrating visual and textual context in an interleaved manner, and +the MIC dataset to reduce the gap between the training data and the complex user +prompts in real-world applications, including: 1) multi-modal context with +interleaved images and text, 2) textual references for each image, and 3) +multi-image data with spatial, logical, or temporal relationships. Our +experiments confirm that MMICL achieves new state-of-the-art zero-shot and +few-shot performance on a wide range of general vision-language tasks, +especially for complex reasoning benchmarks including MME and MMBench. Our +analysis demonstrates that MMICL effectively deals with the challenge of +complex multi-modal prompt understanding. The experiments on ScienceQA-IMG also +show that MMICL successfully alleviates the issue of language bias in VLMs, +which we believe is the reason behind the advanced performance of MMICL. + +
+
+ comment: Code, dataset, checkpoints, and demos are available at + \href{https://github.com/HaozheZhao/MIC}{https://github.com/HaozheZhao/MIC} +
+
+
+
+
+ + ☆ ALWOD: Active Learning for Weakly-Supervised Object Detection ICCV 2023 + + +
+ Object detection (OD), a crucial vision task, remains challenged by the lack +of large training datasets with precise object localization labels. In this +work, we propose ALWOD, a new framework that addresses this problem by fusing +active learning (AL) with weakly and semi-supervised object detection +paradigms. Because the performance of AL critically depends on the model +initialization, we propose a new auxiliary image generator strategy that +utilizes an extremely small labeled set, coupled with a large weakly tagged set +of images, as a warm-start for AL. We then propose a new AL acquisition +function, another critical factor in AL success, that leverages the +student-teacher OD pair disagreement and uncertainty to effectively propose the +most informative images to annotate. Finally, to complete the AL loop, we +introduce a new labeling task delegated to human annotators, based on selection +and correction of model-proposed detections, which is both rapid and effective +in labeling the informative images. We demonstrate, across several challenging +benchmarks, that ALWOD significantly narrows the gap between the ODs trained on +few partially labeled but strategically selected image instances and those that +rely on the fully-labeled data. Our code is publicly available on +https://github.com/seqam-lab/ALWOD. + +
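+ As a rough sketch of a disagreement-plus-uncertainty acquisition score (the paper's exact
+formulation may differ), the snippet below ranks unlabeled images by how much the student and
+teacher detectors disagree and how uncertain the student is; the inputs are synthetic
+placeholders rather than real detection outputs.
+
+import numpy as np
+
+def acquisition_scores(student_probs, teacher_probs):
+    """Score each image by student-teacher disagreement plus predictive
+    uncertainty; higher-scoring images are proposed for annotation first.
+    *_probs: (num_images, num_boxes) class confidences for matched boxes."""
+    disagreement = np.abs(student_probs - teacher_probs).mean(axis=1)
+    uncertainty = (1.0 - np.maximum(student_probs, 1 - student_probs)).mean(axis=1)
+    return disagreement + uncertainty
+
+rng = np.random.default_rng(0)
+student = rng.uniform(size=(10, 5))
+teacher = rng.uniform(size=(10, 5))
+scores = acquisition_scores(student, teacher)
+print("images to annotate next:", np.argsort(-scores)[:3])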
+
+ comment: published in ICCV 2023 +
+
+
+
+
+ + ☆ Disentangling Spatial and Temporal Learning for Efficient Image-to-Video + Transfer Learning ICCV2023 + + +
+ Recently, large-scale pre-trained language-image models like CLIP have shown +extraordinary capabilities for understanding spatial contents, but naively +transferring such models to video recognition still suffers from unsatisfactory +temporal modeling capabilities. Existing methods insert tunable structures into +or in parallel with the pre-trained model, which either requires +back-propagation through the whole pre-trained model and is thus +resource-demanding, or is limited by the temporal reasoning capability of the +pre-trained structure. In this work, we present DiST, which disentangles the +learning of spatial and temporal aspects of videos. Specifically, DiST uses a +dual-encoder structure, where a pre-trained foundation model acts as the +spatial encoder, and a lightweight network is introduced as the temporal +encoder. An integration branch is inserted between the encoders to fuse +spatio-temporal information. The disentangled spatial and temporal learning in +DiST is highly efficient because it avoids the back-propagation of massive +pre-trained parameters. Meanwhile, we empirically show that disentangled +learning with an extra network for integration benefits both spatial and +temporal understanding. Extensive experiments on five benchmarks show that DiST +delivers better performance than existing state-of-the-art methods by +convincing gaps. When pre-training on the large-scale Kinetics-710, we achieve +89.7% on Kinetics-400 with a frozen ViT-L model, which verifies the scalability +of DiST. Codes and models can be found in +https://github.com/alibaba-mmai-research/DiST. + +
+
+ comment: ICCV2023. Code: https://github.com/alibaba-mmai-research/DiST +
+
+
+
+
+ + ☆ TEMPO: Efficient Multi-View Pose Estimation, Tracking, and Forecasting ICCV 2023 + + +
+ Existing volumetric methods for predicting 3D human pose estimation are +accurate, but computationally expensive and optimized for single time-step +prediction. We present TEMPO, an efficient multi-view pose estimation model +that learns a robust spatiotemporal representation, improving pose accuracy +while also tracking and forecasting human pose. We significantly reduce +computation compared to the state-of-the-art by recurrently computing +per-person 2D pose features, fusing both spatial and temporal information into +a single representation. In doing so, our model is able to use spatiotemporal +context to predict more accurate human poses without sacrificing efficiency. We +further use this representation to track human poses over time as well as +predict future poses. Finally, we demonstrate that our model is able to +generalize across datasets without scene-specific fine-tuning. TEMPO achieves +10$\%$ better MPJPE with a 33$\times$ improvement in FPS compared to TesseTrack +on the challenging CMU Panoptic Studio dataset. + +
+
+ comment: Accepted at ICCV 2023 +
+
+
+
+
+ + ☆ Physically Plausible Full-Body Hand-Object Interaction Synthesis + + +
+ We propose a physics-based method for synthesizing dexterous hand-object +interactions in a full-body setting. While recent advancements have addressed +specific facets of human-object interactions, a comprehensive physics-based +approach remains a challenge. Existing methods often focus on isolated segments +of the interaction process and rely on data-driven techniques that may result +in artifacts. In contrast, our proposed method embraces reinforcement learning +(RL) and physics simulation to mitigate the limitations of data-driven +approaches. Through a hierarchical framework, we first learn skill priors for +both body and hand movements in a decoupled setting. The generic skill priors +learn to decode a latent skill embedding into the motion of the underlying +part. A high-level policy then controls hand-object interactions in these +pretrained latent spaces, guided by task objectives of grasping and 3D target +trajectory following. It is trained using a novel reward function that combines +an adversarial style term with a task reward, encouraging natural motions while +fulfilling the task incentives. Our method successfully accomplishes the +complete interaction task, from approaching an object to grasping and +subsequent manipulation. We compare our approach against kinematics-based +baselines and show that it leads to more physically plausible motions. + +
+
+ comment: Project page at https://eth-ait.github.io/phys-fullbody-grasp +
+
+
+
+
+ + ☆ Generative Image Dynamics + + +
+ We present an approach to modeling an image-space prior on scene dynamics. +Our prior is learned from a collection of motion trajectories extracted from +real video sequences containing natural, oscillating motion such as trees, +flowers, candles, and clothes blowing in the wind. Given a single image, our +trained model uses a frequency-coordinated diffusion sampling process to +predict a per-pixel long-term motion representation in the Fourier domain, +which we call a neural stochastic motion texture. This representation can be +converted into dense motion trajectories that span an entire video. Along with +an image-based rendering module, these trajectories can be used for a number of +downstream applications, such as turning still images into seamlessly looping +dynamic videos, or allowing users to realistically interact with objects in +real pictures. + +
+
+ comment: Project website: http://generative-dynamics.github.io +
+
+
+
+
+ + ☆ HandNeRF: Learning to Reconstruct Hand-Object Interaction Scene from a + Single RGB Image + + +
+ This paper presents a method to learn hand-object interaction prior for +reconstructing a 3D hand-object scene from a single RGB image. The inference as +well as training-data generation for 3D hand-object scene reconstruction is +challenging due to the depth ambiguity of a single image and occlusions by the +hand and object. We turn this challenge into an opportunity by utilizing the +hand shape to constrain the possible relative configuration of the hand and +object geometry. We design a generalizable implicit function, HandNeRF, that +explicitly encodes the correlation of the 3D hand shape features and 2D object +features to predict the hand and object scene geometry. With experiments on +real-world datasets, we show that HandNeRF is able to reconstruct hand-object +scenes of novel grasp configurations more accurately than comparable methods. +Moreover, we demonstrate that object reconstruction from HandNeRF ensures more +accurate execution of a downstream task, such as grasping for robotic +hand-over. + +
+
+ comment: 9 pages, 4 tables, 7 figures +
+
+
+
+
+ + ☆ A Novel Local-Global Feature Fusion Framework for Body-weight Exercise + Recognition with Pressure Mapping Sensors + + +
+ We present a novel local-global feature fusion framework for body-weight +exercise recognition with floor-based dynamic pressure maps. One step further +from the existing studies using deep neural networks mainly focusing on global +feature extraction, the proposed framework aims to combine local and global +features using image processing techniques and the YOLO object detection to +localize pressure profiles from different body parts and consider physical +constraints. The proposed local feature extraction method generates two sets of +high-level local features consisting of cropped pressure mapping and numerical +features such as angular orientation, location on the mat, and pressure area. +In addition, we adopt a knowledge distillation for regularization to preserve +the knowledge of the global feature extraction and improve the performance of +the exercise recognition. Our experimental results demonstrate a notable 11 +percent improvement in F1 score for exercise recognition while preserving +label-specific features. + +
+
+
+
+
+ + ☆ mEBAL2 Database and Benchmark: Image-based Multispectral Eyeblink + Detection + + +
+ This work introduces a new multispectral database and novel approaches for +eyeblink detection in RGB and Near-Infrared (NIR) individual images. Our +contributed dataset (mEBAL2, multimodal Eye Blink and Attention Level +estimation, Version 2) is the largest existing eyeblink database, representing +a great opportunity to improve data-driven multispectral approaches for blink +detection and related applications (e.g., attention level estimation and +presentation attack detection in face biometrics). mEBAL2 includes 21,100 image +sequences from 180 different students (more than 2 million labeled images in +total) while conducting a number of e-learning tasks of varying difficulty or +taking a real course on HTML initiation through the edX MOOC platform. mEBAL2 +uses multiple sensors, including two Near-Infrared (NIR) and one RGB camera to +capture facial gestures during the execution of the tasks, as well as an +Electroencephalogram (EEG) band to get the cognitive activity of the user and +blinking events. Furthermore, this work proposes a Convolutional Neural Network +architecture as benchmark for blink detection on mEBAL2 with performances up to +97%. Different training methodologies are implemented using the RGB spectrum, +NIR spectrum, and the combination of both to enhance the performance on +existing eyeblink detectors. We demonstrate that combining NIR and RGB images +during training improves the performance of RGB eyeblink detectors (i.e., +detection based only on a RGB image). Finally, the generalization capacity of +the proposed eyeblink detectors is validated in wilder and more challenging +environments like the HUST-LEBW dataset to show the usefulness of mEBAL2 to +train a new generation of data-driven approaches for eyeblink detection. + +
+
+ comment: This paper is under consideration at Pattern Recognition Letters +
+
+
+
+
+ + ☆ Using network metrics to explore the community structure that underlies + movement patterns + + +
+ This work aims to explore the community structure of Santiago de Chile by +analyzing the movement patterns of its residents. We use a dataset containing +the approximate locations of home and work places for a subset of anonymized +residents to construct a network that represents the movement patterns within +the city. Through the analysis of this network, we aim to identify the +communities or sub-cities that exist within Santiago de Chile and gain insights +into the factors that drive the spatial organization of the city. We employ +modularity optimization algorithms and clustering techniques to identify the +communities within the network. Our results show that combining community +detection algorithms with segregation tools provides new +insights that further our understanding of the complex geography of segregation +during working hours. + +
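+ A minimal example of the modularity-optimization step on a toy home-work movement network is
+shown below using NetworkX; the zones and commuter counts are invented placeholders, not the
+Santiago data.
+
+import networkx as nx
+from networkx.algorithms.community import greedy_modularity_communities
+
+# Toy home-work movement network: nodes are zones, edge weights are the
+# number of residents commuting between them.
+edges = [("A", "B", 120), ("A", "C", 90), ("B", "C", 60),
+         ("D", "E", 150), ("D", "F", 80), ("E", "F", 70), ("C", "D", 5)]
+G = nx.Graph()
+G.add_weighted_edges_from(edges)
+
+# Modularity optimization groups zones whose residents mostly move among
+# themselves, i.e. candidate "sub-cities".
+communities = greedy_modularity_communities(G, weight="weight")
+print([sorted(c) for c in communities])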
+
+ comment: 6 pages excluding References +
+
+
+
+
+ + ☆ Gradient constrained sharpness-aware prompt learning for vision-language + models + + +
+ This paper targets a novel trade-off problem in generalizable prompt learning +for vision-language models (VLMs), i.e., improving the performance on unseen +classes while maintaining the performance on seen classes. Compared with +existing generalizable methods that neglect degradation on seen classes, the +setting of this problem is stricter and fits more closely with practical +applications. To solve this problem, we start from the optimization +perspective, and leverage the relationship between loss landscape geometry and +model generalization ability. By analyzing the loss landscape of the +state-of-the-art method and the widely-used Sharpness-aware Minimization (SAM), +we conclude that the trade-off performance correlates with both loss value and +loss sharpness, and each of them is indispensable. However, we find that the +optimization gradient of existing methods cannot always maintain high consistency +with both loss value and loss sharpness during the whole optimization +procedure. To this end, we propose a novel SAM-based method for prompt +learning, denoted as Gradient Constrained Sharpness-aware Context Optimization +(GCSCoOp), which dynamically constrains the optimization gradient, thus achieving +the above two-fold optimization objective simultaneously. Extensive experiments +verify the effectiveness of GCSCoOp in the trade-off problem. + +
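+ For readers unfamiliar with SAM, the snippet below sketches one generic sharpness-aware
+update (perturb parameters toward the worst case within a small ball, then descend with the
+gradient computed there); the gradient constraint specific to GCSCoOp and the prompt-learning
+setup are omitted, and the tiny model and data are placeholders.
+
+import torch
+
+def sam_step(model, loss_fn, optimizer, rho=0.05):
+    """One generic Sharpness-Aware Minimization step."""
+    loss = loss_fn(model)
+    loss.backward()
+    grads = [p.grad for p in model.parameters() if p.grad is not None]
+    norm = torch.norm(torch.stack([g.norm() for g in grads]))
+    eps = []
+    with torch.no_grad():
+        for p in model.parameters():
+            if p.grad is None:
+                continue
+            e = rho * p.grad / (norm + 1e-12)   # ascend to the worst case
+            p.add_(e)
+            eps.append((p, e))
+    optimizer.zero_grad()
+    loss_fn(model).backward()                   # gradient at the perturbed point
+    with torch.no_grad():
+        for p, e in eps:
+            p.sub_(e)                           # restore original parameters
+    optimizer.step()
+    optimizer.zero_grad()
+    return loss.item()
+
+model = torch.nn.Linear(4, 1)
+opt = torch.optim.SGD(model.parameters(), lr=0.1)
+x, y = torch.randn(16, 4), torch.randn(16, 1)
+print(sam_step(model, lambda m: torch.nn.functional.mse_loss(m(x), y), opt))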
+
+ comment: 19 pages 11 figures +
+
+
+
+
+ + ☆ TFNet: Exploiting Temporal Cues for Fast and Accurate LiDAR Semantic + Segmentation + + +
+ LiDAR semantic segmentation plays a crucial role in enabling autonomous +driving and robots to understand their surroundings accurately and robustly. +There are different types of methods, such as point-based, range image-based, +and polar-based. Among these, range image-based methods are widely used due to +their balance between accuracy and speed. However, they face a significant +challenge known as the ``many-to-one'' problem caused by the range image's +limited horizontal and vertical angular resolution, where around 20% of the 3D +points are occluded during model inference based on our observation. In this +paper, we present TFNet, a range image-based LiDAR semantic segmentation method +that utilizes temporal information to address this issue. Specifically, we +incorporate a temporal fusion layer to extract useful information from previous +scans and integrate it with the current scan. We then design a max-voting-based +post-processing technique to correct false predictions, particularly those +caused by the ``many-to-one'' issue. Experiments on two benchmarks and seven +backbones of three modalities demonstrate the effectiveness and scalability of +our proposed method. + +
+
+
+
+
+ + ☆ MC-NeRF: Muti-Camera Neural Radiance Fields for Muti-Camera Image + Acquisition Systems + + +
+ Neural Radiance Fields (NeRF) employ multi-view images for 3D scene +representation and have shown remarkable performance. As one of the primary +sources of multi-view images, multi-camera systems encounter challenges such as +varying intrinsic parameters and frequent pose changes. Most previous +NeRF-based methods assume a single global camera and seldom consider +scenarios with multiple cameras. Besides, some pose-robust methods still remain +susceptible to suboptimal solutions when poses are poorly initialized. In this +paper, we propose MC-NeRF, a method that can jointly optimize both intrinsic and +extrinsic parameters for bundle-adjusting Neural Radiance Fields. Firstly, we +conduct a theoretical analysis to tackle the degenerate case and coupling issue +that arise from the joint optimization between intrinsic and extrinsic +parameters. Secondly, based on the proposed solutions, we introduce an +efficient calibration image acquisition scheme for multi-camera systems, +including the design of the calibration object. Lastly, we present a global +end-to-end network with a training sequence that enables the regression of +intrinsic and extrinsic parameters, along with the rendering network. Moreover, +since most existing datasets are designed for a single camera, we create a new dataset +that includes four different styles of multi-camera acquisition systems, +allowing readers to generate custom datasets. Experiments confirm the +effectiveness of our method when each image corresponds to different camera +parameters. Specifically, we adopt up to 110 images with 110 different +intrinsic and extrinsic parameters, to achieve 3D scene representation without +providing initial poses. The code and supplementary materials are available at +https://in2-viaun.github.io/MC-NeRF. + +
+
+ comment: This manuscript is currently under review +
+
+
+
+
+ + ☆ Large-scale Weakly Supervised Learning for Road Extraction from + Satellite Imagery + + +
+ Automatic road extraction from satellite imagery using deep learning is a +viable alternative to traditional manual mapping. Therefore, it has received +considerable attention recently. However, most of the existing methods are +supervised and require pixel-level labeling, which is tedious and error-prone. +To make matters worse, the earth has a diverse range of terrain, vegetation, +and man-made objects. It is well known that models trained in one area +generalize poorly to other areas. Various shooting conditions such as lighting and +angle, as well as different image processing techniques, further complicate the +issue. It is impractical to develop training data to cover all image styles. +This paper proposes to leverage OpenStreetMap road data as weak labels and +large-scale satellite imagery to pre-train semantic segmentation models. Our +extensive experimental results show that the prediction accuracy increases with +the amount of weakly labeled data, as well as the road density in the areas +chosen for training. Using as much as 100 times more data than the widely used +DeepGlobe road dataset, our model with the D-LinkNet architecture and the +ResNet-50 backbone exceeds the top performer of the current DeepGlobe +leaderboard. Furthermore, due to large-scale pre-training, our model +generalizes much better than those trained with only the curated datasets, +implying great application potential. + +
+
+
+
+
+ + ☆ Decomposition of linear tensor transformations + + +
+ One of the main issues in computing a tensor decomposition is how to choose +the number of rank-one components, since there are no finite algorithms for +determining the rank of a tensor. A commonly used approach for this purpose is +to find a low-dimensional subspace by solving an optimization problem and +assuming the number of components is fixed. However, even though this algorithm +is efficient and easy to implement, it often converges to poor local minima and +suffers from outliers and noise. The aim of this paper is to develop a +mathematical framework for exact tensor decomposition that is able to represent +a tensor as the sum of a finite number of low-rank tensors. In this paper, three +different problems are addressed to derive: i) the decomposition of a +non-negative self-adjoint tensor operator; ii) the decomposition of a linear +tensor transformation; iii) the decomposition of a generic tensor. + +
+
+ comment: arXiv admin note: text overlap with arXiv:2305.02803 +
+
+
+
+
+ + ☆ What Matters to Enhance Traffic Rule Compliance of Imitation Learning + for Automated Driving + + +
+ More research attention has recently been given to end-to-end autonomous +driving technologies where the entire driving pipeline is replaced with a +single neural network because of its simpler structure and faster inference +time. Despite this appealing approach largely reducing the components in the +driving pipeline, its simplicity also leads to interpretability problems and +safety issues (arXiv:2003.06404). The trained policy is not always compliant with +traffic rules, and it is also hard to discover the reason for the +misbehavior because of the lack of intermediate outputs. Meanwhile, sensors are +also critical to the security and feasibility of autonomous driving, as they perceive the +surrounding environment under complex driving scenarios. In this paper, we +propose P-CSG, a novel penalty-based imitation learning approach with cross +semantics generation sensor fusion technologies to increase the overall +performance of End-to-End Autonomous Driving. We conducted an assessment of our +model's performance using the Town 05 Long benchmark, achieving an impressive +driving score improvement of over 15%. Furthermore, we conducted robustness +evaluations against adversarial attacks like FGSM and Dot attacks, revealing a +substantial increase in robustness compared to baseline models. More detailed +information, such as code-based resources, ablation studies, and videos, can be +found at https://hk-zh.github.io/p-csg-plus. + +
+
+ comment: 8 pages, 2 figures +
+
+
+
+
+ + ☆ For A More Comprehensive Evaluation of 6DoF Object Pose Tracking + + +
+ Previous evaluations on 6DoF object pose tracking have presented obvious +limitations along with the development of this area. In particular, the +evaluation protocols are not unified for different methods, the widely-used +YCBV dataset contains significant annotation error, and the error metrics may +also be biased. As a result, it is hard to fairly compare the methods, which has +become a big obstacle for developing new algorithms. In this paper, we +contribute a unified benchmark to address the above problems. For more accurate +annotation of YCBV, we propose a multi-view multi-object global pose refinement +method, which can jointly refine the poses of all objects and view cameras, +resulting in sub-pixel sub-millimeter alignment errors. The limitations of +previous scoring methods and error metrics are analyzed, based on which we +introduce our improved evaluation methods. The unified benchmark takes both +YCBV and BCOT as base datasets, which are shown to be complementary in scene +categories. In experiments, we validate the precision and reliability of the +proposed global pose refinement method with a realistic semi-synthesized +dataset particularly for YCBV, and then present the benchmark results unifying +learning&non-learning and RGB&RGBD methods, with some findings not reported in +previous studies. + +
+
+
+
+
+ + ☆ Virchow: A Million-Slide Digital Pathology Foundation Model + + +
+ Computational pathology uses artificial intelligence to enable precision +medicine and decision support systems through the analysis of whole slide +images. It has the potential to revolutionize the diagnosis and treatment of +cancer. However, a major challenge to this objective is that for many specific +computational pathology tasks the amount of data is inadequate for development. +To address this challenge, we created Virchow, a 632 million parameter deep +neural network foundation model for computational pathology. Using +self-supervised learning, Virchow is trained on 1.5 million hematoxylin and +eosin stained whole slide images from diverse tissue groups, which is orders of +magnitude more data than previous works. When evaluated on downstream tasks +including tile-level pan-cancer detection and subtyping and slide-level +biomarker prediction, Virchow outperforms state-of-the-art systems both on +internal datasets drawn from the same population as the pretraining data as +well as external public datasets. Virchow achieves 93% balanced accuracy for +pancancer tile classification, and AUCs of 0.983 for colon microsatellite +instability status prediction and 0.967 for breast CDH1 status prediction. The +gains in performance highlight the importance of pretraining on massive +pathology image datasets, suggesting pretraining on even larger datasets could +continue improving performance for many high-impact applications where limited +amounts of training data are available, such as drug outcome prediction. + +
+
+
+
+
+ + ☆ PRE: Vision-Language Prompt Learning with Reparameterization Encoder + + +
+ Large pre-trained vision-language models such as CLIP have demonstrated great +potential in zero-shot transferability to downstream tasks. However, to attain +optimal performance, the manual selection of prompts is necessary to improve +alignment between the downstream image distribution and the textual class +descriptions. This manual prompt engineering is the major challenge for +deploying such models in practice since it requires domain expertise and is +extremely time-consuming. To avoid non-trivial prompt engineering, the recent work +Context Optimization (CoOp) introduced the concept of prompt learning to the +vision domain using learnable textual tokens. While CoOp can achieve +substantial improvements over manual prompts, its learned context generalizes +worse to wider unseen classes within the same dataset. In this work, we +present Prompt Learning with Reparameterization Encoder (PRE) - a simple and +efficient method that enhances the generalization ability of the learnable +prompt to unseen classes while maintaining the capacity to learn Base classes. +Instead of directly optimizing the prompts, PRE employs a prompt encoder to +reparameterize the input prompt embeddings, enhancing the exploration of +task-specific knowledge from few-shot samples. Experiments and extensive +ablation studies on 8 benchmarks demonstrate that our approach is an efficient +method for prompt learning. Specifically, PRE achieves a notable enhancement of +5.60% in average accuracy on New classes and 3% in Harmonic mean compared to +CoOp in the 16-shot setting, all achieved with reasonable training time. + +
+
+ comment: 8 pages excluding References and Appendix +
+
+
+
+
+ + ☆ Co-Salient Object Detection with Semantic-Level Consensus Extraction and + Dispersion ACM MM 2023 + + +
+ Given a group of images, co-salient object detection (CoSOD) aims to +highlight the common salient object in each image. There are two factors +closely related to the success of this task, namely consensus extraction and +the dispersion of consensus to each image. Most previous works represent the +group consensus using local features, while we instead utilize a hierarchical +Transformer module for extracting semantic-level consensus. Therefore, it can +obtain a more comprehensive representation of the common object category, and +exclude interference from other objects that share local similarities with the +target object. In addition, we propose a Transformer-based dispersion module +that takes into account the variation of the co-salient object in different +scenes. It distributes the consensus to the image feature maps in an +image-specific way while making full use of interactions within the group. +These two modules are integrated with a ViT encoder and an FPN-like decoder to +form an end-to-end trainable network, without additional branches or auxiliary +losses. The proposed method is evaluated on three commonly used CoSOD datasets +and achieves state-of-the-art performance. + +
+
+ comment: Accepted by ACM MM 2023 +
+
+
+
+
+ + ☆ DT-NeRF: Decomposed Triplane-Hash Neural Radiance Fields for + High-Fidelity Talking Portrait Synthesis ICASSP 2024 + + +
+ In this paper, we present the decomposed triplane-hash neural radiance fields +(DT-NeRF), a framework that significantly improves the photorealistic rendering +of talking faces and achieves state-of-the-art results on key evaluation +datasets. Our architecture decomposes the facial region into two specialized +triplanes: one specialized for representing the mouth, and the other for the +broader facial features. We introduce audio features as residual terms and +integrate them as query vectors into our model through an audio-mouth-face +transformer. Additionally, our method leverages the capabilities of Neural +Radiance Fields (NeRF) to enrich the volumetric representation of the entire +face through additive volumetric rendering techniques. Comprehensive +experimental evaluations corroborate the effectiveness and superiority of our +proposed approach. + +
+
+ comment: 5 pages, 5 figures. Submitted to ICASSP 2024 +
+
+
+
+
+ + ☆ OmnimatteRF: Robust Omnimatte with 3D Background Modeling ICCV 2023 + + +
+ Video matting has broad applications, from adding interesting effects to +casually captured movies to assisting video production professionals. Matting +with associated effects such as shadows and reflections has also attracted +increasing research activity, and methods like Omnimatte have been proposed to +separate dynamic foreground objects of interest into their own layers. However, +prior works represent video backgrounds as 2D image layers, limiting their +capacity to express more complicated scenes, thus hindering application to +real-world videos. In this paper, we propose a novel video matting method, +OmnimatteRF, that combines dynamic 2D foreground layers and a 3D background +model. The 2D layers preserve the details of the subjects, while the 3D +background robustly reconstructs scenes in real-world videos. Extensive +experiments demonstrate that our method reconstructs scenes with better quality +on various videos. + +
+
+ comment: ICCV 2023. Project page: https://omnimatte-rf.github.io/ +
+
+
+
+
+ + ☆ NutritionVerse: Empirical Study of Various Dietary Intake Estimation + Approaches + + +
+ Accurate dietary intake estimation is critical for informing policies and +programs to support healthy eating, as malnutrition has been directly linked to +decreased quality of life. However self-reporting methods such as food diaries +suffer from substantial bias. Other conventional dietary assessment techniques +and emerging alternative approaches such as mobile applications incur high time +costs and may necessitate trained personnel. Recent work has focused on using +computer vision and machine learning to automatically estimate dietary intake +from food images, but the lack of comprehensive datasets with diverse +viewpoints, modalities and food annotations hinders the accuracy and realism of +such methods. To address this limitation, we introduce NutritionVerse-Synth, +the first large-scale dataset of 84,984 photorealistic synthetic 2D food images +with associated dietary information and multimodal annotations (including depth +images, instance masks, and semantic masks). Additionally, we collect a real +image dataset, NutritionVerse-Real, containing 889 images of 251 dishes to +evaluate realism. Leveraging these novel datasets, we develop and benchmark +NutritionVerse, an empirical study of various dietary intake estimation +approaches, including indirect segmentation-based and direct prediction +networks. We further fine-tune models pretrained on synthetic data with real +images to provide insights into the fusion of synthetic and real data. Finally, +we release both datasets (NutritionVerse-Synth, NutritionVerse-Real) on +https://www.kaggle.com/nutritionverse/datasets as part of an open initiative to +accelerate machine learning for dietary sensing. + +
+
+
+
+
+ + ☆ Dataset Condensation via Generative Model + + +
+ Dataset condensation aims to condense a large dataset with many training samples into a small set. Previous methods usually condense the dataset into the pixel format. However, this suffers from slow optimization and a large number of parameters to be optimized. When increasing image resolutions and classes, the number of learnable parameters grows accordingly, prohibiting condensation methods from scaling up to large datasets with diverse classes. Moreover, the relations among condensed samples have been neglected, and hence the feature distribution of condensed samples is often not diverse. To solve these problems, we propose to condense the dataset into another format, a generative model. Such a novel format allows for the condensation of large datasets because the size of the generative model remains relatively stable as the number of classes or image resolution increases. Furthermore, an intra-class and an inter-class loss are proposed to model the relations among condensed samples. The intra-class loss aims to create more diverse samples for each class by pushing each sample away from the others of the same class. Meanwhile, the inter-class loss increases the discriminability of samples by widening the gap between the centers of different classes. Extensive comparisons with state-of-the-art methods and our ablation studies confirm the effectiveness of our method and its individual components. To the best of our knowledge, we are the first to successfully conduct condensation on ImageNet-1k.
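To make the two relation losses described above concrete, here is a minimal, hypothetical PyTorch sketch of an intra-class diversity term (pushing condensed samples of the same class apart) and an inter-class separation term (pushing class centers apart). The function names, cosine-similarity choice, and loss weights are illustrative assumptions, not the paper's implementation.

```python
import torch
import torch.nn.functional as F

def intra_class_loss(feats: torch.Tensor) -> torch.Tensor:
    """Diversity term: penalize pairwise cosine similarity between
    condensed samples of the same class. feats: (n, d), one class."""
    f = F.normalize(feats, dim=1)
    sim = f @ f.t()
    sim.fill_diagonal_(0.0)          # ignore self-similarity
    return sim.mean()

def inter_class_loss(feats: torch.Tensor, labels: torch.Tensor) -> torch.Tensor:
    """Discriminability term: penalize similarity between different class
    centers, which widens the gap between them when minimized."""
    centers = torch.stack([feats[labels == c].mean(0) for c in labels.unique()])
    c = F.normalize(centers, dim=1)
    sim = c @ c.t()
    sim.fill_diagonal_(0.0)
    return sim.mean()

# Hypothetical total objective for the condensed/generated samples:
# total = matching_loss + w_intra * intra_class_loss(class_feats) + w_inter * inter_class_loss(feats, labels)
```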
+
+ comment: old work, done in 2022
+
+
+
+
+ + ☆ CoRF : Colorizing Radiance Fields using Knowledge Distillation ICCV 2023 + + +
+ Neural radiance field (NeRF) based methods enable high-quality novel-view +synthesis for multi-view images. This work presents a method for synthesizing +colorized novel views from input grey-scale multi-view images. When we apply +image or video-based colorization methods on the generated grey-scale novel +views, we observe artifacts due to inconsistency across views. Training a +radiance field network on the colorized grey-scale image sequence also does not +solve the 3D consistency issue. We propose a distillation based method to +transfer color knowledge from the colorization networks trained on natural +images to the radiance field network. Specifically, our method uses the +radiance field network as a 3D representation and transfers knowledge from +existing 2D colorization methods. The experimental results demonstrate that the +proposed method produces superior colorized novel views for indoor and outdoor +scenes while maintaining cross-view consistency than baselines. Further, we +show the efficacy of our method on applications like colorization of radiance +field network trained from 1.) Infra-Red (IR) multi-view images and 2.) Old +grey-scale multi-view image sequences. + +
+
+ comment: AI3DCC @ ICCV 2023 +
+
+
+
+
+ + ☆ Towards Robust and Unconstrained Full Range of Rotation Head Pose + Estimation + + +
+ Estimating the head pose of a person is a crucial problem for numerous applications, yet it is mainly addressed as a subtask of frontal pose prediction. We present a novel method for unconstrained end-to-end head pose estimation to tackle the challenging task of full-range-of-orientation head pose prediction. We address the issue of ambiguous rotation labels by introducing the rotation matrix formalism for our ground truth data and propose a continuous 6D rotation matrix representation for efficient and robust direct regression. This allows the network to efficiently learn the appearance of full rotations and to overcome the limitations of the current state of the art. Together with newly accumulated training data that provides full head pose rotation coverage and a geodesic loss approach for stable learning, we design an advanced model that is able to predict an extended range of head orientations. An extensive evaluation on public datasets demonstrates that our method significantly outperforms other state-of-the-art methods in an efficient and robust manner, while its extended prediction range broadens the applicable scenarios. We open-source our training and testing code along with our trained models: https://github.com/thohemp/6DRepNet360.
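As background for the continuous 6D rotation representation and the geodesic loss mentioned above, the following sketch shows the widely used formulation: two predicted 3D vectors are orthonormalized by Gram-Schmidt to form a rotation matrix, and the loss is the rotation angle between predicted and ground-truth matrices. It illustrates the standard math, not the authors' exact code.

```python
import torch
import torch.nn.functional as F

def rotation_6d_to_matrix(x: torch.Tensor) -> torch.Tensor:
    """Map a 6D representation (two 3D vectors) to a rotation matrix via
    Gram-Schmidt orthonormalization. x: (..., 6) -> (..., 3, 3)."""
    a1, a2 = x[..., :3], x[..., 3:]
    b1 = F.normalize(a1, dim=-1)
    b2 = F.normalize(a2 - (b1 * a2).sum(-1, keepdim=True) * b1, dim=-1)
    b3 = torch.cross(b1, b2, dim=-1)
    return torch.stack((b1, b2, b3), dim=-2)

def geodesic_loss(R_pred: torch.Tensor, R_gt: torch.Tensor) -> torch.Tensor:
    """Mean rotation angle between predicted and ground-truth matrices."""
    m = R_pred @ R_gt.transpose(-1, -2)
    cos = (m.diagonal(dim1=-2, dim2=-1).sum(-1) - 1.0) / 2.0
    return torch.acos(cos.clamp(-1.0 + 1e-7, 1.0 - 1e-7)).mean()
```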
+
+
+
+
+ + ☆ Indoor Scene Reconstruction with Fine-Grained Details Using Hybrid + Representation and Normal Prior Enhancement + + +
+ The reconstruction of indoor scenes from multi-view RGB images is challenging +due to the coexistence of flat and texture-less regions alongside delicate and +fine-grained regions. Recent methods leverage neural radiance fields aided by +predicted surface normal priors to recover the scene geometry. These methods +excel in producing complete and smooth results for floor and wall areas. +However, they struggle to capture complex surfaces with high-frequency +structures due to the inadequate neural representation and the inaccurately +predicted normal priors. To improve the capacity of the implicit +representation, we propose a hybrid architecture to represent low-frequency and +high-frequency regions separately. To enhance the normal priors, we introduce a +simple yet effective image sharpening and denoising technique, coupled with a +network that estimates the pixel-wise uncertainty of the predicted surface +normal vectors. Identifying such uncertainty can prevent our model from being +misled by unreliable surface normal supervisions that hinder the accurate +reconstruction of intricate geometries. Experiments on the benchmark datasets +show that our method significantly outperforms existing methods in terms of +reconstruction quality. + +
+
+
+
+
+ + ☆ SwitchGPT: Adapting Large Language Models for Non-Text Outputs + + +
+ Large Language Models (LLMs), primarily trained on text-based datasets, exhibit exceptional proficiency in understanding and executing complex linguistic instructions via text outputs. However, they falter when asked to generate non-text outputs. Concurrently, modality conversion models, such as text-to-image models, despite generating high-quality images, suffer from a lack of extensive textual pretraining. As a result, these models are only capable of accommodating specific image descriptions rather than comprehending more complex instructions. To bridge this gap, we propose a novel approach, SwitchGPT, which, from a modality conversion perspective, evolves a text-based LLM into a multi-modal one. We specifically employ a minimal dataset to instruct LLMs to recognize the intended output modality as directed by the instructions. Consequently, the adapted LLM can effectively summon various off-the-shelf modality conversion models from model zoos to generate non-text responses. This circumvents the necessity for complicated pretraining that typically requires immense quantities of paired multi-modal data, while simultaneously inheriting the extensive knowledge of LLMs and the ability of high-quality generative models. To evaluate and compare the adapted multi-modal LLM with its traditional counterparts, we have constructed a multi-modal instruction benchmark that solicits diverse modality outputs. The experimental results reveal that, with minimal training, LLMs can be conveniently adapted to comprehend requests for non-text responses, thus achieving higher flexibility in multi-modal scenarios. Code and data will be made available at https://github.com/xinke-wang/SwitchGPT.
+
+
+
+
+ + ☆ Road Disease Detection based on Latent Domain Background Feature + Separation and Suppression + + +
+ Road disease detection is challenging due to the small proportion of road damage in the target region and the diverse backgrounds, which introduce a large amount of domain information. Moreover, disease categories are highly similar, which makes detection more difficult. In this paper, we propose a new LDBFSS (Latent Domain Background Feature Separation and Suppression) network, which performs background information separation and suppression without domain supervision, together with contrastive enhancement of object features. We combine our LDBFSS network with the YOLOv5 model to enhance disease features for better road disease detection. As the components of the LDBFSS network, we first design a latent domain discovery module and a domain adversarial learning module that obtain pseudo domain labels in an unsupervised manner, guiding the domain discriminator and the model to train adversarially and suppress background information. In addition, we introduce a contrastive learning module and design a k-instance contrastive loss that optimizes the disease feature representation by increasing the inter-class distance and reducing the intra-class distance of object features. We conducted experiments on two road disease detection datasets, GRDDC and CNRDD, and compared against other models; the results show an improvement of nearly 4% over the best competing model on the GRDDC dataset and an improvement of 4.6% on the CNRDD dataset. The experimental results demonstrate the effectiveness and superiority of our model.
+
+
+
+
+ + ☆ Learning Quasi-Static 3D Models of Markerless Deformable Linear Objects + for Bimanual Robotic Manipulation + + +
+ The robotic manipulation of Deformable Linear Objects (DLOs) is a vital and +challenging task that is important in many practical applications. Classical +model-based approaches to this problem require an accurate model to capture how +robot motions affect the deformation of the DLO. Nowadays, data-driven models +offer the best tradeoff between quality and computation time. This paper +analyzes several learning-based 3D models of the DLO and proposes a new one +based on the Transformer architecture that achieves superior accuracy, even on +the DLOs of different lengths, thanks to the proposed scaling method. Moreover, +we introduce a data augmentation technique, which improves the prediction +performance of almost all considered DLO data-driven models. Thanks to this +technique, even a simple Multilayer Perceptron (MLP) achieves close to +state-of-the-art performance while being significantly faster to evaluate. In +the experiments, we compare the performance of the learning-based 3D models of +the DLO on several challenging datasets quantitatively and demonstrate their +applicability in the task of shaping a DLO. + +
+
+ comment: Under review for IEEE Robotics and Automation Letters +
+
+
+
+
+ + ☆ Universality of underlying mechanism for successful deep learning + + +
+ An underlying mechanism for successful deep learning (DL) with a limited deep +architecture and dataset, namely VGG-16 on CIFAR-10, was recently presented +based on a quantitative method to measure the quality of a single filter in +each layer. In this method, each filter identifies small clusters of possible +output labels, with additional noise selected as labels out of the clusters. +This feature is progressively sharpened with the layers, resulting in an +enhanced signal-to-noise ratio (SNR) and higher accuracy. In this study, the +suggested universal mechanism is verified for VGG-16 and EfficientNet-B0 +trained on the CIFAR-100 and ImageNet datasets with the following main results. +First, the accuracy progressively increases with the layers, whereas the noise +per filter typically progressively decreases. Second, for a given deep +architecture, the maximal error rate increases approximately linearly with the +number of output labels. Third, the average filter cluster size and the number +of clusters per filter at the last convolutional layer adjacent to the output +layer are almost independent of the number of dataset labels in the range [3, +1,000], while a high SNR is preserved. The presented DL mechanism suggests +several techniques, such as applying filter's cluster connections (AFCC), to +improve the computational complexity and accuracy of deep architectures and +furthermore pinpoints the simplification of pre-existing structures while +maintaining their accuracies. + +
+
+ comment: 27 pages,5 figures, 6 tables. arXiv admin note: text overlap with + arXiv:2305.18078 +
+
+
+
+
+ + ☆ A Multi-scale Generalized Shrinkage Threshold Network for Image Blind + Deblurring in Remote Sensing + + +
+ Remote sensing images are essential for many earth science applications, but their quality can be degraded due to limitations in sensor technology and complex imaging environments. To address this, various remote sensing image deblurring methods have been developed to restore sharp, high-quality images from degraded observational data. However, most traditional model-based deblurring methods usually require predefined hand-crafted prior assumptions, which are difficult to handle in complex applications, and most deep learning-based deblurring methods are designed as black boxes, lacking transparency and interpretability. In this work, we propose a novel blind deblurring learning framework based on alternating iterations of shrinkage thresholds, alternately updating blur kernels and images, which provides a theoretical foundation for the network design. Additionally, we propose a learnable blur kernel proximal mapping module to improve blur kernel estimation in the kernel domain. Then, we propose a deep proximal mapping module in the image domain, which combines a generalized shrinkage threshold operator and a multi-scale prior feature extraction block. This module also introduces an attention mechanism to adaptively adjust the importance of the prior, thus avoiding the drawbacks of hand-crafted image prior terms. A novel multi-scale generalized shrinkage threshold network (MGSTNet) is thus designed to focus specifically on learning deep geometric prior features to enhance image restoration. Experiments demonstrate the superiority of our MGSTNet framework on remote sensing image datasets compared to existing deblurring methods.
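For context, the classical shrinkage (soft-threshold) operator that a learned, generalized shrinkage threshold extends can be written in a few lines. The ISTA-style step below uses hypothetical callables `A` and `A_t` standing in for the blur operator and its adjoint; it illustrates the underlying iteration the framework unrolls, not MGSTNet itself.

```python
import torch

def soft_threshold(x: torch.Tensor, lam: float) -> torch.Tensor:
    """Classical shrinkage / soft-threshold operator used in ISTA-type
    algorithms: sign(x) * max(|x| - lam, 0)."""
    return torch.sign(x) * torch.clamp(x.abs() - lam, min=0.0)

def ista_step(x, y, A, A_t, lam, step):
    """One iteration for  min_x 0.5 * ||A(x) - y||^2 + lam * ||x||_1,
    where A / A_t are assumed blur and adjoint operators (illustrative)."""
    grad = A_t(A(x) - y)                 # gradient of the data-fidelity term
    return soft_threshold(x - step * grad, step * lam)
```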
+
+ comment: 12 pages
+
+
+
+
+ + ☆ Dhan-Shomadhan: A Dataset of Rice Leaf Disease Classification for + Bangladeshi Local Rice + + +
+ This dataset covers almost all of the harmful diseases affecting rice in Bangladesh. It consists of 1,106 images of five harmful diseases, namely Brown Spot, Leaf Scald, Rice Blast, Rice Tungro, and Sheath Blight, captured against two different backgrounds: field background pictures and white background pictures. The two background variations help models trained on the dataset perform more accurately, so that the data can be used under field conditions as well as with white backgrounds for decision making. The data was collected from rice fields in the Dhaka Division. This dataset can be used for rice leaf disease classification and disease detection with computer vision and pattern recognition methods for different rice leaf diseases.
+
+
+
+
+ + ☆ RecycleNet: Latent Feature Recycling Leads to Iterative Decision + Refinement WACV + + +
+ Despite the remarkable success of deep learning systems over the last decade, a key difference still remains between neural network and human decision-making: as humans, we can not only form a decision on the spot but also ponder, revisiting an initial guess from different angles, distilling relevant information, and arriving at a better decision. Here, we propose RecycleNet, a latent feature recycling method that instills this pondering capability in neural networks, allowing them to refine initial decisions over a number of recycling steps in which outputs are fed back into earlier network layers in an iterative fashion. This approach makes minimal assumptions about the neural network architecture and thus can be implemented in a wide variety of contexts. Using medical image segmentation as the evaluation environment, we show that latent feature recycling enables the network to iteratively refine initial predictions even beyond the iterations seen during training, converging towards an improved decision. We evaluate this across a variety of segmentation benchmarks and show consistent improvements even compared with top-performing segmentation methods. This allows trading increased computation time for improved performance, which can be beneficial, especially for safety-critical applications.
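A toy sketch of the recycling idea: the network's output is projected back and added to an early feature map, and the forward pass is repeated for a few recycling steps. The tiny architecture below is purely illustrative and much simpler than the segmentation networks evaluated in the paper; all layer choices are assumptions.

```python
import torch
import torch.nn as nn

class RecyclingSegNet(nn.Module):
    """Toy illustration of latent feature recycling: previous logits are
    projected back into an early feature map, then the network runs again."""
    def __init__(self, in_ch=1, n_classes=2, width=32):
        super().__init__()
        self.stem = nn.Conv2d(in_ch, width, 3, padding=1)
        self.body = nn.Sequential(nn.ReLU(), nn.Conv2d(width, width, 3, padding=1), nn.ReLU())
        self.head = nn.Conv2d(width, n_classes, 1)
        self.recycle = nn.Conv2d(n_classes, width, 1)   # maps the previous decision back to features

    def forward(self, x, steps: int = 3):
        feat = self.stem(x)
        logits = self.head(self.body(feat))
        for _ in range(steps - 1):
            feat = self.stem(x) + self.recycle(logits)  # recycle the previous decision
            logits = self.head(self.body(feat))
        return logits

# out = RecyclingSegNet()(torch.randn(1, 1, 64, 64), steps=4)  # more steps = more "pondering"
```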
+
+ comment: Accepted at 2024 Winter Conference on Applications of Computer Vision + (WACV) +
+
+
+
+
+ + ☆ Learning Environment-Aware Affordance for 3D Articulated Object + Manipulation under Occlusions + + +
+ Perceiving and manipulating 3D articulated objects in diverse environments is +essential for home-assistant robots. Recent studies have shown that point-level +affordance provides actionable priors for downstream manipulation tasks. +However, existing works primarily focus on single-object scenarios with +homogeneous agents, overlooking the realistic constraints imposed by the +environment and the agent's morphology, e.g., occlusions and physical +limitations. In this paper, we propose an environment-aware affordance +framework that incorporates both object-level actionable priors and environment +constraints. Unlike object-centric affordance approaches, learning +environment-aware affordance faces the challenge of combinatorial explosion due +to the complexity of various occlusions, characterized by their quantities, +geometries, positions and poses. To address this and enhance data efficiency, +we introduce a novel contrastive affordance learning framework capable of +training on scenes containing a single occluder and generalizing to scenes with +complex occluder combinations. Experiments demonstrate the effectiveness of our +proposed approach in learning affordance considering environment constraints. + +
+
+
+
+
+ + ☆ DiffTalker: Co-driven audio-image diffusion for talking faces via + intermediate landmarks ICASSP 2024 + + +
+ Generating realistic talking faces is a complex and widely discussed task +with numerous applications. In this paper, we present DiffTalker, a novel model +designed to generate lifelike talking faces through audio and landmark +co-driving. DiffTalker addresses the challenges associated with directly +applying diffusion models to audio control, which are traditionally trained on +text-image pairs. DiffTalker consists of two agent networks: a +transformer-based landmarks completion network for geometric accuracy and a +diffusion-based face generation network for texture details. Landmarks play a +pivotal role in establishing a seamless connection between the audio and image +domains, facilitating the incorporation of knowledge from pre-trained diffusion +models. This innovative approach efficiently produces articulate-speaking +faces. Experimental results showcase DiffTalker's superior performance in +producing clear and geometrically accurate talking faces, all without the need +for additional alignment between audio and image features. + +
+
+ comment: Submitted to ICASSP 2024
+
+
+
+
+ + ☆ Efficiently Robustify Pre-trained Models + + +
+ A recent trend in deep learning has been towards training large scale models with high parameter counts on big datasets. However, the robustness of such large scale models in real-world settings is still a less-explored topic. In this work, we first benchmark the performance of these models under different perturbations and datasets representing real-world shifts, and highlight their degrading performance under these shifts. We then discuss how existing robustification schemes based on complete model fine-tuning might not be a scalable option for very large scale networks and can also cause them to forget some of their desired characteristics. Finally, we propose a simple and cost-effective method to solve this problem, inspired by the knowledge transfer literature. It involves robustifying smaller models at a lower computation cost, and then using them as teachers to tune a fraction of these large scale networks, reducing the overall computational overhead. We evaluate our proposed method under various vision perturbations, including the ImageNet-C, R, S, and A datasets, and also in transfer learning and zero-shot evaluation setups on different datasets. Benchmark results show that our method is able to induce robustness into these large scale models efficiently, requiring significantly less time, and also preserves the transfer learning and zero-shot properties of the original model, which none of the existing methods are able to achieve.
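A hedged sketch of how the described teacher-to-student robustification could look in PyTorch: a small, already-robust teacher supervises the large model on perturbed inputs through a temperature-scaled KL loss, with only a fraction of the large model's parameters left trainable. The function names and the choice of which parameters to unfreeze are assumptions for illustration, not the paper's recipe.

```python
import torch
import torch.nn.functional as F

def robustness_distillation_loss(student_logits, teacher_logits, T: float = 2.0):
    """Temperature-scaled KL divergence from a small robust teacher."""
    p_t = F.softmax(teacher_logits / T, dim=-1)
    log_p_s = F.log_softmax(student_logits / T, dim=-1)
    return F.kl_div(log_p_s, p_t, reduction="batchmean") * T * T

def tune_step(large_model, small_robust_teacher, images_perturbed, optimizer):
    """One hypothetical update: only the parameters passed to `optimizer`
    (e.g. the last block of the large model) are actually tuned."""
    with torch.no_grad():
        t_logits = small_robust_teacher(images_perturbed)
    s_logits = large_model(images_perturbed)
    loss = robustness_distillation_loss(s_logits, t_logits)
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()
    return loss.item()
```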
+
+
+
+
+ + ☆ HDTR-Net: A Real-Time High-Definition Teeth Restoration Network for + Arbitrary Talking Face Generation Methods + + +
+ Talking Face Generation (TFG) aims to reconstruct facial movements to achieve +high natural lip movements from audio and facial features that are under +potential connections. Existing TFG methods have made significant advancements +to produce natural and realistic images. However, most work rarely takes visual +quality into consideration. It is challenging to ensure lip synchronization +while avoiding visual quality degradation in cross-modal generation methods. To +address this issue, we propose a universal High-Definition Teeth Restoration +Network, dubbed HDTR-Net, for arbitrary TFG methods. HDTR-Net can enhance teeth +regions at an extremely fast speed while maintaining synchronization, and +temporal consistency. In particular, we propose a Fine-Grained Feature Fusion +(FGFF) module to effectively capture fine texture feature information around +teeth and surrounding regions, and use these features to fine-grain the feature +map to enhance the clarity of teeth. Extensive experiments show that our method +can be adapted to arbitrary TFG methods without suffering from lip +synchronization and frame coherence. Another advantage of HDTR-Net is its +real-time generation ability. Also under the condition of high-definition +restoration of talking face video synthesis, its inference speed is $300\%$ +faster than the current state-of-the-art face restoration based on +super-resolution. + +
+
+ comment: 15pages, 6 figures, PRCV2023 +
+
+
+
+
+ + ☆ EP2P-Loc: End-to-End 3D Point to 2D Pixel Localization for Large-Scale + Visual Localization ICCV 2023 + + +
+ Visual localization is the task of estimating a 6-DoF camera pose of a query +image within a provided 3D reference map. Thanks to recent advances in various +3D sensors, 3D point clouds are becoming a more accurate and affordable option +for building the reference map, but research to match the points of 3D point +clouds with pixels in 2D images for visual localization remains challenging. +Existing approaches that jointly learn 2D-3D feature matching suffer from low +inliers due to representational differences between the two modalities, and the +methods that bypass this problem into classification have an issue of poor +refinement. In this work, we propose EP2P-Loc, a novel large-scale visual +localization method that mitigates such appearance discrepancy and enables +end-to-end training for pose estimation. To increase the number of inliers, we +propose a simple algorithm to remove invisible 3D points in the image, and find +all 2D-3D correspondences without keypoint detection. To reduce memory usage +and search complexity, we take a coarse-to-fine approach where we extract +patch-level features from 2D images, then perform 2D patch classification on +each 3D point, and obtain the exact corresponding 2D pixel coordinates through +positional encoding. Finally, for the first time in this task, we employ a +differentiable PnP for end-to-end training. In the experiments on newly curated +large-scale indoor and outdoor benchmarks based on 2D-3D-S and KITTI, we show +that our method achieves the state-of-the-art performance compared to existing +visual localization and image-to-point cloud registration methods. + +
+
+ comment: Accepted to ICCV 2023 +
+
+
+
+
+ + ☆ Detecting Unknown Attacks in IoT Environments: An Open Set Classifier + for Enhanced Network Intrusion Detection + + +
+ The widespread integration of Internet of Things (IoT) devices across all +facets of life has ushered in an era of interconnectedness, creating new +avenues for cybersecurity challenges and underscoring the need for robust +intrusion detection systems. However, traditional security systems are designed +with a closed-world perspective and often face challenges in dealing with the +ever-evolving threat landscape, where new and unfamiliar attacks are constantly +emerging. In this paper, we introduce a framework aimed at mitigating the open +set recognition (OSR) problem in the realm of Network Intrusion Detection +Systems (NIDS) tailored for IoT environments. Our framework capitalizes on +image-based representations of packet-level data, extracting spatial and +temporal patterns from network traffic. Additionally, we integrate stacking and +sub-clustering techniques, enabling the identification of unknown attacks by +effectively modeling the complex and diverse nature of benign behavior. The +empirical results prominently underscore the framework's efficacy, boasting an +impressive 88\% detection rate for previously unseen attacks when compared +against existing approaches and recent advancements. Future work will perform +extensive experimentation across various openness levels and attack scenarios, +further strengthening the adaptability and performance of our proposed solution +in safeguarding IoT environments. + +
+
+ comment: 6 Pages, 5 figures +
+
+
+
+
+ + ☆ Research on self-cross transformer model of point cloud change detector + + +
+ With the vigorous development of the urban construction industry, engineering deformations or changes often occur during the construction process. To combat this, it is necessary to detect changes in order to find construction defects in time, ensure the integrity of the project, and reduce labor costs, as well as to avoid inconvenience and hazards on roads. In the study of change detection in 3D point clouds, researchers have published a variety of methods. Most are based on traditional threshold distance methods (C2C, M3C2, M3C2-EP), and some convert the 3D point clouds into DSMs, which loses much of the original information. Although deep learning is used in remote sensing methods, for change detection in 3D point clouds the data is mostly converted into two-dimensional patches, and neural networks are rarely applied to the points directly; we prefer approaches in which the network predicts change at the level of pixels or points. Therefore, in this article, we build a network for 3D point cloud change detection and propose a new Cross transformer module suitable for change detection. We also simulate tunneling data for change detection and run test experiments with our network.
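To illustrate the kind of cross-attention a "Cross transformer" module implies, here is a minimal sketch in which per-point features from one acquisition epoch attend over features from the other epoch, so change cues can be compared point-wise. This is a generic cross-attention block under assumed feature shapes, not the authors' module.

```python
import torch
import torch.nn as nn

class CrossAttentionBlock(nn.Module):
    """Features of epoch-1 points (queries) attend to epoch-2 point
    features (keys/values); the residual output can feed a change head."""
    def __init__(self, dim: int = 64, heads: int = 4):
        super().__init__()
        self.attn = nn.MultiheadAttention(dim, heads, batch_first=True)
        self.norm = nn.LayerNorm(dim)

    def forward(self, feats_t1: torch.Tensor, feats_t2: torch.Tensor) -> torch.Tensor:
        # feats_t1: (B, N, D) query points; feats_t2: (B, M, D) reference points
        attended, _ = self.attn(feats_t1, feats_t2, feats_t2)
        return self.norm(feats_t1 + attended)

# block = CrossAttentionBlock(); out = block(torch.randn(2, 1024, 64), torch.randn(2, 1024, 64))
```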
+
+
+
+
+ + ☆ DePT: Decoupled Prompt Tuning + + +
+ This work breaks through the Base-New Tradeoff (BNT)dilemma in prompt tuning, +i.e., the better the tuned model generalizes to the base (or target) task, the +worse it generalizes to new tasks, and vice versa. Specifically, through an +in-depth analysis of the learned features of the base and new tasks, we observe +that the BNT stems from a channel bias issue, i.e., the vast majority of +feature channels are occupied by base-specific knowledge, resulting in the +collapse of taskshared knowledge important to new tasks. To address this, we +propose the Decoupled Prompt Tuning (DePT) framework, which decouples +base-specific knowledge from feature channels into an isolated feature space +during prompt tuning, so as to maximally preserve task-shared knowledge in the +original feature space for achieving better zero-shot generalization on new +tasks. Importantly, our DePT is orthogonal to existing prompt tuning methods, +hence it can improve all of them. Extensive experiments on 11 datasets show the +strong flexibility and effectiveness of DePT. Our code and pretrained models +are available at https://github.com/Koorye/DePT. + +
+
+ comment: 13 pages +
+
+
+
+
+ + ☆ Physical Invisible Backdoor Based on Camera Imaging + + +
+ Backdoor attack aims to compromise a model, which returns an adversary-wanted +output when a specific trigger pattern appears yet behaves normally for clean +inputs. Current backdoor attacks require changing pixels of clean images, which +results in poor stealthiness of attacks and increases the difficulty of the +physical implementation. This paper proposes a novel physical invisible +backdoor based on camera imaging without changing nature image pixels. +Specifically, a compromised model returns a target label for images taken by a +particular camera, while it returns correct results for other images. To +implement and evaluate the proposed backdoor, we take shots of different +objects from multi-angles using multiple smartphones to build a new dataset of +21,500 images. Conventional backdoor attacks work ineffectively with some +classical models, such as ResNet18, over the above-mentioned dataset. +Therefore, we propose a three-step training strategy to mount the backdoor +attack. First, we design and train a camera identification model with the phone +IDs to extract the camera fingerprint feature. Subsequently, we elaborate a +special network architecture, which is easily compromised by our backdoor +attack, by leveraging the attributes of the CFA interpolation algorithm and +combining it with the feature extraction block in the camera identification +model. Finally, we transfer the backdoor from the elaborated special network +architecture to the classical architecture model via teacher-student +distillation learning. Since the trigger of our method is related to the +specific phone, our attack works effectively in the physical world. Experiment +results demonstrate the feasibility of our proposed approach and robustness +against various backdoor defenses. + +
+
+
+
+
+ + ☆ JSMNet Improving Indoor Point Cloud Semantic and Instance Segmentation + through Self-Attention and Multiscale + + +
+ The semantic understanding of indoor 3D point cloud data is crucial for a +range of subsequent applications, including indoor service robots, navigation +systems, and digital twin engineering. Global features are crucial for +achieving high-quality semantic and instance segmentation of indoor point +clouds, as they provide essential long-range context information. To this end, +we propose JSMNet, which combines a multi-layer network with a global feature +self-attention module to jointly segment three-dimensional point cloud +semantics and instances. To better express the characteristics of indoor +targets, we have designed a multi-resolution feature adaptive fusion module +that takes into account the differences in point cloud density caused by +varying scanner distances from the target. Additionally, we propose a framework +for joint semantic and instance segmentation by integrating semantic and +instance features to achieve superior results. We conduct experiments on S3DIS, +which is a large three-dimensional indoor point cloud dataset. Our proposed +method is compared against other methods, and the results show that it +outperforms existing methods in semantic and instance segmentation and provides +better results in target local area segmentation. Specifically, our proposed +method outperforms PointNet (Qi et al., 2017a) by 16.0% and 26.3% in terms of +semantic segmentation mIoU in S3DIS (Area 5) and instance segmentation mPre, +respectively. Additionally, it surpasses ASIS (Wang et al., 2019) by 6.0% and +4.6%, respectively, as well as JSPNet (Chen et al., 2022) by a margin of 3.3% +for semantic segmentation mIoU and a slight improvement of 0.3% for instance +segmentation mPre. + +
+
+
+
+
+ + ☆ Masked Diffusion with Task-awareness for Procedure Planning in + Instructional Videos + + +
+ A key challenge with procedure planning in instructional videos lies in how +to handle a large decision space consisting of a multitude of action types that +belong to various tasks. To understand real-world video content, an AI agent +must proficiently discern these action types (e.g., pour milk, pour water, open +lid, close lid, etc.) based on brief visual observation. Moreover, it must +adeptly capture the intricate semantic relation of the action types and task +goals, along with the variable action sequences. Recently, notable progress has +been made via the integration of diffusion models and visual representation +learning to address the challenge. However, existing models employ rudimentary +mechanisms to utilize task information to manage the decision space. To +overcome this limitation, we introduce a simple yet effective enhancement - a +masked diffusion model. The introduced mask acts akin to a task-oriented +attention filter, enabling the diffusion/denoising process to concentrate on a +subset of action types. Furthermore, to bolster the accuracy of task +classification, we harness more potent visual representation learning +techniques. In particular, we learn a joint visual-text embedding, where a text +embedding is generated by prompting a pre-trained vision-language model to +focus on human actions. We evaluate the method on three public datasets and +achieve state-of-the-art performance on multiple metrics. Code is available at +https://github.com/ffzzy840304/Masked-PDPP. + +
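One way to picture the "task-oriented attention filter" described above is as a mask that restricts the action-type distribution used during denoising to actions compatible with the predicted task. The snippet below is a hypothetical illustration of such a masking step, with an assumed binary task-to-action matrix; it is not the paper's implementation.

```python
import torch

def apply_task_mask(action_logits: torch.Tensor,
                    task_to_actions: torch.Tensor,
                    task_probs: torch.Tensor) -> torch.Tensor:
    """Restrict the action distribution to action types plausible under the
    predicted task (assumes every sample has at least one compatible action).

    action_logits:   (B, A) raw scores over all action types
    task_to_actions: (T, A) binary matrix, 1 if action a can occur in task t
    task_probs:      (B, T) predicted task distribution
    """
    relevance = task_probs @ task_to_actions.float()              # (B, A) soft relevance
    masked = action_logits.masked_fill(relevance <= 1e-6, float("-inf"))
    return torch.softmax(masked, dim=-1)
```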
+
+ comment: 7 pages (main text excluding references), 3 figures, 7 tables +
+
+
+
+
+ + ☆ Flexible Visual Recognition by Evidential Modeling of Confusion and + Ignorance ICCV23 + + +
+ In real-world scenarios, typical visual recognition systems could fail under +two major causes, i.e., the misclassification between known classes and the +excusable misbehavior on unknown-class images. To tackle these deficiencies, +flexible visual recognition should dynamically predict multiple classes when +they are unconfident between choices and reject making predictions when the +input is entirely out of the training distribution. Two challenges emerge along +with this novel task. First, prediction uncertainty should be separately +quantified as confusion depicting inter-class uncertainties and ignorance +identifying out-of-distribution samples. Second, both confusion and ignorance +should be comparable between samples to enable effective decision-making. In +this paper, we propose to model these two sources of uncertainty explicitly +with the theory of Subjective Logic. Regarding recognition as an +evidence-collecting process, confusion is then defined as conflicting evidence, +while ignorance is the absence of evidence. By predicting Dirichlet +concentration parameters for singletons, comprehensive subjective opinions, +including confusion and ignorance, could be achieved via further evidence +combinations. Through a series of experiments on synthetic data analysis, +visual recognition, and open-set detection, we demonstrate the effectiveness of +our methods in quantifying two sources of uncertainties and dealing with +flexible recognition. + +
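The evidential quantities mentioned above have a simple closed form in standard evidential deep learning: non-negative per-class evidence defines Dirichlet concentration parameters, belief masses, and a vacuity term that grows when total evidence is low (ignorance). The sketch below follows that standard formulation; the `confusion_score` at the end is only a simple stand-in for the paper's conflicting-evidence measure, not its definition.

```python
import torch

def subjective_opinion(evidence: torch.Tensor):
    """evidence: (B, K) non-negative evidence for K singleton classes."""
    alpha = evidence + 1.0                            # Dirichlet concentration parameters
    S = alpha.sum(dim=-1, keepdim=True)               # Dirichlet strength
    belief = evidence / S                             # per-class belief masses
    ignorance = evidence.size(-1) / S.squeeze(-1)     # vacuity: high when evidence is scarce
    return belief, ignorance

def confusion_score(belief: torch.Tensor) -> torch.Tensor:
    """Illustrative stand-in: how strongly the two best classes compete."""
    top2 = belief.topk(2, dim=-1).values
    return top2[:, 1] / (top2[:, 0] + 1e-8)
```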
+
+ comment: Accepted by ICCV23 +
+
+
+
+
+ + ☆ HIGT: Hierarchical Interaction Graph-Transformer for Whole Slide Image + Analysis MICCAI2023 + + +
+ In computational pathology, the pyramid structure of gigapixel Whole Slide Images (WSIs) has recently been studied for capturing various information, from individual cell interactions to tissue microenvironments. This hierarchical structure is believed to be beneficial for cancer diagnosis and prognosis tasks. However, most previous hierarchical WSI analysis works (1) only characterize local or global correlations within the WSI pyramids and (2) use only unidirectional interaction between different resolutions, leading to an incomplete picture of WSI pyramids. To this end, this paper presents a novel Hierarchical Interaction Graph-Transformer (i.e., HIGT) for WSI analysis. With Graph Neural Networks and Transformers as the building blocks, HIGT can learn both short-range local information and long-range global representations of the WSI pyramids. Considering that the information from different resolutions is complementary and can benefit each other during the learning process, we further design a novel Bidirectional Interaction block to establish communication between different levels within the WSI pyramids. Finally, we aggregate both coarse-grained and fine-grained features learned from different levels together for slide-level prediction. We evaluate our methods on two public WSI datasets from TCGA projects, i.e., kidney carcinoma (KICA) and esophageal carcinoma (ESCA). Experimental results show that our HIGT outperforms both hierarchical and non-hierarchical state-of-the-art methods on both tumor subtyping and staging tasks.
+
+ comment: Accepted by MICCAI2023; Code is available in + https://github.com/HKU-MedAI/HIGT +
+
+
+
+
+ + ☆ Semantic Adversarial Attacks via Diffusion Models BMVC 2023 + + +
+ Traditional adversarial attacks concentrate on manipulating clean examples in +the pixel space by adding adversarial perturbations. By contrast, semantic +adversarial attacks focus on changing semantic attributes of clean examples, +such as color, context, and features, which are more feasible in the real +world. In this paper, we propose a framework to quickly generate a semantic +adversarial attack by leveraging recent diffusion models since semantic +information is included in the latent space of well-trained diffusion models. +Then there are two variants of this framework: 1) the Semantic Transformation +(ST) approach fine-tunes the latent space of the generated image and/or the +diffusion model itself; 2) the Latent Masking (LM) approach masks the latent +space with another target image and local backpropagation-based interpretation +methods. Additionally, the ST approach can be applied in either white-box or +black-box settings. Extensive experiments are conducted on CelebA-HQ and AFHQ +datasets, and our framework demonstrates great fidelity, generalizability, and +transferability compared to other baselines. Our approaches achieve +approximately 100% attack success rate in multiple settings with the best FID +as 36.61. Code is available at +https://github.com/steven202/semantic_adv_via_dm. + +
+
+ comment: To appear in BMVC 2023 +
+
+
+
+
+ + ☆ Nucleus-aware Self-supervised Pretraining Using Unpaired Image-to-image + Translation for Histopathology Images + + +
+ Self-supervised pretraining attempts to enhance model performance by +obtaining effective features from unlabeled data, and has demonstrated its +effectiveness in the field of histopathology images. Despite its success, few +works concentrate on the extraction of nucleus-level information, which is +essential for pathologic analysis. In this work, we propose a novel +nucleus-aware self-supervised pretraining framework for histopathology images. +The framework aims to capture the nuclear morphology and distribution +information through unpaired image-to-image translation between histopathology +images and pseudo mask images. The generation process is modulated by both +conditional and stochastic style representations, ensuring the reality and +diversity of the generated histopathology images for pretraining. Further, an +instance segmentation guided strategy is employed to capture instance-level +information. The experiments on 7 datasets show that the proposed pretraining +method outperforms supervised ones on Kather classification, multiple instance +learning, and 5 dense-prediction tasks with the transfer learning protocol, and +yields superior results than other self-supervised approaches on 8 +semi-supervised tasks. Our project is publicly available at +https://github.com/zhiyuns/UNITPathSSL. + +
+
+
+
+
+ + ☆ Unleashing the Power of Depth and Pose Estimation Neural Networks by + Designing Compatible Endoscopic Images + + +
+ Deep learning models have witnessed depth and pose estimation frameworks on unannotated datasets as an effective pathway to succeed in endoscopic navigation. Most current techniques are dedicated to developing more advanced neural networks to improve accuracy. However, existing methods ignore the special properties of endoscopic images, resulting in an inability to fully unleash the power of neural networks. In this study, we conduct a detailed analysis of the properties of endoscopic images and improve the compatibility of images and neural networks, to unleash the power of current neural networks. First, we introduce the Masked Image Modelling (MIM) module, which inputs partial image information instead of complete image information, allowing the network to recover global information from partial pixel information. This enhances the network's ability to perceive global information and alleviates the local overfitting of convolutional neural networks caused by local artifacts. Second, we propose a lightweight neural network to enhance the endoscopic images, explicitly improving the compatibility between images and neural networks. Extensive experiments are conducted on three public datasets and one in-house dataset, and the proposed modules improve the baselines by a large margin. Furthermore, the enhanced images we propose, which have higher network compatibility, can serve as an effective data augmentation method; they are able to extract more stable feature points in traditional feature point matching tasks and achieve outstanding performance.
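A minimal sketch of what a masked-image-modelling style input could look like: a random subset of non-overlapping patches is zeroed out so the network must recover global structure from partial pixel information. The patch size and mask ratio below are illustrative assumptions, not the paper's settings.

```python
import torch

def random_patch_mask(images: torch.Tensor, patch: int = 16, mask_ratio: float = 0.5):
    """Zero out a random subset of non-overlapping patches.
    images: (B, C, H, W) with H and W divisible by `patch`."""
    B, C, H, W = images.shape
    gh, gw = H // patch, W // patch
    keep = torch.rand(B, gh * gw, device=images.device) >= mask_ratio   # True = keep patch
    mask = keep.view(B, 1, gh, gw).float()
    mask = mask.repeat_interleave(patch, dim=2).repeat_interleave(patch, dim=3)
    return images * mask, mask

# masked, mask = random_patch_mask(torch.randn(2, 3, 224, 224), patch=16, mask_ratio=0.5)
```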
+
+
+
+
+ + ☆ VDialogUE: A Unified Evaluation Benchmark for Visually-grounded Dialogue + + +
+ Visually-grounded dialog systems, which integrate multiple modes of +communication such as text and visual inputs, have become an increasingly +popular area of investigation. However, the absence of a standardized +evaluation framework poses a challenge in assessing the development of this +field. To this end, we propose \textbf{VDialogUE}, a \textbf{V}isually-grounded +\textbf{Dialog}ue benchmark for \textbf{U}nified \textbf{E}valuation. It +defines five core multi-modal dialogue tasks and covers six datasets. +Furthermore, in order to provide a comprehensive assessment of the model's +performance across all tasks, we developed a novel evaluation metric called +VDscore, which is based on the Analytic Hierarchy Process~(AHP) method. +Additionally, we present a straightforward yet efficient baseline model, named +\textbf{VISIT}~(\textbf{VIS}ually-grounded d\textbf{I}alog +\textbf{T}ransformer), to promote the advancement of general multi-modal +dialogue systems. It progressively builds its multi-modal foundation and +dialogue capability via a two-stage pre-training strategy. + We believe that the VDialogUE benchmark, along with the evaluation scripts +and our baseline models, will accelerate the development of visually-grounded +dialog systems and lead to the development of more sophisticated and effective +pre-trained models. + +
+
+
+
+
+ + ☆ Judging a video by its bitstream cover + + +
+ Classifying videos into distinct categories, such as Sport and Music Video, is crucial for multimedia understanding and retrieval, especially in an age where an immense volume of video content is constantly being generated. Traditional methods require video decompression to extract pixel-level features like color, texture, and motion, thereby increasing computational and storage demands. Moreover, these methods often suffer from performance degradation in low-quality videos. We present a novel approach that examines only the post-compression bitstream of a video to perform classification, eliminating the need for decompression. We validate our approach using a custom-built dataset comprising over 29,000 YouTube video clips, totaling 6,000 hours and spanning 11 distinct categories. Our preliminary evaluations indicate precision, accuracy, and recall rates well over 80%. The algorithm operates approximately 15,000 times faster than real time for 30fps videos, outperforming the traditional Dynamic Time Warping (DTW) algorithm by six orders of magnitude.
+
+
+
+
+ + ♻ ☆ Contrastive Tuning: A Little Help to Make Masked Autoencoders Forget + + +
+ Masked Image Modeling (MIM) methods, like Masked Autoencoders (MAE), +efficiently learn a rich representation of the input. However, for adapting to +downstream tasks, they require a sufficient amount of labeled data since their +rich features code not only objects but also less relevant image background. In +contrast, Instance Discrimination (ID) methods focus on objects. In this work, +we study how to combine the efficiency and scalability of MIM with the ability +of ID to perform downstream classification in the absence of large amounts of +labeled data. To this end, we introduce Masked Autoencoder Contrastive Tuning +(MAE-CT), a sequential approach that utilizes the implicit clustering of the +Nearest Neighbor Contrastive Learning (NNCLR) objective to induce abstraction +in the topmost layers of a pre-trained MAE. MAE-CT tunes the rich features such +that they form semantic clusters of objects without using any labels. Notably, +MAE-CT does not rely on hand-crafted augmentations and frequently achieves its +best performances while using only minimal augmentations (crop & flip). +Further, MAE-CT is compute efficient as it requires at most 10% overhead +compared to MAE re-training. Applied to large and huge Vision Transformer (ViT) +models, MAE-CT excels over previous self-supervised methods trained on ImageNet +in linear probing, k-NN and low-shot classification accuracy as well as in +unsupervised clustering accuracy. With ViT-H/16 MAE-CT achieves a new +state-of-the-art in linear probing of 82.2%. + +
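For readers unfamiliar with the NNCLR objective used for the contrastive tuning stage, here is a compact sketch of its core idea: the positive for each embedding is its nearest neighbour in a support queue, contrasted against the embeddings of the other augmented view. This follows the published NNCLR formulation in spirit; queue maintenance, stop-gradients, and loss symmetrization are omitted for brevity.

```python
import torch
import torch.nn.functional as F

def nnclr_loss(z1: torch.Tensor, z2: torch.Tensor, queue: torch.Tensor,
               temperature: float = 0.1) -> torch.Tensor:
    """NNCLR-style InfoNCE with nearest-neighbour positives.

    z1, z2: (B, D) embeddings of two augmented views of the same images
    queue:  (Q, D) support set of past embeddings
    """
    z1, z2, queue = (F.normalize(t, dim=-1) for t in (z1, z2, queue))
    nn_idx = (z1 @ queue.t()).argmax(dim=-1)        # nearest neighbour of each anchor
    nn_pos = queue[nn_idx]                          # (B, D) swapped-in positives
    logits = nn_pos @ z2.t() / temperature          # similarity to the other view's batch
    targets = torch.arange(z1.size(0), device=z1.device)
    return F.cross_entropy(logits, targets)
```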
+
+
+
+
+ + ♻ ☆ Qwen-VL: A Versatile Vision-Language Model for Understanding, + Localization, Text Reading, and Beyond + + +
+ We introduce the Qwen-VL series, a set of large-scale vision-language models (LVLMs) designed to perceive and understand both text and images. Comprising Qwen-VL and Qwen-VL-Chat, these models exhibit remarkable performance in tasks like image captioning, question answering, visual localization, and flexible interaction. The evaluation covers a wide range of tasks, including zero-shot captioning, visual or document visual question answering, and grounding. We demonstrate that Qwen-VL outperforms existing LVLMs. We present their architecture, training, capabilities, and performance, highlighting their contributions to advancing multimodal artificial intelligence. Code, demo and models are available at https://github.com/QwenLM/Qwen-VL.
+
+ comment: Code, demo and models are available at + https://github.com/QwenLM/Qwen-VL +
+
+
+
+
+ + ♻ ☆ Meta-Learning Regrasping Strategies for Physical-Agnostic Objects ICRA 2022 + + +
+ Grasping inhomogeneous objects in real-world applications remains a +challenging task due to the unknown physical properties such as mass +distribution and coefficient of friction. In this study, we propose a +meta-learning algorithm called ConDex, which incorporates Conditional Neural +Processes (CNP) with DexNet-2.0 to autonomously discern the underlying physical +properties of objects using depth images. ConDex efficiently acquires physical +embeddings from limited trials, enabling precise grasping point estimation. +Furthermore, ConDex is capable of updating the predicted grasping quality +iteratively from new trials in an online fashion. To the best of our knowledge, +we are the first who generate two object datasets focusing on inhomogeneous +physical properties with varying mass distributions and friction coefficients. +Extensive evaluations in simulation demonstrate ConDex's superior performance +over DexNet-2.0 and existing meta-learning-based grasping pipelines. +Furthermore, ConDex shows robust generalization to previously unseen real-world +objects despite training solely in the simulation. The synthetic and real-world +datasets will be published as well. + +
+
+ comment: Accepted as spotlight in ICRA 2022 Workshop: Scaling Robot Learning +
+
+
+
+
+ + ♻ ☆ Domain Generalization for Crop Segmentation with Knowledge Distillation + + +
+ In recent years, precision agriculture has gradually oriented farming closer +to automation processes to support all the activities related to field +management. Service robotics plays a predominant role in this evolution by +deploying autonomous agents that can navigate fields while performing tasks +without human intervention, such as monitoring, spraying, and harvesting. To +execute these precise actions, mobile robots need a real-time perception system +that understands their surroundings and identifies their targets in the wild. +Generalizing to new crops and environmental conditions is critical for +practical applications, as labeled samples are rarely available. In this paper, +we investigate the problem of crop segmentation and propose a novel approach to +enhance domain generalization using knowledge distillation. In the proposed +framework, we transfer knowledge from an ensemble of models individually +trained on source domains to a student model that can adapt to unseen target +domains. To evaluate the proposed method, we present a synthetic multi-domain +dataset for crop segmentation containing plants of variegate shapes and +covering different terrain styles, weather conditions, and light scenarios for +more than 50,000 samples. We demonstrate significant improvements in +performance over state-of-the-art methods and superior sim-to-real +generalization. Our approach provides a promising solution for domain +generalization in crop segmentation and has the potential to enhance a wide +variety of precision agriculture applications. + +
+
+
+
+
+ + ♻ ☆ Self-supervised Learning to Bring Dual Reversed Rolling Shutter Images + Alive ICCV 2023 + + +
+ Modern consumer cameras usually employ the rolling shutter (RS) mechanism, where images are captured by scanning scenes row-by-row, yielding RS distortions for dynamic scenes. To correct RS distortions, existing methods adopt a fully supervised learning manner, where high framerate global shutter (GS) images should be collected as ground-truth supervision. In this paper, we propose a Self-supervised learning framework for Dual reversed RS distortions Correction (SelfDRSC), where a DRSC network can be learned to generate a high framerate GS video only based on dual RS images with reversed distortions. In particular, a bidirectional distortion warping module is proposed for reconstructing dual reversed RS images, and then a self-supervised loss can be deployed to train the DRSC network by enhancing the cycle consistency between input and reconstructed dual reversed RS images. Besides the start and end RS scanning times, GS images at arbitrary intermediate scanning times can also be supervised in SelfDRSC, thus enabling the learned DRSC network to generate a high framerate GS video. Moreover, a simple yet effective self-distillation strategy is introduced in the self-supervised loss for mitigating boundary artifacts in generated GS images. On a synthetic dataset, SelfDRSC achieves better or comparable quantitative metrics in comparison to state-of-the-art methods trained in a fully supervised manner. On real-world RS cases, our SelfDRSC can produce high framerate GS videos with finer correction textures and better temporal consistency. The source code and trained models are made publicly available at https://github.com/shangwei5/SelfDRSC. We also provide an implementation in HUAWEI Mindspore at https://github.com/Hunter-Will/SelfDRSC-mindspore.
+
+ comment: Accepted by ICCV 2023, available at + https://github.com/shangwei5/SelfDRSC +
+
+
+
+
+ + ♻ ☆ Exploiting Point-Wise Attention in 6D Object Pose Estimation Based on + Bidirectional Prediction + + +
+ Traditional geometric registration based estimation methods only exploit the CAD model implicitly, which makes them dependent on observation quality and susceptible to occlusion. To address this problem, this paper proposes a bidirectional correspondence prediction network with a point-wise attention-aware mechanism. This network not only requires the model points to predict the correspondences but also explicitly models the geometric similarities between observations and the model prior. Our key insight is that the correlations between each model point and scene point provide essential information for learning point-pair matches. To further tackle the correlation noise caused by feature distribution divergence, we design a simple but effective pseudo-siamese network to improve feature homogeneity. Experimental results on the public LineMOD, YCB-Video, and Occ-LineMOD datasets show that the proposed method achieves better performance than other state-of-the-art methods under the same evaluation criteria. Its robustness in estimating poses is greatly improved, especially in environments with severe occlusion.
+
+
+
+
+ + ♻ ☆ Position-Enhanced Visual Instruction Tuning for Multimodal Large + Language Models + + +
+ Recently, Multimodal Large Language Models (MLLMs) that enable Large Language +Models (LLMs) to interpret images through visual instruction tuning have +achieved significant success. However, existing visual instruction tuning +methods only utilize image-language instruction data to align the language and +image modalities, lacking a more fine-grained cross-modal alignment. In this +paper, we propose Position-enhanced Visual Instruction Tuning (PVIT), which +extends the functionality of MLLMs by integrating an additional region-level +vision encoder. This integration promotes a more detailed comprehension of +images for the MLLM. In addition, to efficiently achieve a fine-grained +alignment between the vision modules and the LLM, we design multiple data +generation strategies to construct an image-region-language instruction +dataset. Finally, we present both quantitative experiments and qualitative +analysis that demonstrate the superiority of the proposed model. Code and data +will be released at https://github.com/PVIT-official/PVIT. + +
+
+
+
+
+ + ♻ ☆ Machine Learning and Computer Vision Techniques in Continuous Beehive + Monitoring Applications: A survey + + +
+ The wide availability of machine learning and computer vision +techniques allows the development of relatively complex monitoring systems in many +domains. Besides the traditional industrial domain, new applications also appear +in biology and agriculture, including the detection of +infections, parasites and weeds, as well as automated monitoring and early +warning systems. This is also connected with the introduction of easily +accessible hardware and development kits such as the Arduino or Raspberry Pi +families. In this paper, we survey 50 existing papers on +automated beehive monitoring methods that use computer vision techniques, +particularly pollen and Varroa mite detection together with bee +traffic monitoring. Such systems could also be used to monitor +honeybee colonies and inspect their health state, which could +identify potentially dangerous states before the situation becomes critical, or to +better plan periodic bee colony inspections and therefore save significant +costs. We also analyze the research trends in this +application field and outline possible directions for new +explorations. Our paper is also aimed at veterinary and apidology professionals +and experts who might not be familiar with machine learning; to introduce them +to its possibilities, each family of applications is opened by a +brief theoretical introduction and motivation for its base method. We +hope that this paper will inspire other scientists to apply machine learning +techniques to other applications in beehive monitoring. +
+
+
+
+
+ + ♻ ☆ Mitigating Hallucination in Large Multi-Modal Models via Robust + Instruction Tuning + + +
+ Despite the promising progress in multi-modal tasks, current large +multi-modal models (LMM) are prone to hallucinating inconsistent descriptions +with respect to the associated image and human instructions. This paper +addresses this issue by introducing the first large and diverse visual +instruction tuning dataset, named Large-scale Robust Visual (LRV)-Instruction. +Our dataset consists of 120k visual instructions generated by GPT4, covering 16 +vision-and-language tasks with open-ended instructions and answers. Unlike +existing studies that primarily focus on positive instruction samples, we +design LRV-Instruction to include both positive and negative instructions for +more robust visual instruction tuning. Our negative instructions are designed +at two semantic levels: (i) Nonexistent Element Manipulation and (ii) Existent +Element Manipulation. To efficiently measure the hallucination generated by +LMMs, we propose GPT4-Assisted Visual Instruction Evaluation (GAVIE), a novel +approach to evaluate visual instruction tuning without the need for +human-annotated groundtruth answers and can adapt to diverse instruction +formats. We conduct comprehensive experiments to investigate the hallucination +of LMMs. Our results demonstrate that existing LMMs exhibit significant +hallucination when presented with our negative instructions, particularly with +Existent Element Manipulation instructions. Moreover, by finetuning MiniGPT4 on +LRV-Instruction, we successfully mitigate hallucination while improving +performance on public datasets using less training data compared to +state-of-the-art methods. Additionally, we observed that a balanced ratio of +positive and negative instances in the training data leads to a more robust +model. Updates of our project are available at +https://fuxiaoliu.github.io/LRV/. + +
+
+ comment: 35 pages, 27 figures. Under Review +
+
+
+
+
+ + ♻ ☆ A Localization-to-Segmentation Framework for Automatic Tumor + Segmentation in Whole-Body PET/CT Images + + +
+ Fluorodeoxyglucose (FDG) positron emission tomography (PET) combined with +computed tomography (CT) is considered the primary solution for detecting some +cancers, such as lung cancer and melanoma. Automatic segmentation of tumors in +PET/CT images can help reduce doctors' workload, thereby improving diagnostic +quality. However, precise tumor segmentation is challenging due to the small +size of many tumors and the similarity of high-uptake normal areas to the tumor +regions. To address these issues, this paper proposes a +localization-to-segmentation framework (L2SNet) for precise tumor segmentation. +L2SNet first localizes the possible lesions in the lesion localization phase +and then uses the location cues to shape the segmentation results in the lesion +segmentation phase. To further improve the segmentation performance of L2SNet, +we design an adaptive threshold scheme that takes the segmentation results of +the two phases into consideration. The experiments with the MICCAI 2023 +Automated Lesion Segmentation in Whole-Body FDG-PET/CT challenge dataset show +that our method achieved a competitive result and was ranked in the top 7 +methods on the preliminary test set. Our work is available at: +https://github.com/MedCAI/L2SNet. + +
+
+ comment: 7 pages, 3 figures
</p>
+
+
+
+
+ + ♻ ☆ GBE-MLZSL: A Group Bi-Enhancement Framework for Multi-Label Zero-Shot + Learning + + +
+ This paper investigates a challenging problem of zero-shot learning in the +multi-label scenario (MLZSL), wherein the model is trained to recognize +multiple unseen classes within a sample (e.g., an image) based on seen classes +and auxiliary knowledge, e.g., semantic information. Existing methods usually +resort to analyzing the relationship of various seen classes residing in a +sample from the dimension of spatial or semantic characteristics, and transfer +the learned model to unseen ones. However, they ignore the effective integration of +local and global features. That is, in the process of inferring unseen classes, +global features represent the principal direction of the image in the feature +space, while local features should maintain uniqueness within a certain range. +Neglecting this integration makes the model lose its grasp of the main +components of the image. Relying only on the local existence of seen classes +during the inference stage introduces unavoidable bias. In this paper, we +propose a novel and effective group bi-enhancement framework for MLZSL, dubbed +GBE-MLZSL, to fully make use of these properties and enable a more accurate and +robust visual-semantic projection. Specifically, we split the feature maps into +several feature groups, each of which can be trained +independently with the Local Information Distinguishing Module (LID) to ensure +uniqueness. Meanwhile, a Global Enhancement Module (GEM) is designed to +preserve the principal direction. Besides, a static graph structure is designed +to construct the correlation of local features. Experiments on the large-scale +MLZSL benchmark datasets NUS-WIDE and Open-Images-v4 demonstrate that the +proposed GBE-MLZSL outperforms other state-of-the-art methods by large +margins. +
+
+ comment: 11 pages, 8 figures +
+
+
+
+
+ + ♻ ☆ Towards Language-guided Visual Recognition via Dynamic Convolutions + + +
+ In this paper, we aim to establish a unified, end-to-end +multi-modal network by exploring language-guided visual recognition. To +approach this target, we first propose a novel multi-modal convolution module +called Language-dependent Convolution (LaConv). Its convolution kernels are +dynamically generated based on natural language information, which can help +extract differentiated visual features for different multi-modal examples. +Based on the LaConv module, we further build the first fully language-driven +convolution network, termed LaConvNet, which can unify visual +recognition and multi-modal reasoning in one forward structure. To validate +LaConv and LaConvNet, we conduct extensive experiments on four benchmark +datasets of two vision-and-language tasks, i.e., visual question answering +(VQA) and referring expression comprehension (REC). The experimental results +not only show the performance gains of LaConv compared to existing +multi-modal modules, but also demonstrate the merits of LaConvNet as a unified +network, including its compact size, high generalization ability and excellent +performance, e.g., +4.7% on RefCOCO+. +
+
+
+
+
+ + ♻ ☆ A Survivor in the Era of Large-Scale Pretraining: An Empirical Study of + One-Stage Referring Expression Comprehension + + +
+ Most of the existing work in one-stage referring expression comprehension +(REC) mainly focuses on multi-modal fusion and reasoning, while the influence +of other factors in this task lacks in-depth exploration. To fill this gap, we +conduct an empirical study in this paper. Concretely, we first build a very +simple REC network called SimREC, and ablate 42 candidate designs/settings, +which covers the entire process of one-stage REC from network design to model +training. Afterwards, we conduct over 100 experimental trials on three +benchmark datasets of REC. The extensive experimental results not only show the +key factors that affect REC performance in addition to multi-modal fusion, +e.g., multi-scale features and data augmentation, but also yield some findings +that run counter to conventional understanding. For example, as a vision and +language (V&L) task, REC is less impacted by language priors. In addition, +with a proper combination of these findings, we can improve the performance of +SimREC by a large margin, e.g., +27.12% on RefCOCO+, which outperforms all +existing REC methods. But the most encouraging finding is that with much less +training overhead and fewer parameters, SimREC can still achieve better performance +than a set of large-scale pre-trained models, e.g., UNITER and VILLA, +portraying the special role of REC in existing V&L research. +
+
+
+
+
+ + ♻ ☆ Far Away in the Deep Space: Dense Nearest-Neighbor-Based + Out-of-Distribution Detection ICCV + 2023 + + +
+ The key to out-of-distribution detection is density estimation of the +in-distribution data or of its feature representations. This is particularly +challenging for dense anomaly detection in domains where the in-distribution +data has a complex underlying structure. Nearest-Neighbors approaches have been +shown to work well in object-centric data domains, such as industrial +inspection and image classification. In this paper, we show that +nearest-neighbor approaches also yield state-of-the-art results on dense +novelty detection in complex driving scenes when working with an appropriate +feature representation. In particular, we find that transformer-based +architectures produce representations that yield much better similarity metrics +for the task. We identify the multi-head structure of these models as one of +the reasons, and demonstrate a way to transfer some of the improvements to +CNNs. Ultimately, the approach is simple and non-invasive, i.e., it does not +affect the primary segmentation performance, refrains from training on examples +of anomalies, and achieves state-of-the-art results on RoadAnomaly, +StreetHazards, and SegmentMeIfYouCan-Anomaly. + +
+
+ comment: Workshop on Uncertainty Quantification for Computer Vision, ICCV + 2023. Code at: https://github.com/silviogalesso/dense-ood-knns +
+
+
+
+
+ + ♻ ☆ Few Shot Medical Image Segmentation with Cross Attention Transformer MICCAI 2023 + + +
+ Medical image segmentation has made significant progress in recent years. +Deep learning-based methods are recognized as data-hungry techniques, requiring +large amounts of data with manual annotations. However, manual annotation is +expensive in the field of medical image analysis, which requires +domain-specific expertise. To address this challenge, few-shot learning has the +potential to learn new classes from only a few examples. In this work, we +propose a novel framework for few-shot medical image segmentation, termed +CAT-Net, based on cross masked attention Transformer. Our proposed network +mines the correlations between the support image and query image, limiting them +to focus only on useful foreground information and boosting the representation +capacity of both the support prototype and query features. We further design an +iterative refinement framework that refines the query image segmentation +iteratively and promotes the support feature in turn. We validated the proposed +method on three public datasets: Abd-CT, Abd-MRI, and Card-MRI. Experimental +results demonstrate the superior performance of our method compared to +state-of-the-art methods and the effectiveness of each component. Code: +https://github.com/hust-linyi/CAT-Net. + +
+
+ comment: Accepted by MICCAI 2023 +
+
+
+
+
+ + ♻ ☆ Interpretable Weighted Siamese Network to Predict the Time to Onset of + Alzheimer's Disease from MRI Images + + +
+ Alzheimer's Disease (AD) is a progressive disease preceded by Mild Cognitive +Impairment (MCI). Early detection of AD is crucial for making treatment +decisions. However, most of the literature on computer-assisted detection of AD +focuses on classifying brain images into one of three major categories: +healthy, MCI, and AD; or categorizing MCI patients into (1) progressive: those +who progress from MCI to AD at a future examination time, and (2) stable: those +who stay as MCI and never progress to AD. This misses the opportunity to +accurately identify the trajectory of progressive MCI patients. In this paper, +we revisit the brain image classification task for AD identification and +re-frame it as an ordinal classification task to predict how close a patient is +to the severe AD stage. To this end, we select progressive MCI patients from +the Alzheimer's Disease Neuroimaging Initiative (ADNI) dataset and construct an +ordinal dataset with a prediction target that indicates the time to progression +to AD. We train a Siamese network model to predict the time to onset of AD +based on MRI brain images. We also propose a weighted variant of the Siamese +network and compare its performance to a baseline model. Our evaluations show +that incorporating a weighting factor into Siamese networks brings considerable +performance gains at predicting how close input brain MRI images are to +progressing to AD. Moreover, we complement our results with an interpretation +of the learned embedding space of the Siamese networks using a model +explainability technique. +
+
+ comment: Accepted at the Specialist Group on Artificial Intelligence, SGAI + 2023, conference +
+
+
+
+
+ + ♻ ☆ Treatment-aware Diffusion Probabilistic Model for Longitudinal MRI + Generation and Diffuse Glioma Growth Prediction + + +
+ Diffuse gliomas are malignant brain tumors that grow widespread through the +brain. The complex interactions between neoplastic cells and normal tissue, as +well as the treatment-induced changes often encountered, make glioma tumor +growth modeling challenging. In this paper, we present a novel end-to-end +network capable of generating future tumor masks and realistic MRIs of how the +tumor will look at any future time points for different treatment plans. Our +approach is based on cutting-edge diffusion probabilistic models and +deep-segmentation neural networks. We included sequential multi-parametric +magnetic resonance images (MRI) and treatment information as conditioning +inputs to guide the generative diffusion process. This allows for tumor growth +estimates at any given time point. We trained the model using real-world +postoperative longitudinal MRI data with glioma tumor growth trajectories +represented as tumor segmentation maps over time. The model has demonstrated +promising performance across a range of tasks, including the generation of +high-quality synthetic MRIs with tumor masks, time-series tumor segmentations, +and uncertainty estimates. Combined with the treatment-aware generated MRIs, +the tumor growth predictions with uncertainty estimates can provide useful +information for clinical decision-making. + +
+
+ comment: 13 pages, 10 figures, 2 tables, 2 agls, preprints in the IEEE trans. + format for submission to IEEE-TMI +
+
+
+
+
+ + ♻ ☆ CCSPNet-Joint: Efficient Joint Training Method for Traffic Sign + Detection Under Extreme Conditions + + +
+ Traffic sign detection is an important research direction in intelligent +driving. Unfortunately, existing methods often overlook extreme conditions such +as fog, rain, and motion blur. Moreover, the end-to-end training strategy for +image denoising and object detection models fails to utilize inter-model +information effectively. To address these issues, we propose CCSPNet, an +efficient feature extraction module based on Transformers and CNNs, which +effectively leverages contextual information, achieves faster inference speed +and provides stronger feature enhancement capabilities. Furthermore, we +establish the correlation between object detection and image denoising tasks +and propose a joint training model, CCSPNet-Joint, to improve data efficiency +and generalization. Finally, to validate our approach, we create the CCTSDB-AUG +dataset for traffic sign detection in extreme scenarios. Extensive experiments +have shown that CCSPNet achieves state-of-the-art performance in traffic sign +detection under extreme conditions. Compared to end-to-end methods, +CCSPNet-Joint achieves a 5.32% improvement in precision and an 18.09% +improvement in mAP@.5. + +
+
+
+
+
+ + ♻ ☆ AvatarFusion: Zero-shot Generation of Clothing-Decoupled 3D Avatars + Using 2D Diffusion + + +
+ Large-scale pre-trained vision-language models allow for the zero-shot +text-based generation of 3D avatars. The previous state-of-the-art method +utilized CLIP to supervise neural implicit models that reconstructed a human +body mesh. However, this approach has two limitations. Firstly, the lack of +avatar-specific models can cause facial distortion and unrealistic clothing in +the generated avatars. Secondly, CLIP only provides optimization direction for +the overall appearance, resulting in less impressive results. To address these +limitations, we propose AvatarFusion, the first framework to use a latent +diffusion model to provide pixel-level guidance for generating human-realistic +avatars while simultaneously segmenting clothing from the avatar's body. +AvatarFusion includes the first clothing-decoupled neural implicit avatar model +that employs a novel Dual Volume Rendering strategy to render the decoupled +skin and clothing sub-models in one space. We also introduce a novel +optimization method, called Pixel-Semantics Difference-Sampling (PS-DS), which +semantically separates the generation of body and clothes, and generates a +variety of clothing styles. Moreover, we establish the first benchmark for +zero-shot text-to-avatar generation. Our experimental results demonstrate that +our framework outperforms previous approaches, with significant improvements +observed in all metrics. Additionally, since our model is clothing-decoupled, +we can exchange the clothes of avatars. Code is available on our project page +https://hansenhuang0823.github.io/AvatarFusion. +
+
+ comment: Accepted by ACM Multimedia 2023 +
+
+
+
+
+ + ♻ ☆ NeFII: Inverse Rendering for Reflectance Decomposition with Near-Field + Indirect Illumination CVPR 2023 + + +
+ Inverse rendering methods aim to estimate geometry, materials and +illumination from multi-view RGB images. In order to achieve better +decomposition, recent approaches attempt to model indirect illuminations +reflected from different materials via Spherical Gaussians (SG), which, +however, tends to blur the high-frequency reflection details. In this paper, we +propose an end-to-end inverse rendering pipeline that decomposes materials and +illumination from multi-view images, while considering near-field indirect +illumination. In a nutshell, we introduce the Monte Carlo sampling based path +tracing and cache the indirect illumination as neural radiance, enabling a +physics-faithful and easy-to-optimize inverse rendering method. To enhance +efficiency and practicality, we leverage SG to represent the smooth environment +illuminations and apply importance sampling techniques. To supervise indirect +illuminations from unobserved directions, we develop a novel radiance +consistency constraint between implicit neural radiance and path tracing +results of unobserved rays along with the joint optimization of materials and +illuminations, thus significantly improving the decomposition performance. +Extensive experiments demonstrate that our method outperforms the +state-of-the-art on multiple synthetic and real datasets, especially in terms +of inter-reflection decomposition. Our code and data are available at +https://woolseyyy.github.io/nefii/. +
+
+ comment: Accepted in CVPR 2023 +
+
+
+
+
+ + ♻ ☆ RigNet++: Efficient Repetitive Image Guided Network for Depth Completion + + +
+ Depth completion aims to recover dense depth maps from sparse ones, where +color images are often used to facilitate this task. Recent depth methods +primarily focus on image guided learning frameworks. However, blurry guidance +in the image and unclear structure in the depth still impede their performance. +To tackle these challenges, we explore an efficient repetitive design in our +image guided network to gradually and sufficiently recover depth values. +Specifically, the efficient repetition is embodied in both the image guidance +branch and depth generation branch. In the former branch, we design a dense +repetitive hourglass network to extract discriminative image features of +complex environments, which can provide powerful contextual instruction for +depth prediction. In the latter branch, we introduce a repetitive guidance +module based on dynamic convolution, in which an efficient convolution +factorization is proposed to reduce the complexity while modeling +high-frequency structures progressively. Extensive experiments indicate that +our approach achieves superior or competitive results on KITTI, VKITTI, NYUv2, +3D60, and Matterport3D datasets. + +
+
+ comment: 15 pages. arXiv admin note: text overlap with arXiv:2107.13802 +
+
+
+
+
+ + ♻ ☆ Robust Saliency-Aware Distillation for Few-shot Fine-grained Visual + Recognition + + +
+ Recognizing novel sub-categories with scarce samples is an essential and +challenging research topic in computer vision. Existing literature addresses +this challenge by employing local-based representation approaches, which may +not sufficiently facilitate meaningful object-specific semantic understanding, +leading to a reliance on apparent background correlations. Moreover, they +primarily rely on high-dimensional local descriptors to construct complex +embedding space, potentially limiting the generalization. To address the above +challenges, this article proposes a novel model called RSaG for few-shot +fine-grained visual recognition. RSaG introduces additional saliency-aware +supervision via saliency detection to guide the model toward focusing on the +intrinsic discriminative regions. Specifically, RSaG utilizes the saliency +detection model to emphasize the critical regions of each sub-category, +providing additional object-specific information for fine-grained prediction. +RSaG transfers such information with two symmetric branches in a mutual +learning paradigm. Furthermore, RSaG exploits inter-regional relationships to +enhance the informativeness of the representation and subsequently summarize +the highlighted details into contextual embeddings to facilitate the effective +transfer, enabling quick generalization to novel sub-categories. The proposed +approach is empirically evaluated on three widely used benchmarks, +demonstrating its superior performance. + +
+
+ comment: Under Review +
+
+
+
+
+ + ♻ ☆ VEATIC: Video-based Emotion and Affect Tracking in Context Dataset + + +
+ Human affect recognition has been a significant topic in psychophysics and +computer vision. However, the currently published datasets have many +limitations. For example, most datasets consist of frames that contain only +information about facial expressions. Due to the limitations of previous +datasets, it is very hard to either understand the mechanisms for affect +recognition of humans or generalize well on common cases for computer vision +models trained on those datasets. In this work, we introduce a brand new large +dataset, the Video-based Emotion and Affect Tracking in Context Dataset +(VEATIC), that overcomes the limitations of the previous datasets. VEATIC has +124 video clips from Hollywood movies, documentaries, and home videos with +continuous valence and arousal ratings of each frame via real-time annotation. +Along with the dataset, we propose a new computer vision task to infer the +affect of the selected character via both context and character information in +each video frame. Additionally, we propose a simple model to benchmark this new +computer vision task. We also compare the performance of the pretrained model +using our dataset with other similar datasets. Experiments show the competitive +results of our model pretrained via VEATIC, indicating the generalizability of +VEATIC. Our dataset is available at https://veatic.github.io. +
+
+
+
+
+ + ♻ ☆ Instance Adaptive Prototypical Contrastive Embedding for Generalized + Zero Shot Learning IJCAI 2023 + + +
+ Generalized zero-shot learning(GZSL) aims to classify samples from seen and +unseen labels, assuming unseen labels are not accessible during training. +Recent advancements in GZSL have been expedited by incorporating +contrastive-learning-based (instance-based) embedding in generative networks +and leveraging the semantic relationship between data points. However, existing +embedding architectures suffer from two limitations: (1) limited +discriminability of synthetic features' embedding without considering +fine-grained cluster structures; (2) inflexible optimization due to restricted +scaling mechanisms on existing contrastive embedding networks, leading to +overlapped representations in the embedding space. To enhance the quality of +representations in the embedding space, as mentioned in (1), we propose a +margin-based prototypical contrastive learning embedding network that reaps the +benefits of prototype-data (cluster quality enhancement) and implicit data-data +(fine-grained representations) interaction while providing substantial cluster +supervision to the embedding network and the generator. To tackle (2), we +propose an instance adaptive contrastive loss that leads to generalized +representations for unseen labels with increased inter-class margin. Through +comprehensive experimental evaluation, we show that our method can outperform +the current state-of-the-art on three benchmark datasets. Our approach also +consistently achieves the best unseen performance in the GZSL setting. + +
+
+ comment: 7 pages, 4 figures. Accepted in IJCAI 2023 Workshop on Generalizing + from Limited Resources in the Open World +
+
+
+
+
+ + ♻ ☆ Reasoning with Language Model Prompting: A Survey ACL 2023 + + +
+ Reasoning, as an essential ability for complex problem-solving, can provide +back-end support for various real-world applications, such as medical +diagnosis, negotiation, etc. This paper provides a comprehensive survey of +cutting-edge research on reasoning with language model prompting. We introduce +research works with comparisons and summaries and provide systematic resources +to help beginners. We also discuss the potential reasons why such reasoning +abilities emerge and highlight future research directions. Resources are +available at https://github.com/zjunlp/Prompt4ReasoningPapers (updated +periodically). +
+
+ comment: ACL 2023, 24 pages, add references of theoretical analysis +
+
+
+
+
+ + ♻ ☆ Few shot font generation via transferring similarity guided global style + and quantization local style ICCV 2023 + + +
+ Automatic few-shot font generation (AFFG), aiming at generating new fonts +with only a few glyph references, reduces the labor cost of manually designing +fonts. However, the traditional AFFG paradigm of style-content disentanglement +cannot capture the diverse local details of different fonts. So, many +component-based approaches are proposed to tackle this problem. The issue with +component-based approaches is that they usually require special pre-defined +glyph components, e.g., strokes and radicals, which is infeasible for AFFG of +different languages. In this paper, we present a novel font generation approach +by aggregating styles from character similarity-guided global features and +stylized component-level representations. We calculate the similarity scores of +the target character and the referenced samples by measuring the distance along +the corresponding channels from the content features, and assigning them as the +weights for aggregating the global style features. To better capture the local +styles, a cross-attention-based style transfer module is adopted to transfer +the styles of reference glyphs to the components, where the components are +self-learned discrete latent codes through vector quantization without manual +definition. With these designs, our AFFG method could obtain a complete set of +component-level style representations, and also control the global glyph +characteristics. The experimental results reflect the effectiveness and +generalization of the proposed method on different linguistic scripts, and also +show its superiority when compared with other state-of-the-art methods. The +source code can be found at https://github.com/awei669/VQ-Font. + +
+
+ comment: Accepted by ICCV 2023 +
+
+
+
+
+ + ♻ ☆ Economical Quaternion Extraction from a Human Skeletal Pose Estimate + using 2-D Cameras + + +
+ In this paper, we present a novel algorithm to extract a quaternion from a +two-dimensional camera frame for estimating a contained human skeletal pose. +The problem of pose estimation is usually tackled through the usage of stereo +cameras and inertial measurement units for obtaining depth and Euclidean +distance for measuring points in 3D space. However, the usage of these +devices comes with high signal-processing latency as well as a significant +monetary cost. By making use of MediaPipe, a framework for building perception +pipelines for human pose estimation, the proposed algorithm extracts a +quaternion from a 2-D frame capturing an image of a human subject at a sub-fifty +millisecond latency, while also being capable of deployment on edge devices with a +single camera frame and generally low computational resources, +especially for use cases involving last-minute detection and reaction by +autonomous robots. The algorithm seeks to bypass the funding barrier and +improve accessibility for robotics researchers involved in designing control +systems. +
+
+ comment: This is the post-final version of the paper published with IEEE + CONECCT 2023 with some figure reference errors rectified +
+
+
+
+
+ + ♻ ☆ MER 2023: Multi-label Learning, Modality Robustness, and Semi-Supervised + Learning + + +
+ The first Multimodal Emotion Recognition Challenge (MER 2023) was +successfully held at ACM Multimedia. The challenge focuses on system robustness +and consists of three distinct tracks: (1) MER-MULTI, where participants are +required to recognize both discrete and dimensional emotions; (2) MER-NOISE, in +which noise is added to test videos for modality robustness evaluation; (3) +MER-SEMI, which provides a large amount of unlabeled samples for +semi-supervised learning. In this paper, we introduce the motivation behind +this challenge, describe the benchmark dataset, and provide some statistics +about participants. To continue using this dataset after MER 2023, please sign +a new End User License Agreement and send it to our official email address +merchallenge.contact@gmail.com. We believe this high-quality dataset can become +a new benchmark in multimodal emotion recognition, especially for the Chinese +research community. + +
+
+
+
+
+ + ♻ ☆ Preventing Unauthorized AI Over-Analysis by Medical Image Adversarial + Watermarking + + +
+ The advancement of deep learning has facilitated the integration of +Artificial Intelligence (AI) into clinical practices, particularly in +computer-aided diagnosis. Given the pivotal role of medical images in various +diagnostic procedures, it becomes imperative to ensure the responsible and +secure utilization of AI techniques. However, the unauthorized utilization of +AI for image analysis raises significant concerns regarding patient privacy and +potential infringement on the proprietary rights of data custodians. +Consequently, the development of pragmatic and cost-effective strategies that +safeguard patient privacy and uphold medical image copyrights emerges as a +critical necessity. In direct response to this pressing demand, we present a +pioneering solution named Medical Image Adversarial watermarking (MIAD-MARK). +Our approach introduces watermarks that strategically mislead unauthorized AI +diagnostic models, inducing erroneous predictions without compromising the +integrity of the visual content. Importantly, our method integrates an +authorization protocol tailored for legitimate users, enabling the removal of +the MIAD-MARK through encryption-generated keys. Through extensive experiments, +we validate the efficacy of MIAD-MARK across three prominent medical image +datasets. The empirical outcomes demonstrate the substantial impact of our +approach, notably reducing the accuracy of standard AI diagnostic models to a +mere 8.57% under white box conditions and 45.83% in the more challenging black +box scenario. Additionally, our solution effectively mitigates unauthorized +exploitation of medical images even in the presence of sophisticated watermark +removal networks. Notably, those AI diagnosis networks exhibit a meager average +accuracy of 38.59% when applied to images protected by MIAD-MARK, underscoring +the robustness of our safeguarding mechanism. + +
+
+
+
+
+ + ♻ ☆ Deep Nonparametric Convexified Filtering for Computational Photography, + Image Synthesis and Adversarial Defense + + +
+ We aim to provide a general framework for computational photography that +recovers the real scene from imperfect images via Deep Nonparametric +Convexified Filtering (DNCF). It consists of a nonparametric deep network that +resembles the physical equations behind image formation, such as denoising, +super-resolution, inpainting, and flash. DNCF has no parameterization dependent +on training data, and therefore has strong generalization and robustness to +adversarial image manipulation. During inference, we also encourage the network +parameters to be nonnegative and create a bi-convex function of the input and +parameters, which makes the problem amenable to second-order optimization algorithms +under limited running time, yielding a 10X acceleration over Deep Image Prior. With +these tools, we empirically verify its capability to defend image +classification deep networks against adversarial attack algorithms in real time. +
+
+
+
+
+ + ♻ ☆ SimpleNeRF: Regularizing Sparse Input Neural Radiance Fields with + Simpler Solutions SIGGRAPH + + +
+ Neural Radiance Fields (NeRF) show impressive performance for the +photorealistic free-view rendering of scenes. However, NeRFs require dense +sampling of images in the given scene, and their performance degrades +significantly when only a sparse set of views are available. Researchers have +found that supervising the depth estimated by the NeRF helps train it +effectively with fewer views. The depth supervision is obtained either using +classical approaches or neural networks pre-trained on a large dataset. While +the former may provide only sparse supervision, the latter may suffer from +generalization issues. As opposed to the earlier approaches, we seek to learn +the depth supervision by designing augmented models and training them along +with the NeRF. We design augmented models that encourage simpler solutions by +exploring the role of positional encoding and view-dependent radiance in +training the few-shot NeRF. The depth estimated by these simpler models is used +to supervise the NeRF depth estimates. Since the augmented models can be +inaccurate in certain regions, we design a mechanism to choose only reliable +depth estimates for supervision. Finally, we add a consistency loss between the +coarse and fine multi-layer perceptrons of the NeRF to ensure better +utilization of hierarchical sampling. We achieve state-of-the-art +view-synthesis performance on two popular datasets by employing the above +regularizations. The source code for our model can be found on our project +page: https://nagabhushansn95.github.io/publications/2023/SimpleNeRF.html + +
+
+ comment: SIGGRAPH Asia 2023 +
+
+
+
+
+ + ♻ ☆ Co-Teaching for Unsupervised Domain Adaptation and Expansion + + +
+ Unsupervised Domain Adaptation (UDA) essentially trades a model's performance +on a source domain for improving its performance on a target domain. To resolve +the issue, Unsupervised Domain Expansion (UDE) has been proposed recently. UDE +tries to adapt the model for the target domain as UDA does, and in the meantime +maintains its source-domain performance. In both UDA and UDE settings, a model +tailored to a given domain, be it the source or the target domain, is +assumed to handle samples from that domain well. We question this +assumption by reporting the existence of cross-domain visual ambiguity: given +the lack of a crystal-clear boundary between the two domains, samples from +one domain can be visually close to the other domain. Such samples are +typically in the minority in their host domain, so they tend to be overlooked by +the domain-specific model, but can be better handled by a model from the other +domain. We exploit this finding, and accordingly propose Co-Teaching (CT). The +CT method is instantiated with knowledge distillation based CT (kdCT) plus +mixup based CT (miCT). Specifically, kdCT transfers knowledge from a +leading-teacher network and an assistant-teacher network to a student network, +so the cross-domain ambiguity will be better handled by the student. Meanwhile, +miCT further enhances the generalization ability of the student. Extensive +experiments on two image classification datasets and two driving-scene +segmentation datasets justify the viability of CT for UDA and UDE. +
+
+
+
+
+ + ♻ ☆ ShaDocFormer: A Shadow-attentive Threshold Detector with Cascaded Fusion + Refiner for document shadow removal + + +
+ Document shadow is a common issue that arises when capturing documents using +mobile devices, and it significantly impacts readability. Current methods +encounter various challenges, including inaccurate detection of shadow masks and +estimation of illumination. In this paper, we propose ShaDocFormer, a +Transformer-based architecture that integrates traditional methodologies and +deep learning techniques to tackle the problem of document shadow removal. The +ShaDocFormer architecture comprises two components: the Shadow-attentive +Threshold Detector (STD) and the Cascaded Fusion Refiner (CFR). The STD module +employs a traditional thresholding technique and leverages the attention +mechanism of the Transformer to gather global information, thereby enabling +precise detection of shadow masks. The cascaded and aggregative structure of +the CFR module facilitates a coarse-to-fine restoration process for the entire +image. As a result, ShaDocFormer excels in accurately detecting and capturing +variations in both shadow and illumination, thereby enabling effective removal +of shadows. Extensive experiments demonstrate that ShaDocFormer outperforms +current state-of-the-art methods in both qualitative and quantitative +measurements. +
+
+
+
+
+ + ♻ ☆ RestNet: Boosting Cross-Domain Few-Shot Segmentation with Residual + Transformation Network BMVC 2023 + + +
+ Cross-domain few-shot segmentation (CD-FSS) aims to achieve semantic +segmentation in previously unseen domains with a limited number of annotated +samples. Although existing CD-FSS models focus on cross-domain feature +transformation, relying exclusively on inter-domain knowledge transfer may lead +to the loss of critical intra-domain information. To this end, we propose a +novel residual transformation network (RestNet) that facilitates knowledge +transfer while retaining the intra-domain support-query feature information. +Specifically, we propose a Semantic Enhanced Anchor Transform (SEAT) module +that maps features to a stable domain-agnostic space using advanced semantics. +Additionally, an Intra-domain Residual Enhancement (IRE) module is designed to +maintain the intra-domain representation of the original discriminant space in +the new space. We also propose a mask prediction strategy based on prototype +fusion to help the model gradually learn how to segment. Our RestNet can +transfer cross-domain knowledge from both inter-domain and intra-domain without +requiring additional fine-tuning. Extensive experiments on ISIC, Chest X-ray, +and FSS-1000 show that our RestNet achieves state-of-the-art performance. Our +code will be available soon. + +
+
+ comment: BMVC 2023 +
+
+
+
+
+
+
+
+ + Information Retrieval 16 + +
+
+
+ + ☆ Ambiguity-Aware In-Context Learning with Large Language Models + + +
+ In-context learning (ICL), i.e., showing LLMs only a few task-specific +demonstrations, has led to downstream gains with no task-specific fine-tuning +required. However, LLMs are sensitive to the choice of prompts, and therefore a +crucial research question is how to select good demonstrations for ICL. One +effective strategy is leveraging semantic similarity between the ICL +demonstrations and test inputs by using a text retriever, which, however, is +sub-optimal as it does not consider the LLM's existing knowledge about that +task. From prior work (Min et al., 2022), we already know that labels paired +with the demonstrations bias the model predictions. This leads to our +hypothesis that considering the LLM's existing knowledge about the task, +especially with respect to the output label space, can help in designing a better +demonstration selection strategy. Through extensive experimentation on three +text classification tasks, we find that it is beneficial to not only choose +semantically similar ICL demonstrations but also to choose those demonstrations +that help resolve the inherent label ambiguity surrounding the test example. +Interestingly, we find that including demonstrations that the LLM previously +mis-classified and that also fall on the test example's decision boundary brings +the most performance gain. +
+
+ comment: 13 pages in total +
+
+
+
+
+ + ☆ NineRec: A Benchmark Dataset Suite for Evaluating Transferable + Recommendation + + +
+ Learning a recommender system model from an item's raw modality features +(such as image, text, audio, etc.), called MoRec, has attracted growing +interest recently. One key advantage of MoRec is that it can easily benefit +from advances in other fields, such as natural language processing (NLP) and +computer vision (CV). Moreover, it naturally supports transfer learning across +different systems through modality features, known as transferable recommender +systems, or TransRec. + However, so far, TransRec has made little progress, compared to +groundbreaking foundation models in the fields of NLP and CV. The lack of +large-scale, high-quality recommendation datasets poses a major obstacle. To +this end, we introduce NineRec, a TransRec dataset suite that includes a +large-scale source domain recommendation dataset and nine diverse target domain +recommendation datasets. Each item in NineRec is represented by a text +description and a high-resolution cover image. With NineRec, we can implement +TransRec models in an end-to-end training manner instead of using pre-extracted +invariant features. We conduct a benchmark study and empirical analysis of +TransRec using NineRec, and our findings provide several valuable insights. To +support further research, we make our code, datasets, benchmarks, and +leaderboards publicly available at +https://github.com/anonymous?ninerec/NineRec. + +
+
+
+
+
+ + ☆ A Conversation is Worth A Thousand Recommendations: A Survey of Holistic + Conversational Recommender Systems RecSys 2023 + + +
+ Conversational recommender systems (CRS) generate recommendations through an +interactive process. However, not all CRS approaches use human conversations as +their source of interaction data; the majority of prior CRS work simulates +interactions by exchanging entity-level information. As a result, claims of +prior CRS work do not generalise to real-world settings where conversations +take unexpected turns, or where conversational and intent understanding is not +perfect. To tackle this challenge, the research community has started to +examine holistic CRS, which are trained using conversational data collected +from real-world scenarios. Despite their emergence, such holistic approaches +are under-explored. + We present a comprehensive survey of holistic CRS methods by summarizing the +literature in a structured manner. Our survey recognises holistic CRS +approaches as having three components: 1) a backbone language model, the +optional use of 2) external knowledge, and/or 3) external guidance. We also +give a detailed analysis of CRS datasets and evaluation methods in real +application scenarios. We offer our insight as to the current challenges of +holistic CRS and possible future trends. + +
+
+ comment: Accepted by 5th KaRS Workshop @ ACM RecSys 2023, 8 pages +
+
+
+
+
+ + ☆ Feature Engineering in Learning-to-Rank for Community Question Answering + Task + + +
+ Community question answering (CQA) forums are Internet-based platforms where +users ask questions about a topic and other expert users try to provide +solutions. Many CQA forums such as Quora, Stackoverflow, Yahoo!Answer, and +StackExchange exist with a lot of user-generated data. These data are leveraged +in automated CQA ranking systems where similar questions (and answers) are +presented in response to the query of the user. In this work, we empirically +investigate a few aspects of this domain. Firstly, in addition to traditional +features like TF-IDF, BM25, etc., we introduce a BERT-based feature that +captures the semantic similarity between the question and answer. Secondly, +most of the existing research works have focused on features extracted only +from the question part; features extracted from answers have not been explored +extensively. We combine both types of features in a linear fashion. Thirdly, +using our proposed concepts, we conduct an empirical investigation with +different rank-learning algorithms, some of which have not been used so far in +the CQA domain. On three standard CQA datasets, our proposed framework achieves +state-of-the-art performance. We also analyze the importance of the features we use +in our investigation. This work is expected to guide practitioners in +selecting a better set of features for the CQA retrieval task. +
+
+ comment: 20 pages +
+
+
+
+
+ + ☆ Zero-shot Audio Topic Reranking using Large Language Models + + +
+ The Multimodal Video Search by Examples (MVSE) project investigates using +video clips as the query term for information retrieval, rather than the more +traditional text query. This enables far richer search modalities such as +images, speaker, content, topic, and emotion. A key element for this process is +highly rapid, flexible, search to support large archives, which in MVSE is +facilitated by representing video attributes by embeddings. This work aims to +mitigate any performance loss from this rapid archive search by examining +reranking approaches. In particular, zero-shot reranking methods using large +language models are investigated as these are applicable to any video archive +audio content. Performance is evaluated for topic-based retrieval on a publicly +available video archive, the BBC Rewind corpus. Results demonstrate that +reranking can achieve improved retrieval ranking without the need for any +task-specific training data. + +
+
+
+
+
+ + ☆ Turning Dross Into Gold Loss: is BERT4Rec really better than SASRec? + + +
+ Recently, sequential recommendation and the next-item prediction task have become +increasingly popular in the field of recommender systems. Currently, the two +state-of-the-art baselines are the Transformer-based models SASRec and BERT4Rec. +Over the past few years, there have been quite a few publications comparing +these two algorithms and proposing new state-of-the-art models. In most of the +publications, BERT4Rec achieves better performance than SASRec. However, BERT4Rec +uses cross-entropy over a softmax for all items, while SASRec uses negative +sampling and calculates binary cross-entropy loss for one positive and one +negative item. In our work, we show that if both models are trained with the +same loss, which is used by BERT4Rec, then SASRec will significantly outperform +BERT4Rec both in terms of quality and training speed. In addition, we show that +SASRec can be effectively trained with negative sampling and still outperform +BERT4Rec, but the number of negative examples should be much larger than one. +
+
+
+
+
+ + ☆ C-Pack: Packaged Resources To Advance General Chinese Embedding + + +
+ We introduce C-Pack, a package of resources that significantly advance the +field of general Chinese embeddings. C-Pack includes three critical resources. +1) C-MTEB is a comprehensive benchmark for Chinese text embeddings covering 6 +tasks and 35 datasets. 2) C-MTP is a massive text embedding dataset curated +from labeled and unlabeled Chinese corpora for training embedding models. 3) +C-TEM is a family of embedding models covering multiple sizes. Our models +outperform all prior Chinese text embeddings on C-MTEB by up to +10% upon the +time of the release. We also integrate and optimize the entire suite of +training methods for C-TEM. Along with our resources on general Chinese +embedding, we release our data and models for English text embeddings. The +English models achieve state-of-the-art performance on MTEB benchmark; +meanwhile, our released English data is 2 times larger than the Chinese data. +All these resources are made publicly available at +https://github.com/FlagOpen/FlagEmbedding. + +
+
+
+
+
+ + ☆ Neuro-Symbolic Recommendation Model based on Logic Query + + +
+ A recommendation system assists users in finding items that are relevant to +them. Existing recommendation models are primarily based on predicting +relationships between users and items and use complex matching models or +incorporate extensive external information to capture association patterns in +data. However, recommendation is not only a problem of inductive statistics +using data; it is also a cognitive task of reasoning decisions based on +knowledge extracted from information. Hence, a logic system could naturally be +incorporated for the reasoning in a recommendation task. However, although +hard-rule approaches based on logic systems can provide powerful reasoning +ability, they struggle to cope with inconsistent and incomplete knowledge in +real-world tasks, especially for complex tasks such as recommendation. +Therefore, in this paper, we propose a neuro-symbolic recommendation model, +which transforms the user history interactions into a logic expression and then +transforms the recommendation prediction into a query task based on this logic +expression. The logic expressions are then computed based on the modular logic +operations of the neural network. We also construct an implicit logic encoder +to reasonably reduce the complexity of the logic computation. Finally, a user's +interest items can be queried in the vector space based on the computation +results. Experiments on three well-known datasets verified that our method +performs better compared to state of the art shallow, deep, session, and +reasoning models. + +
+
+ comment: 17 pages, 6 figures +
+
+
+
+
+ + ☆ MMEAD: MS MARCO Entity Annotations and Disambiguations + + +
+ MMEAD, or MS MARCO Entity Annotations and Disambiguations, is a resource for +entity links for the MS MARCO datasets. We specify a format to store and share +links for both document and passage collections of MS MARCO. Following this +specification, we release entity links to Wikipedia for documents and passages +in both MS MARCO collections (v1 and v2). Entity links have been produced by +the REL and BLINK systems. MMEAD is an easy-to-install Python package, allowing +users to load the link data and entity embeddings effortlessly. Using MMEAD +takes only a few lines of code. Finally, we show how MMEAD can be used for IR +research that uses entity information. We show how to improve recall@1000 and +MRR@10 on more complex queries on the MS MARCO v1 passage dataset by using this +resource. We also demonstrate how entity expansions can be used for interactive +search applications. + +
+
+
+
+
+ + ♻ ☆ DisenPOI: Disentangling Sequential and Geographical Influence for + Point-of-Interest Recommendation WSDM'23 + + +
+ Point-of-Interest (POI) recommendation plays a vital role in various +location-aware services. It has been observed that POI recommendation is driven +by both sequential and geographical influences. However, since there is no +annotated label of the dominant influence during recommendation, existing +methods tend to entangle these two influences, which may lead to sub-optimal +recommendation performance and poor interpretability. In this paper, we address +the above challenge by proposing DisenPOI, a novel Disentangled dual-graph +framework for POI recommendation, which jointly utilizes sequential and +geographical relationships on two separate graphs and disentangles the two +influences with self-supervision. The key novelty of our model compared with +existing approaches is to extract disentangled representations of both +sequential and geographical influences with contrastive learning. To be +specific, we construct a geographical graph and a sequential graph based on the +check-in sequence of a user. We tailor their propagation schemes to become +sequence-/geo-aware to better capture the corresponding influences. Preference +proxies are extracted from check-in sequence as pseudo labels for the two +influences, which supervise the disentanglement via a contrastive loss. +Extensive experiments on three datasets demonstrate the superiority of the +proposed model. + +
+
+ comment: Accepted by ACM International Conference on Web Search and Data + Mining (WSDM'23) +
+
+
+
+
+ + ♻ ☆ Modern Baselines for SPARQL Semantic Parsing SIGIR 2022 + + +
+ In this work, we focus on the task of generating SPARQL queries from natural +language questions, which can then be executed on Knowledge Graphs (KGs). We +assume that gold entities and relations have been provided, and the remaining +task is to arrange them in the right order along with SPARQL vocabulary, and +input tokens to produce the correct SPARQL query. Pre-trained Language Models +(PLMs) have not been explored in depth on this task so far, so we experiment +with BART, T5 and PGNs (Pointer Generator Networks) with BERT embeddings, +looking for new baselines in the PLM era for this task, on DBpedia and Wikidata +KGs. We show that T5 requires special input tokenisation, but produces +state-of-the-art performance on the LC-QuAD 1.0 and LC-QuAD 2.0 datasets, and outperforms +task-specific models from previous works. Moreover, the methods enable semantic +parsing for questions where a part of the input needs to be copied to the +output query, thus enabling a new paradigm in KG semantic parsing. +
+
+ comment: 5 pages, short paper, SIGIR 2022 +
+
+
+
+
+ + ♻ ☆ LambdaKG: A Library for Pre-trained Language Model-Based Knowledge Graph + Embeddings AACL 2023 + + +
+ Knowledge Graphs (KGs) often have two characteristics: heterogeneous graph +structure and text-rich entity/relation information. Text-based KG embeddings +can represent entities by encoding descriptions with pre-trained language +models, but no open-sourced library is specifically designed for KGs with PLMs +at present. In this paper, we present LambdaKG, a library for knowledge graph +embeddings (KGE) that is equipped with many pre-trained language models (e.g., +BERT, BART, T5, GPT-3) and supports various tasks (e.g., knowledge graph +completion, question answering, recommendation, and knowledge probing). +LambdaKG is publicly open-sourced at +https://github.com/zjunlp/PromptKG/tree/main/lambdaKG, with a demo video at +http://deepke.zjukg.cn/lambdakg.mp4 and long-term maintenance. + +
+
+ comment: AACL 2023 System Demonstrations, the project website is + https://zjunlp.github.io/project/promptkg/ +
+
+
+
+
+ + ♻ ☆ Exploring Music Genre Classification: Algorithm Analysis and Deployment + Architecture + + +
+ Music genre classification has become increasingly critical with the advent +of various streaming applications. Nowadays, it is hard to imagine relying only +on the artist's name and song title to search for music in a sophisticated +music app. Classifying music correctly is always difficult because the +information linked to it, such as region, artist, album, or non-album status, +is highly variable. This paper presents a study on music genre classification +using a combination of Digital Signal Processing (DSP) and Deep Learning (DL) +techniques. A novel algorithm is proposed that utilizes both DSP and DL methods +to extract relevant features from audio signals and classify them into various +genres. The algorithm was tested on the GTZAN dataset and achieved high +accuracy. An end-to-end deployment architecture is also proposed for +integration into music-related applications. The performance of the algorithm +is analyzed and future directions for improvement are discussed. The proposed +DSP and DL-based music genre classification algorithm and deployment +architecture demonstrate a promising approach for music genre classification. + +
+
+
+
+
+ + ♻ ☆ Reasoning with Language Model Prompting: A Survey ACL 2023 + + +
+ Reasoning, as an essential ability for complex problem-solving, can provide +back-end support for various real-world applications, such as medical +diagnosis, negotiation, etc. This paper provides a comprehensive survey of +cutting-edge research on reasoning with language model prompting. We introduce +research works with comparisons and summaries and provide systematic resources +to help beginners. We also discuss potential reasons why such reasoning +abilities emerge and highlight future research directions. Resources are +available at https://github.com/zjunlp/Prompt4ReasoningPapers (updated +periodically). + +
+
+ comment: ACL 2023, 24 pages, add references of theoretical analysis +
+
+
+
+
+ + ♻ ☆ CPMR: Context-Aware Incremental Sequential Recommendation with + Pseudo-Multi-Task Learning CIKM 2023 + + +
+ The motivations of users to make interactions can be divided into static +preference and dynamic interest. To accurately model user representations over +time, recent studies in sequential recommendation utilize information +propagation and evolution to mine from batches of arriving interactions. +However, they ignore the fact that people are easily influenced by the recent +actions of other users in the contextual scenario, and applying evolution +across all historical interactions dilutes the importance of recent ones, thus +failing to model the evolution of dynamic interest accurately. To address this +issue, we propose a Context-Aware Pseudo-Multi-Task Recommender System (CPMR) +to model the evolution in both historical and contextual scenarios by creating +three representations for each user and item under different dynamics: static +embedding, historical temporal states, and contextual temporal states. To +dually improve the performance of temporal states evolution and incremental +recommendation, we design a Pseudo-Multi-Task Learning (PMTL) paradigm by +stacking the incremental single-target recommendations into one multi-target +task for joint optimization. Within the PMTL paradigm, CPMR employs a +shared-bottom network to conduct the evolution of temporal states across +historical and contextual scenarios, as well as the fusion of them at the +user-item level. In addition, CPMR incorporates one real tower for incremental +predictions, and two pseudo towers dedicated to updating the respective +temporal states based on new batches of interactions. Experimental results on +four benchmark recommendation datasets show that CPMR consistently outperforms +state-of-the-art baselines and achieves significant gains on three of them. The +code is available at: https://github.com/DiMarzioBian/CPMR. + +
+
+ comment: Accepted by CIKM 2023. Alias: "Modeling Context-Aware Temporal + Dynamics via Pseudo-Multi-Task Learning" +
+
+
+
+
+ + ♻ ☆ A Diffusion model for POI recommendation + + +
+ Next Point-of-Interest (POI) recommendation is a critical task in +location-based services that aim to provide personalized suggestions for the +user's next destination. Previous works on POI recommendation have largely +focused on modeling the user's spatial preference. However, existing works that +leverage spatial information are only based on the aggregation of users' +previously visited positions, which discourages the model from recommending +POIs in novel areas. This trait of position-based methods will harm the model's +performance in many situations. Additionally, incorporating sequential +information into the user's spatial preference remains a challenge. In this +paper, we propose Diff-POI: a Diffusion-based model that samples the user's +spatial preference for the next POI recommendation. Inspired by the wide +application of diffusion algorithms in sampling from distributions, Diff-POI +encodes the user's visiting sequence and spatial character with two +tailor-designed graph encoding modules, followed by a diffusion-based sampling +strategy to explore the user's spatial visiting trends. We leverage the +diffusion process and its reversed form to sample from the posterior +distribution and optimize the corresponding score function. We design a joint +training and inference framework to optimize and evaluate the proposed +Diff-POI. Extensive experiments on four real-world POI recommendation datasets +demonstrate the superiority of our Diff-POI over state-of-the-art baseline +methods. Further ablation and parameter studies on Diff-POI reveal the +functionality and effectiveness of the proposed diffusion-based sampling +strategy for addressing the limitations of existing methods. + +
+
+ comment: Accepted by ACM Transactions on Information Systems (TOIS 2023) +
+
+
+
+
+
+
+
+ + Machine Learning 105 + +
+
+
+ + ☆ Physically Plausible Full-Body Hand-Object Interaction Synthesis + + +
+ We propose a physics-based method for synthesizing dexterous hand-object +interactions in a full-body setting. While recent advancements have addressed +specific facets of human-object interactions, a comprehensive physics-based +approach remains a challenge. Existing methods often focus on isolated segments +of the interaction process and rely on data-driven techniques that may result +in artifacts. In contrast, our proposed method embraces reinforcement learning +(RL) and physics simulation to mitigate the limitations of data-driven +approaches. Through a hierarchical framework, we first learn skill priors for +both body and hand movements in a decoupled setting. The generic skill priors +learn to decode a latent skill embedding into the motion of the underlying +part. A high-level policy then controls hand-object interactions in these +pretrained latent spaces, guided by task objectives of grasping and 3D target +trajectory following. It is trained using a novel reward function that combines +an adversarial style term with a task reward, encouraging natural motions while +fulfilling the task incentives. Our method successfully accomplishes the +complete interaction task, from approaching an object to grasping and +subsequent manipulation. We compare our approach against kinematics-based +baselines and show that it leads to more physically plausible motions. + +
+
+ comment: Project page at https://eth-ait.github.io/phys-fullbody-grasp +
+
+
+
+
+ + ☆ Improving physics-informed DeepONets with hard constraints + + +
+ Current physics-informed (standard or operator) neural networks still rely on +accurately learning the initial conditions of the system they are solving. In +contrast, standard numerical methods evolve such initial conditions without +needing to learn these. In this study, we propose to improve current +physics-informed deep learning strategies such that initial conditions do not +need to be learned and are represented exactly in the predicted solution. +Moreover, this method guarantees that when a DeepONet is applied multiple times +to time step a solution, the resulting function is continuous. + +
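One standard way to represent an initial condition exactly rather than learn it is to build it into the ansatz so that the network's contribution vanishes at t = 0. The sketch below illustrates that generic construction; it is not claimed to be the paper's exact formulation.

```python
# Sketch: hard-constraining the initial condition u(x, 0) = u0(x) so it never
# has to be learned.  The ansatz u(x, t) = u0(x) + t * N(x, t) is a generic
# construction, not necessarily the exact one used in the paper.
import numpy as np

def u0(x):
    return np.sin(np.pi * x)      # example initial condition

def network(x, t):
    return np.tanh(x + t)         # stand-in for a trained (Deep)ONet output

def u_hat(x, t):
    # At t = 0 the second term vanishes, so the IC holds exactly by construction.
    return u0(x) + t * network(x, t)

x = np.linspace(0.0, 1.0, 5)
print(np.allclose(u_hat(x, 0.0), u0(x)))   # True: IC satisfied exactly
```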
+
+ comment: 15 pages, 5 figures, 4 tables; release version +
+
+
+
+
+ + ☆ Choosing a Proxy Metric from Past Experiments + + +
+ In many randomized experiments, the treatment effect of the long-term metric +(i.e. the primary outcome of interest) is often difficult or infeasible to +measure. Such long-term metrics are often slow to react to changes and +sufficiently noisy that they are challenging to estimate faithfully in +short-horizon experiments. A common alternative is to measure several +short-term proxy metrics in the hope they closely track the long-term metric -- +so they can be used to effectively guide decision-making in the near-term. We +introduce a new statistical framework to both define and construct an optimal +proxy metric for use in a homogeneous population of randomized experiments. Our +procedure first reduces the construction of an optimal proxy metric in a given +experiment to a portfolio optimization problem which depends on the true latent +treatment effects and noise level of the experiment under consideration. We +then denoise the observed treatment effects of the long-term metric and a set +of proxies in a historical corpus of randomized experiments to extract +estimates of the latent treatment effects for use in the optimization problem. +One key insight derived from our approach is that the optimal proxy metric for +a given experiment is not fixed a priori; rather, it should depend on the +sample size (or effective noise level) of the randomized experiment for which +it is deployed. To instantiate and evaluate our framework, we employ our +methodology in a large corpus of randomized experiments from an industrial +recommendation system and construct proxy metrics that perform favorably +relative to several baselines. + +
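A rough sketch of the underlying idea: combine several short-term proxies so that the combination tracks the (denoised) long-term treatment effect across a corpus of past experiments. Plain least squares stands in for the paper's portfolio-style optimization, and all data below are synthetic.

```python
# Sketch: learn weights over short-term proxy metrics so their combination
# tracks the long-term metric's treatment effect across past experiments.
# Plain least squares is used for illustration only.
import numpy as np

rng = np.random.default_rng(0)
n_experiments = 200

true_effect = rng.normal(size=n_experiments)                 # latent long-term effects
proxies = np.column_stack([true_effect + rng.normal(scale=s, size=n_experiments)
                           for s in (0.3, 0.7, 1.5)])        # increasingly noisy proxies
long_term = true_effect + rng.normal(scale=1.0, size=n_experiments)

weights, *_ = np.linalg.lstsq(proxies, long_term, rcond=None)
composite = proxies @ weights
print("proxy weights:", np.round(weights, 2))
print("corr(composite, long-term):", np.round(np.corrcoef(composite, long_term)[0, 1], 3))
```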
+
+
+
+
+ + ☆ A Novel Local-Global Feature Fusion Framework for Body-weight Exercise + Recognition with Pressure Mapping Sensors + + +
+ We present a novel local-global feature fusion framework for body-weight +exercise recognition with floor-based dynamic pressure maps. Going one step +further than existing studies, which use deep neural networks mainly focused on +global feature extraction, the proposed framework aims to combine local and +global features using image processing techniques and YOLO object detection to +localize pressure profiles from different body parts and consider physical +constraints. The proposed local feature extraction method generates two sets of +high-level local features consisting of cropped pressure mappings and numerical +features such as angular orientation, location on the mat, and pressure area. +In addition, we adopt knowledge distillation for regularization to preserve the +knowledge of the global feature extraction and improve the performance of +exercise recognition. Our experimental results demonstrate a notable 11 percent +improvement in F1 score for exercise recognition while preserving +label-specific features. + +
+
+
+
+
+ + ☆ Some notes concerning a generalized KMM-type optimization method for + density ratio estimation + + +
+ In the present paper we introduce new optimization algorithms for the task of +density ratio estimation. More precisely, we consider extending the well-known +KMM method using the construction of a suitable loss function, in order to +encompass more general situations involving the estimation of density ratio +with respect to subsets of the training data and test data, respectively. The +associated codes can be found at https://github.com/CDAlecsa/Generalized-KMM. + +
+
+ comment: 17 pages, 4 figures +
+
+
+
+
+ + ☆ Beta Diffusion + + +
+ We introduce beta diffusion, a novel generative modeling method that +integrates demasking and denoising to generate data within bounded ranges. +Using scaled and shifted beta distributions, beta diffusion utilizes +multiplicative transitions over time to create both forward and reverse +diffusion processes, maintaining beta distributions in both the forward +marginals and the reverse conditionals, given the data at any point in time. +Unlike traditional diffusion-based generative models relying on additive +Gaussian noise and reweighted evidence lower bounds (ELBOs), beta diffusion is +multiplicative and optimized with KL-divergence upper bounds (KLUBs) derived +from the convexity of the KL divergence. We demonstrate that the proposed KLUBs +are more effective for optimizing beta diffusion compared to negative ELBOs, +which can also be derived as the KLUBs of the same KL divergence with its two +arguments swapped. The loss function of beta diffusion, expressed in terms of +Bregman divergence, further supports the efficacy of KLUBs for optimization. +Experimental results on both synthetic data and natural images demonstrate the +unique capabilities of beta diffusion in generative modeling of range-bounded +data and validate the effectiveness of KLUBs in optimizing diffusion models, +thereby making them valuable additions to the family of diffusion-based +generative models and the optimization techniques used to train them. + +
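Since the forward marginals and reverse conditionals are beta distributions, KL divergences between beta distributions are a natural building block for such bounds. A small helper using the standard closed form is sketched below; the full KLUB construction of the paper is not reproduced.

```python
# Building block for KLUB-style objectives: KL divergence between two beta
# distributions in closed form.  How these terms combine into the paper's full
# KL-divergence upper bound (KLUB) is not reproduced here.
from scipy.special import betaln, digamma

def kl_beta(a1, b1, a2, b2):
    """KL( Beta(a1, b1) || Beta(a2, b2) )."""
    return (betaln(a2, b2) - betaln(a1, b1)
            + (a1 - a2) * digamma(a1)
            + (b1 - b2) * digamma(b1)
            + (a2 - a1 + b2 - b1) * digamma(a1 + b1))

print(kl_beta(2.0, 5.0, 2.0, 5.0))   # 0.0 for identical distributions
print(kl_beta(2.0, 5.0, 4.0, 3.0))   # positive for different distributions
```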
+
+
+
+
+ + ☆ Identifying the Group-Theoretic Structure of Machine-Learned Symmetries + + +
+ Deep learning was recently successfully used in deriving symmetry +transformations that preserve important physics quantities. Being completely +agnostic, these techniques postpone the identification of the discovered +symmetries to a later stage. In this letter we propose methods for examining +and identifying the group-theoretic structure of such machine-learned +symmetries. We design loss functions which probe the subalgebra structure +either during the deep learning stage of symmetry discovery or in a subsequent +post-processing stage. We illustrate the new methods with examples from the +U(n) Lie group family, obtaining the respective subalgebra decompositions. As +an application to particle physics, we demonstrate the identification of the +residual symmetries after the spontaneous breaking of non-Abelian gauge +symmetries like SU(3) and SU(5) which are commonly used in model building. + +
+
+ comment: 10 pages, 8 figures, 2 tables +
+
+
+
+
+ + ☆ Learning to Warm-Start Fixed-Point Optimization Algorithms + + +
+ We introduce a machine-learning framework to warm-start fixed-point +optimization algorithms. Our architecture consists of a neural network mapping +problem parameters to warm starts, followed by a predefined number of +fixed-point iterations. We propose two loss functions designed to either +minimize the fixed-point residual or the distance to a ground truth solution. +In this way, the neural network predicts warm starts with the end-to-end goal +of minimizing the downstream loss. An important feature of our architecture is +its flexibility, in that it can predict a warm start for fixed-point algorithms +run for any number of steps, without being limited to the number of steps it +has been trained on. We provide PAC-Bayes generalization bounds on unseen data +for common classes of fixed-point operators: contractive, linearly convergent, +and averaged. Applying this framework to well-known applications in control, +statistics, and signal processing, we observe a significant reduction in the +number of iterations and solution time required to solve these problems, +through learned warm starts. + +
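A minimal sketch of the two halves of such an architecture: a learned map from problem parameters to a warm start, followed by differentiable fixed-point iterations with the residual as the training loss. The toy operator and linear predictor below are placeholders, not the paper's setup.

```python
# Sketch: a learned warm start followed by a fixed number of fixed-point
# iterations; the training loss is the final fixed-point residual.
import torch

def fixed_point_op(z, theta):
    # Contractive toy operator T_theta(z); in practice this could be e.g. a
    # proximal or projection step of an optimization algorithm.
    return 0.5 * z + theta

warm_start_net = torch.nn.Linear(1, 1)          # maps parameters theta -> warm start z0
opt = torch.optim.Adam(warm_start_net.parameters(), lr=1e-2)

for step in range(200):
    theta = torch.rand(64, 1)                   # batch of problem parameters
    z = warm_start_net(theta)                   # predicted warm start
    for _ in range(5):                          # k differentiable fixed-point iterations
        z = fixed_point_op(z, theta)
    loss = ((fixed_point_op(z, theta) - z) ** 2).mean()   # fixed-point residual
    opt.zero_grad(); loss.backward(); opt.step()

print(float(loss))
```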
+
+
+
+
+ + ☆ Directed Scattering for Knowledge Graph-based Cellular Signaling + Analysis + + +
+ Directed graphs are a natural model for many phenomena, in particular +scientific knowledge graphs such as molecular interaction or chemical reaction +networks that define cellular signaling relationships. In these situations, +source nodes typically have distinct biophysical properties from sinks. Due to +their ordered and unidirectional relationships, many such networks also have +hierarchical and multiscale structure. However, the majority of methods +performing node- and edge-level tasks in machine learning do not take these +properties into account, and thus have not been leveraged effectively for +scientific tasks such as cellular signaling network inference. We propose a new +framework called Directed Scattering Autoencoder (DSAE) which uses a directed +version of a geometric scattering transform, combined with the non-linear +dimensionality reduction properties of an autoencoder and the geometric +properties of the hyperbolic space to learn latent hierarchies. We show this +method outperforms numerous others on tasks such as embedding directed graphs +and learning cellular signaling networks. + +
+
+ comment: 5 pages, 3 figures +
+
+
+
+
+ + ☆ Text Classification of Cancer Clinical Trial Eligibility Criteria + + +
+ Automatic identification of clinical trials for which a patient is eligible +is complicated by the fact that trial eligibility is stated in natural +language. A potential solution to this problem is to employ text classification +methods for common types of eligibility criteria. In this study, we focus on +seven common exclusion criteria in cancer trials: prior malignancy, human +immunodeficiency virus, hepatitis B, hepatitis C, psychiatric illness, +drug/substance abuse, and autoimmune illness. Our dataset consists of 764 phase +III cancer trials with these exclusions annotated at the trial level. We +experiment with common transformer models as well as a new pre-trained clinical +trial BERT model. Our results demonstrate the feasibility of automatically +classifying common exclusion criteria. Additionally, we demonstrate the value +of a pre-trained language model specifically for clinical trials, which yields +the highest average performance across all criteria. + +
+
+ comment: AMIA Annual Symposium Proceedings 2023 +
+
+
+
+
+ + ☆ Communication Efficient Private Federated Learning Using Dithering + + +
+ The task of preserving privacy while ensuring efficient communication is a +fundamental challenge in federated learning. In this work, we tackle this +challenge in the trusted aggregator model, and propose a solution that achieves +both objectives simultaneously. We show that employing a quantization scheme +based on subtractive dithering at the clients can effectively replicate the +normal noise addition process at the aggregator. This implies that we can +guarantee the same level of differential privacy against other clients while +substantially reducing the amount of communication required, as opposed to +transmitting full precision gradients and using central noise addition. We also +experimentally demonstrate that the accuracy of our proposed approach matches +that of the full precision gradient method. + +
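The mechanics of subtractive dithering are easy to illustrate numerically: the client adds a dither shared with the aggregator before quantizing, the aggregator subtracts it, and the effective error behaves like uniform noise independent of the gradient. The step size and shared-randomness setup below are illustrative; the paper's privacy accounting is not reproduced.

```python
# Sketch: subtractive dithered quantization.  The client adds a shared uniform
# dither before quantizing; the aggregator subtracts the same dither, so the
# effective error is uniform noise independent of the gradient.
import numpy as np

rng = np.random.default_rng(0)
step = 0.1                                   # quantization step size (assumed)
grad = rng.normal(size=1000)                 # client gradient

dither = rng.uniform(-step / 2, step / 2, size=grad.shape)   # shared with aggregator
quantized = step * np.round((grad + dither) / step)          # sent over the channel
recovered = quantized - dither                               # aggregator subtracts dither

error = recovered - grad
print("max |error| <= step/2:", np.max(np.abs(error)) <= step / 2 + 1e-12)
print("error std vs step/sqrt(12):", error.std(), step / np.sqrt(12))
```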
+
+
+
+
+ + ☆ What Matters to Enhance Traffic Rule Compliance of Imitation Learning + for Automated Driving + + +
+ More research attention has recently been given to end-to-end autonomous +driving technologies, where the entire driving pipeline is replaced with a +single neural network, because of their simpler structure and faster inference +time. Although this appealing approach largely reduces the number of components +in the driving pipeline, its simplicity also leads to interpretability problems +and safety issues (arXiv:2003.06404). The trained policy is not always +compliant with traffic rules, and it is also hard to discover the reason for +misbehavior because of the lack of intermediate outputs. Meanwhile, sensors are +also critical to the security and feasibility of autonomous driving, as they +perceive the surrounding environment under complex driving scenarios. In this +paper, we propose P-CSG, a novel penalty-based imitation learning approach with +cross-semantics generation sensor fusion to increase the overall performance of +end-to-end autonomous driving. We conducted an assessment of our model's +performance using the Town 05 Long benchmark, achieving an impressive driving +score improvement of over 15%. Furthermore, we conducted robustness evaluations +against adversarial attacks like FGSM and Dot attacks, revealing a substantial +increase in robustness compared to baseline models. More detailed information, +such as code-based resources, ablation studies and videos, can be found at +https://hk-zh.github.io/p-csg-plus. + +
+
+ comment: 8 pages, 2 figures +
+
+
+
+
+ + ☆ Improving Multimodal Classification of Social Media Posts by Leveraging + Image-Text Auxiliary tasks + + +
+ Effectively leveraging multimodal information from social media posts is +essential to various downstream tasks such as sentiment analysis, sarcasm +detection and hate speech classification. However, combining text and image +information is challenging because of the idiosyncratic cross-modal semantics +with hidden or complementary information present in matching image-text pairs. +In this work, we aim to directly model this by proposing the use of two +auxiliary losses jointly with the main task when fine-tuning any pre-trained +multimodal model. Image-Text Contrastive (ITC) brings image-text +representations of a post closer together and separates them from different +posts, capturing underlying dependencies. Image-Text Matching (ITM) facilitates +the understanding of semantic correspondence between images and text by +penalizing unrelated pairs. We combine these objectives with five multimodal +models, demonstrating consistent improvements across four popular social media +datasets. Furthermore, through detailed analysis, we shed light on the specific +scenarios and cases where each auxiliary task proves to be most effective. + +
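For reference, a minimal sketch of the image-text contrastive (ITC) objective in its standard in-batch InfoNCE form; embedding dimensions and temperature are assumptions, and the ITM head is omitted.

```python
# Sketch: in-batch image-text contrastive (ITC) loss.  Matching image-text
# pairs sit on the diagonal of the similarity matrix; all other pairs in the
# batch act as negatives.  Temperature and dimensions are assumptions.
import torch
import torch.nn.functional as F

def itc_loss(image_emb, text_emb, temperature=0.07):
    image_emb = F.normalize(image_emb, dim=-1)
    text_emb = F.normalize(text_emb, dim=-1)
    logits = image_emb @ text_emb.t() / temperature        # [B, B] similarities
    targets = torch.arange(image_emb.size(0))              # i-th image matches i-th text
    return 0.5 * (F.cross_entropy(logits, targets) +
                  F.cross_entropy(logits.t(), targets))

img = torch.randn(8, 256)
txt = torch.randn(8, 256)
print(float(itc_loss(img, txt)))
```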
+
+
+
+
+ + ☆ Virchow: A Million-Slide Digital Pathology Foundation Model + + +
+ Computational pathology uses artificial intelligence to enable precision +medicine and decision support systems through the analysis of whole slide +images. It has the potential to revolutionize the diagnosis and treatment of +cancer. However, a major challenge to this objective is that for many specific +computational pathology tasks the amount of data is inadequate for development. +To address this challenge, we created Virchow, a 632 million parameter deep +neural network foundation model for computational pathology. Using +self-supervised learning, Virchow is trained on 1.5 million hematoxylin and +eosin stained whole slide images from diverse tissue groups, which is orders of +magnitude more data than previous works. When evaluated on downstream tasks +including tile-level pan-cancer detection and subtyping and slide-level +biomarker prediction, Virchow outperforms state-of-the-art systems both on +internal datasets drawn from the same population as the pretraining data as +well as external public datasets. Virchow achieves 93% balanced accuracy for +pancancer tile classification, and AUCs of 0.983 for colon microsatellite +instability status prediction and 0.967 for breast CDH1 status prediction. The +gains in performance highlight the importance of pretraining on massive +pathology image datasets, suggesting pretraining on even larger datasets could +continue improving performance for many high-impact applications where limited +amounts of training data are available, such as drug outcome prediction. + +
+
+
+
+
+ + ☆ Variational Quantum Linear Solver enhanced Quantum Support Vector + Machine + + +
+ Quantum Support Vector Machines (QSVM) play a vital role in using quantum +resources for supervised machine learning tasks, such as classification. +However, current methods are strongly limited in terms of scalability on Noisy +Intermediate Scale Quantum (NISQ) devices. In this work, we propose a novel +approach called the Variational Quantum Linear Solver (VQLS) enhanced QSVM. +This is built upon our idea of utilizing the variational quantum linear solver +to solve the system of linear equations of a least-squares SVM on a NISQ +device. The implementation of our approach is evaluated by an extensive series +of numerical experiments with the Iris dataset, which consists of three +distinct iris plant species. Based on this, we explore the practicality and +effectiveness of our algorithm by constructing a classifier capable of +classification in a feature space ranging from one to seven dimensions. +Furthermore, by strategically exploiting both classical and quantum computing +for various subroutines of our algorithm, we effectively mitigate practical +challenges associated with the implementation. These include significant +improvement in the trainability of the variational ansatz and notable +reductions in run-time for cost calculations. Based on the numerical +experiments, our approach exhibits the capability of identifying a separating +hyperplane in an 8-dimensional feature space. Moreover, it consistently +demonstrates strong performance across various instances with the same dataset. + +
+
+
+
+
+ + ☆ PRE: Vision-Language Prompt Learning with Reparameterization Encoder + + +
+ Large pre-trained vision-language models such as CLIP have demonstrated great +potential in zero-shot transferability to downstream tasks. However, to attain +optimal performance, the manual selection of prompts is necessary to improve +alignment between the downstream image distribution and the textual class +descriptions. This manual prompt engineering is the major challenge for +deploying such models in practice since it requires domain expertise and is +extremely time-consuming. To avoid non-trivial prompt engineering, recent work +Context Optimization (CoOp) introduced the concept of prompt learning to the +vision domain using learnable textual tokens. While CoOp can achieve +substantial improvements over manual prompts, its learned context generalizes +poorly to wider unseen classes within the same dataset. In this work, we +present Prompt Learning with Reparameterization Encoder (PRE) - a simple and +efficient method that enhances the generalization ability of the learnable +prompt to unseen classes while maintaining the capacity to learn Base classes. +Instead of directly optimizing the prompts, PRE employs a prompt encoder to +reparameterize the input prompt embeddings, enhancing the exploration of +task-specific knowledge from few-shot samples. Experiments and extensive +ablation studies on 8 benchmarks demonstrate that our approach is an efficient +method for prompt learning. Specifically, PRE achieves a notable enhancement of +5.60% in average accuracy on New classes and 3% in Harmonic mean compared to +CoOp in the 16-shot setting, all within a reasonable training time. + +
+
+ comment: 8 pages excluding References and Appendix +
+
+
+
+
+ + ☆ Interpretability is in the Mind of the Beholder: A Causal Framework for + Human-interpretable Representation Learning + + +
+ Focus in Explainable AI is shifting from explanations defined in terms of +low-level elements, such as input features, to explanations encoded in terms of +interpretable concepts learned from data. How to reliably acquire such concepts +is, however, still fundamentally unclear. An agreed-upon notion of concept +interpretability is missing, with the result that concepts used by both +post-hoc explainers and concept-based neural networks are acquired through a +variety of mutually incompatible strategies. Critically, most of these neglect +the human side of the problem: a representation is understandable only insofar +as it can be understood by the human at the receiving end. The key challenge in +Human-interpretable Representation Learning (HRL) is how to model and +operationalize this human element. In this work, we propose a mathematical +framework for acquiring interpretable representations suitable for both +post-hoc explainers and concept-based neural networks. Our formalization of HRL +builds on recent advances in causal representation learning and explicitly +models a human stakeholder as an external observer. This allows us to derive a +principled notion of alignment between the machine representation and the +vocabulary of concepts understood by the human. In doing so, we link alignment +and interpretability through a simple and intuitive name transfer game, and +clarify the relationship between alignment and a well-known property of +representations, namely disentanglement. We also show that alignment is linked +to the issue of undesirable correlations among concepts, also known as concept +leakage, and to content-style separation, all through a general +information-theoretic reformulation of these properties. Our conceptualization +aims to bridge the gap between the human and algorithmic sides of +interpretability and establish a stepping stone for new research on +human-interpretable representations. + +
+
+
+
+
+ + ☆ Understanding Vector-Valued Neural Networks and Their Relationship with + Real and Hypercomplex-Valued Neural Networks + + +
+ Despite the many successful applications of deep learning models for +multidimensional signal and image processing, most traditional neural networks +process data represented by (multidimensional) arrays of real numbers. The +intercorrelation between feature channels is usually expected to be learned +from the training data, requiring numerous parameters and careful training. In +contrast, vector-valued neural networks are conceived to process arrays of +vectors and naturally consider the intercorrelation between feature channels. +Consequently, they usually have fewer parameters and often undergo more robust +training than traditional neural networks. This paper aims to present a broad +framework for vector-valued neural networks, referred to as V-nets. In this +context, hypercomplex-valued neural networks are regarded as vector-valued +models with additional algebraic properties. Furthermore, this paper explains +the relationship between vector-valued and traditional neural networks. +Precisely, a vector-valued neural network can be obtained by placing +restrictions on a real-valued model to consider the intercorrelation between +feature channels. Finally, we show how V-nets, including hypercomplex-valued +neural networks, can be implemented in current deep-learning libraries as +real-valued networks. + +
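As a concrete instance of running a vector-valued layer with purely real-valued operations, the sketch below realizes a complex-valued linear layer as one real block-matrix product. The block construction is standard and is shown only to illustrate the idea, not as the paper's exact V-net recipe.

```python
# Sketch: a complex-valued linear layer realized with real arithmetic only.
# For W = A + iB and x = u + iv, Wx has real part Au - Bv and imaginary part
# Bu + Av, i.e. one real matmul with a structured block matrix.
import numpy as np

rng = np.random.default_rng(0)
A, B = rng.normal(size=(4, 4)), rng.normal(size=(4, 4))
u, v = rng.normal(size=4), rng.normal(size=4)

# Reference computation with native complex numbers.
ref = (A + 1j * B) @ (u + 1j * v)

# Same computation as a single real-valued matrix-vector product.
W_real = np.block([[A, -B],
                   [B,  A]])
out = W_real @ np.concatenate([u, v])
real_part, imag_part = out[:4], out[4:]

print(np.allclose(real_part, ref.real), np.allclose(imag_part, ref.imag))
```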
+
+
+
+
+ + ☆ Market-GAN: Adding Control to Financial Market Data Generation with + Semantic Context + + +
+ Financial simulators play an important role in enhancing forecasting +accuracy, managing risks, and fostering strategic financial decision-making. +Despite the development of financial market simulation methodologies, existing +frameworks often struggle with adapting to specialized simulation contexts. We +pinpoint the challenges as i) current financial datasets do not contain context +labels; ii) current techniques are not designed to generate financial data with +context as control, which demands greater precision compared to other +modalities; iii) the inherent difficulties in generating context-aligned, +high-fidelity data given the non-stationary, noisy nature of financial data. To +address these challenges, our contributions are: i) we propose the Contextual +Market Dataset with market dynamics, stock ticker, and history state as +context, leveraging a market dynamics modeling method that combines linear +regression and Dynamic Time Warping clustering to extract market dynamics; ii) +we present Market-GAN, a novel architecture incorporating a Generative +Adversarial Network (GAN) for the controllable generation with context, an +autoencoder for learning low-dimension features, and supervisors for knowledge +transfer; iii) we introduce a two-stage training scheme to ensure that +Market-GAN captures the intrinsic market distribution with multiple objectives. +In the pretraining stage, with the use of the autoencoder and supervisors, we +prepare the generator with a better initialization for the adversarial training +stage. We propose a set of holistic evaluation metrics that consider alignment, +fidelity, data usability on downstream tasks, and market facts. We evaluate +Market-GAN with the Dow Jones Industrial Average data from 2000 to 2023 and +showcase superior performance in comparison to 4 state-of-the-art time-series +generative models. + +
+
+
+
+
+ + ☆ Causal Entropy and Information Gain for Measuring Causal Control ECAI 2023 + + +
+ Artificial intelligence models and methods commonly lack causal +interpretability. Despite the advancements in interpretable machine learning +(IML) methods, they frequently assign importance to features which lack causal +influence on the outcome variable. Selecting causally relevant features among +those identified as relevant by these methods, or even before model training, +would offer a solution. Feature selection methods utilizing information +theoretical quantities have been successful in identifying statistically +relevant features. However, the information theoretical quantities they are +based on do not incorporate causality, rendering them unsuitable for such +scenarios. To address this challenge, this article proposes information +theoretical quantities that incorporate the causal structure of the system, +which can be used to evaluate causal importance of features for some given +outcome variable. Specifically, we introduce causal versions of entropy and +mutual information, termed causal entropy and causal information gain, which +are designed to assess how much control a feature provides over the outcome +variable. These newly defined quantities capture changes in the entropy of a +variable resulting from interventions on other variables. Fundamental results +connecting these quantities to the existence of causal effects are derived. The +use of causal information gain in feature selection is demonstrated, +highlighting its superiority over standard mutual information in revealing +which features provide control over a chosen outcome variable. Our +investigation paves the way for the development of methods with improved +interpretability in domains involving causation. + +
+
+ comment: 16 pages. Accepted at the third XI-ML workshop of ECAI 2023. To + appear in the Springer CCIS book series +
+
+
+
+
+ + ☆ Tree of Uncertain Thoughts Reasoning for Large Language Models + + +
+ While the recently introduced Tree of Thoughts (ToT) has heralded +advancements in allowing Large Language Models (LLMs) to reason through +foresight and backtracking for global decision-making, it has overlooked the +inherent local uncertainties in intermediate decision points or "thoughts". +These local uncertainties, intrinsic to LLMs given their potential for diverse +responses, remain a significant concern in the reasoning process. Addressing +this pivotal gap, we introduce the Tree of Uncertain Thoughts (TouT) - a +reasoning framework tailored for LLMs. Our TouT effectively leverages Monte +Carlo Dropout to quantify uncertainty scores associated with LLMs' diverse +local responses at these intermediate steps. By marrying this local uncertainty +quantification with global search algorithms, TouT enhances the model's +precision in response generation. We substantiate our approach with rigorous +experiments on two demanding planning tasks: Game of 24 and Mini Crosswords. +The empirical evidence underscores TouT's superiority over both ToT and +chain-of-thought prompting methods. + +
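A hedged sketch of the local uncertainty step: with dropout kept active at inference time, repeatedly scoring the same intermediate thought yields a mean value and a spread that the search can penalize. The scorer network and the penalty weight below are illustrative assumptions.

```python
# Sketch: Monte Carlo Dropout for scoring intermediate "thoughts".
# Keeping dropout active at evaluation time and scoring a thought several
# times yields a mean value and an uncertainty (std) for the search to use.
import torch

scorer = torch.nn.Sequential(
    torch.nn.Linear(16, 32), torch.nn.ReLU(),
    torch.nn.Dropout(p=0.2), torch.nn.Linear(32, 1),
)
scorer.train()                      # keep dropout stochastic at evaluation time

def uncertain_score(thought_embedding, n_samples=20, penalty=1.0):
    samples = torch.stack([scorer(thought_embedding) for _ in range(n_samples)])
    mean, std = samples.mean(), samples.std()
    return mean - penalty * std     # prefer high-value, low-uncertainty thoughts

thought = torch.randn(16)
print(float(uncertain_score(thought)))
```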
+
+
+
+
+ + ☆ A DenseNet-based method for decoding auditory spatial attention with EEG + + +
+ Auditory spatial attention detection (ASAD) aims to decode the attended +spatial location with EEG in a multiple-speaker setting. ASAD methods are +inspired by the brain lateralization of cortical neural responses during the +processing of auditory spatial attention, and show promising performance for +the task of auditory attention decoding (AAD) with neural recordings. In +previous ASAD methods, the spatial distribution of EEG electrodes is not fully +exploited, which may limit the performance of these methods. In the present +work, the original EEG channels are mapped onto a two-dimensional (2D) spatial +topological map, so that the EEG data forms a three-dimensional (3D) +arrangement containing spatial-temporal information. A 3D deep convolutional +neural network (DenseNet-3D) is then used to extract temporal and spatial +features of the neural representation for the attended locations. The results +show that the proposed method achieves higher decoding accuracy than the +state-of-the-art (SOTA) method (94.4% compared to XANet's 90.6%) with a +1-second decision window for the widely used KULeuven (KUL) dataset, and the +code to implement our work is available on GitHub: +https://github.com/xuxiran/ASAD_DenseNet + +
+
+
+
+
+ + ☆ deepFDEnet: A Novel Neural Network Architecture for Solving Fractional + Differential Equations + + +
+ The primary goal of this research is to propose a novel architecture for a +deep neural network that can solve fractional differential equations +accurately. A Gaussian integration rule and a $L_1$ discretization technique +are used in the proposed design. In each equation, a deep neural network is +used to approximate the unknown function. Three forms of fractional +differential equations have been examined to highlight the method's +versatility: a fractional ordinary differential equation, a fractional order +integrodifferential equation, and a fractional order partial differential +equation. The results show that the proposed architecture solves different +forms of fractional differential equations with excellent precision. + +
+
+
+
+
+ + ☆ Benchmarking machine learning models for quantum state classification + + +
+ Quantum computing is a growing field where the information is processed by +two-level quantum states known as qubits. Current physical realizations of +qubits require careful calibration, composed of different experiments, due to +noise and decoherence phenomena. Among the different characterization +experiments, a crucial step is to develop a model to classify the measured +state by discriminating the ground state from the excited state. In these +proceedings, we benchmark multiple classification techniques applied to real +quantum devices. + +
+
+ comment: 9 pages, 3 figures, CHEP2023 proceedings +
+
+
+
+
+ + ☆ Goal Space Abstraction in Hierarchical Reinforcement Learning via + Set-Based Reachability Analysis + + +
+ Open-ended learning benefits immensely from the use of symbolic methods for +goal representation as they offer ways to structure knowledge for efficient and +transferable learning. However, the existing Hierarchical Reinforcement +Learning (HRL) approaches relying on symbolic reasoning are often limited as +they require a manual goal representation. The challenge in autonomously +discovering a symbolic goal representation is that it must preserve critical +information, such as the environment dynamics. In this paper, we propose a +developmental mechanism for goal discovery via an emergent representation that +abstracts (i.e., groups together) sets of environment states that have similar +roles in the task. We introduce a Feudal HRL algorithm that concurrently learns +both the goal representation and a hierarchical policy. The algorithm uses +symbolic reachability analysis for neural networks to approximate the +transition relation among sets of states and to refine the goal representation. +We evaluate our approach on complex navigation tasks, showing that the learned +representation is interpretable and transferable and results in data-efficient +learning. + +
+
+
+
+
+ + ☆ Physics-constrained robust learning of open-form PDEs from limited and + noisy data + + +
+ Unveiling the underlying governing equations of nonlinear dynamic systems +remains a significant challenge, especially when encountering noisy +observations and no prior knowledge available. This study proposes R-DISCOVER, +a framework designed to robustly uncover open-form partial differential +equations (PDEs) from limited and noisy data. The framework operates through +two alternating update processes: discovering and embedding. The discovering +phase employs symbolic representation and a reinforcement learning (RL)-guided +hybrid PDE generator to efficiently produce diverse open-form PDEs with tree +structures. A neural network-based predictive model fits the system response +and serves as the reward evaluator for the generated PDEs. PDEs with superior +fits are utilized to iteratively optimize the generator via the RL method and +the best-performing PDE is selected by a parameter-free stability metric. The +embedding phase integrates the initially identified PDE from the discovering +process as a physical constraint into the predictive model for robust training. +The traversal of PDE trees automates the construction of the computational +graph and the embedding process without human intervention. Numerical +experiments demonstrate our framework's capability to uncover governing +equations from nonlinear dynamic systems with limited and highly noisy data and +outperform other physics-informed neural network-based discovery methods. This +work opens new potential for exploring real-world systems with limited +understanding. + +
+
+
+
+
+ + ☆ Federated Dataset Dictionary Learning for Multi-Source Domain Adaptation + + +
+ In this article, we propose an approach for federated domain adaptation, a +setting where distributional shift exists among clients and some have unlabeled +data. The proposed framework, FedDaDiL, tackles the resulting challenge through +dictionary learning of empirical distributions. In our setting, clients' +distributions represent particular domains, and FedDaDiL collectively trains a +federated dictionary of empirical distributions. In particular, we build upon +the Dataset Dictionary Learning framework by designing collaborative +communication protocols and aggregation operations. The chosen protocols keep +clients' data private, thus enhancing overall privacy compared to its +centralized counterpart. We empirically demonstrate that our approach +successfully generates labeled data on the target domain with extensive +experiments on (i) Caltech-Office, (ii) TEP, and (iii) CWRU benchmarks. +Furthermore, we compare our method to its centralized counterpart and other +benchmarks in federated domain adaptation. + +
+
+ comment: 7 pages,2 figures +
+
+
+
+
+ + ☆ Multi-Source Domain Adaptation meets Dataset Distillation through + Dataset Dictionary Learning + + +
+ In this paper, we consider the intersection of two problems in machine +learning: Multi-Source Domain Adaptation (MSDA) and Dataset Distillation (DD). +On the one hand, the first considers adapting multiple heterogeneous labeled +source domains to an unlabeled target domain. On the other hand, the second +attacks the problem of synthesizing a small summary containing all the +information about the datasets. We thus consider a new problem called MSDA-DD. +To solve it, we adapt previous works in the MSDA literature, such as +Wasserstein Barycenter Transport and Dataset Dictionary Learning, as well as +the DD method Distribution Matching. We thoroughly experiment with this novel +problem on four benchmarks (Caltech-Office 10, Tennessee-Eastman Process, +Continuous Stirred Tank Reactor, and Case Western Reserve University), where we +show that, even with as few as one sample per class, one achieves +state-of-the-art adaptation performance. + +
+
+ comment: 7 pages,4 figures +
+
+
+
+
+ + ☆ Dataset Size Dependence of Rate-Distortion Curve and Threshold of + Posterior Collapse in Linear VAE + + +
+ In the Variational Autoencoder (VAE), the variational posterior often aligns +closely with the prior, which is known as posterior collapse and hinders the +quality of representation learning. To mitigate this problem, an adjustable +hyperparameter beta has been introduced in the VAE. This paper presents a +closed-form expression to assess the relationship between the beta in VAE, the +dataset size, the posterior collapse, and the rate-distortion curve by +analyzing a minimal VAE in a high-dimensional limit. These results clarify that +a long plateau in the generalization error emerges with a relatively larger +beta. As the beta increases, the length of the plateau extends and then becomes +infinite beyond a certain beta threshold. This implies that the choice of beta, +unlike the usual regularization parameters, can induce posterior collapse +regardless of the dataset size. Thus, beta is a risky parameter that requires +careful tuning. Furthermore, considering the dataset-size dependence on the +rate-distortion curve, a relatively large dataset is required to obtain a +rate-distortion curve with high rates. Extensive numerical experiments support +our analysis. + +
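For reference, the beta-weighted objective underlying this analysis is the standard beta-VAE loss, reconstruction plus beta times the Gaussian posterior KL; a minimal sketch follows (the paper's linear-VAE specifics and high-dimensional limit are not reproduced).

```python
# Sketch: beta-VAE loss = reconstruction + beta * KL(q(z|x) || N(0, I)).
# As beta grows, the KL term pushes the posterior toward the prior, which is
# the posterior-collapse regime analyzed in the paper's linear VAE.
import torch

def beta_vae_loss(x, x_recon, mu, logvar, beta):
    recon = ((x - x_recon) ** 2).sum(dim=-1)                     # Gaussian decoder, fixed variance
    kl = 0.5 * (mu.pow(2) + logvar.exp() - 1.0 - logvar).sum(dim=-1)
    return (recon + beta * kl).mean()

x = torch.randn(32, 10)
x_recon = torch.randn(32, 10)
mu, logvar = torch.randn(32, 4), torch.randn(32, 4)
print(float(beta_vae_loss(x, x_recon, mu, logvar, beta=4.0)))
```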
+
+ comment: 16 pages, 3 figures +
+
+
+
+
+ + ☆ Feature Engineering in Learning-to-Rank for Community Question Answering + Task + + +
+ Community question answering (CQA) forums are Internet-based platforms where +users ask questions about a topic and other expert users try to provide +solutions. Many CQA forums such as Quora, Stackoverflow, Yahoo!Answer, +StackExchange exist with a lot of user-generated data. These data are leveraged +in automated CQA ranking systems where similar questions (and answers) are +presented in response to the query of the user. In this work, we empirically +investigate a few aspects of this domain. Firstly, in addition to traditional +features like TF-IDF, BM25 etc., we introduce a BERT-based feature that +captures the semantic similarity between the question and answer. Secondly, +most of the existing research works have focused on features extracted only +from the question part; features extracted from answers have not been explored +extensively. We combine both types of features in a linear fashion. Thirdly, +using our proposed concepts, we conduct an empirical investigation with +different rank-learning algorithms, some of which have not been used so far in +the CQA domain. On three standard CQA datasets, our proposed framework achieves +state-of-the-art performance. We also analyze the importance of the features we +use in our investigation. This work is expected to guide practitioners in +selecting a better set of features for the CQA retrieval task. + +
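A hedged sketch of combining a lexical feature with a BERT-style semantic similarity feature before feeding a ranker; the embedding model, the toy candidates, and the equal mixing weights are illustrative assumptions (this assumes the sentence-transformers package is available).

```python
# Sketch: per-candidate features for a learning-to-rank model, mixing a lexical
# score (TF-IDF cosine) with a semantic score from a sentence-embedding model.
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sentence_transformers import SentenceTransformer, util

query = "How do I reverse a list in Python?"
candidates = ["Reversing a list using slicing", "Sorting a dictionary by value"]

vec = TfidfVectorizer().fit([query] + candidates)
lexical = cosine_similarity(vec.transform([query]), vec.transform(candidates))[0]

encoder = SentenceTransformer("all-MiniLM-L6-v2")
semantic = util.cos_sim(encoder.encode(query), encoder.encode(candidates))[0]

for cand, lex, sem in zip(candidates, lexical, semantic):
    score = 0.5 * lex + 0.5 * float(sem)        # linear combination fed to the ranker
    print(f"{score:.3f}  {cand}")
```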
+
+ comment: 20 pages +
+
+
+
+
+ + ☆ Learning Quasi-Static 3D Models of Markerless Deformable Linear Objects + for Bimanual Robotic Manipulation + + +
+ The robotic manipulation of Deformable Linear Objects (DLOs) is a vital and +challenging task that is important in many practical applications. Classical +model-based approaches to this problem require an accurate model to capture how +robot motions affect the deformation of the DLO. Nowadays, data-driven models +offer the best tradeoff between quality and computation time. This paper +analyzes several learning-based 3D models of the DLO and proposes a new one +based on the Transformer architecture that achieves superior accuracy, even on +the DLOs of different lengths, thanks to the proposed scaling method. Moreover, +we introduce a data augmentation technique, which improves the prediction +performance of almost all considered DLO data-driven models. Thanks to this +technique, even a simple Multilayer Perceptron (MLP) achieves close to +state-of-the-art performance while being significantly faster to evaluate. In +the experiments, we compare the performance of the learning-based 3D models of +the DLO on several challenging datasets quantitatively and demonstrate their +applicability in the task of shaping a DLO. + +
+
+ comment: Under review for IEEE Robotics and Automation Letters +
+
+
+
+
+ + ☆ Turning Dross Into Gold Loss: is BERT4Rec really better than SASRec? + + +
+ Recently, sequential recommendation and the next-item prediction task have +become increasingly popular in the field of recommender systems. Currently, two +state-of-the-art baselines are the Transformer-based models SASRec and +BERT4Rec. Over the past few years, there have been quite a few publications +comparing these two algorithms and proposing new state-of-the-art models. In +most of the publications, BERT4Rec achieves better performance than SASRec. But +BERT4Rec uses cross-entropy over softmax for all items, while SASRec uses +negative sampling and calculates binary cross-entropy loss for one positive and +one negative item. In our work, we show that if both models are trained with +the same loss, which is used by BERT4Rec, then SASRec will significantly +outperform BERT4Rec both in terms of quality and training speed. In addition, +we show that SASRec could be effectively trained with negative sampling and +still outperform BERT4Rec, but the number of negative examples should be much +larger than one. + +
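For clarity, a minimal sketch of the two objectives being compared, full softmax cross-entropy over the item catalogue versus binary cross-entropy with one sampled negative; shapes and the sampling scheme are simplified.

```python
# Sketch: the two losses compared in the paper, for one batch of positions.
# Full softmax cross-entropy scores every item in the catalogue, while the
# original SASRec setup uses binary cross-entropy on one positive and one
# sampled negative item.
import torch
import torch.nn.functional as F

n_items, dim = 1000, 64
hidden = torch.randn(8, dim)                      # sequence-model outputs for a batch
item_emb = torch.randn(n_items, dim)              # item embedding table
positives = torch.randint(0, n_items, (8,))

# (a) Cross-entropy over all items (BERT4Rec-style loss).
logits_all = hidden @ item_emb.t()                # [8, n_items]
loss_ce = F.cross_entropy(logits_all, positives)

# (b) Binary cross-entropy with a single sampled negative (original SASRec).
negatives = torch.randint(0, n_items, (8,))
pos_logit = (hidden * item_emb[positives]).sum(-1)
neg_logit = (hidden * item_emb[negatives]).sum(-1)
loss_bce = (F.binary_cross_entropy_with_logits(pos_logit, torch.ones(8)) +
            F.binary_cross_entropy_with_logits(neg_logit, torch.zeros(8)))

print(float(loss_ce), float(loss_bce))
```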
+
+
+
+
+ + ☆ Detecting Misinformation with LLM-Predicted Credibility Signals and Weak + Supervision + + +
+ Credibility signals represent a wide range of heuristics that are typically +used by journalists and fact-checkers to assess the veracity of online content. +Automating the task of credibility signal extraction, however, is very +challenging as it requires high-accuracy signal-specific extractors to be +trained, while there are currently no sufficiently large datasets annotated +with all credibility signals. This paper investigates whether large language +models (LLMs) can be prompted effectively with a set of 18 credibility signals +to produce weak labels for each signal. We then aggregate these potentially +noisy labels using weak supervision in order to predict content veracity. We +demonstrate that our approach, which combines zero-shot LLM credibility signal +labeling and weak supervision, outperforms state-of-the-art classifiers on two +misinformation datasets without using any ground-truth labels for training. We +also analyse the contribution of the individual credibility signals towards +predicting content veracity, which provides new valuable insights into their +role in misinformation detection. + +
+
+
+
+
+ + ☆ Statistically Valid Variable Importance Assessment through Conditional + Permutations + + +
+ Variable importance assessment has become a crucial step in machine-learning +applications when using complex learners, such as deep neural networks, on +large-scale data. Removal-based importance assessment is currently the +reference approach, particularly when statistical guarantees are sought to +justify variable inclusion. It is often implemented with variable permutation +schemes. On the flip side, these approaches risk misidentifying unimportant +variables as important in the presence of correlations among covariates. Here +we develop a systematic approach for studying Conditional Permutation +Importance (CPI) that is model agnostic and computationally lean, as well as +reusable benchmarks of state-of-the-art variable importance estimators. We show +theoretically and empirically that $\textit{CPI}$ overcomes the limitations of +standard permutation importance by providing accurate type-I error control. +When used with a deep neural network, $\textit{CPI}$ consistently showed top +accuracy across benchmarks. An empirical benchmark on real-world data analysis +in a large-scale medical dataset showed that $\textit{CPI}$ provides a more +parsimonious selection of statistically significant variables. Our results +suggest that $\textit{CPI}$ can be readily used as drop-in replacement for +permutation-based methods. + +
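A hedged sketch of the conditional permutation idea: only the part of a feature that cannot be predicted from the other covariates is shuffled, and the resulting loss increase is the importance. The linear regressors, single split, and metric below are illustrative; the paper's estimator and type-I error guarantees are more involved.

```python
# Sketch: Conditional Permutation Importance for one feature.  Instead of
# permuting X_j directly, fit X_j from the other covariates, permute only the
# residual, and measure how much the predictive loss degrades.
import numpy as np
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error

rng = np.random.default_rng(0)
n = 2000
X = rng.normal(size=(n, 3))
X[:, 1] = 0.9 * X[:, 0] + 0.1 * rng.normal(size=n)     # X1 strongly correlated with X0
y = 2.0 * X[:, 0] + rng.normal(size=n)                  # only X0 matters

model = LinearRegression().fit(X, y)
base = mean_squared_error(y, model.predict(X))

def cpi(j):
    others = np.delete(X, j, axis=1)
    x_j_hat = LinearRegression().fit(others, X[:, j]).predict(others)
    residual = X[:, j] - x_j_hat
    X_perm = X.copy()
    X_perm[:, j] = x_j_hat + rng.permutation(residual)   # shuffle only the residual
    return mean_squared_error(y, model.predict(X_perm)) - base

print([round(cpi(j), 3) for j in range(3)])   # X1 stays near zero despite the correlation
```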
+
+
+
+
+ + ☆ Structure-Preserving Transformers for Sequences of SPD Matrices ICASSP 2024 + + +
+ In recent years, Transformer-based auto-attention mechanisms have been +successfully applied to the analysis of a variety of context-reliant data +types, from texts to images and beyond, including data from non-Euclidean +geometries. In this paper, we present such a mechanism, designed to classify +sequences of Symmetric Positive Definite matrices while preserving their +Riemannian geometry throughout the analysis. We apply our method to automatic +sleep staging on timeseries of EEG-derived covariance matrices from a standard +dataset, obtaining high levels of stage-wise performance. + +
+
+ comment: Submitted to the ICASSP 2024 Conference +
+
+
+
+
+ + ☆ Equivariant Data Augmentation for Generalization in Offline + Reinforcement Learning + + +
+ We present a novel approach to address the challenge of generalization in +offline reinforcement learning (RL), where the agent learns from a fixed +dataset without any additional interaction with the environment. Specifically, +we aim to improve the agent's ability to generalize to out-of-distribution +goals. To achieve this, we propose to learn a dynamics model and check if it is +equivariant with respect to a fixed type of transformation, namely translations +in the state space. We then use an entropy regularizer to increase the +equivariant set and augment the dataset with the resulting transformed samples. +Finally, we learn a new policy offline based on the augmented dataset, with an +off-the-shelf offline RL algorithm. Our experimental results demonstrate that +our approach can greatly improve the test performance of the policy on the +considered environments. + +
+
+
+
+
+ + ☆ Naturalistic Robot Arm Trajectory Generation via Representation Learning + + +
+ The integration of manipulator robots in household environments suggests a +need for more predictable and human-like robot motion. This holds especially +true for wheelchair-mounted assistive robots that can support the independence +of people with paralysis. One method of generating naturalistic motion +trajectories is via the imitation of human demonstrators. This paper explores a +self-supervised imitation learning method using an autoregressive +spatio-temporal graph neural network for an assistive drinking task. We address +learning from diverse human motion trajectory data that were captured via +wearable IMU sensors on a human arm as the action-free task demonstrations. +Observed arm motion data from several participants is used to generate natural +and functional drinking motion trajectories for a UR5e robot arm. + +
+
+ comment: 4 pages, 3 figures +
+
+
+
+
+ + ☆ Proximal Bellman mappings for reinforcement learning and their + application to robust adaptive filtering + + +
+ This paper aims at the algorithmic/theoretical core of reinforcement learning +(RL) by introducing the novel class of proximal Bellman mappings. These +mappings are defined in reproducing kernel Hilbert spaces (RKHSs) to benefit +from the rich approximation properties and inner product of RKHSs. They are +shown to belong to the powerful Hilbertian family of (firmly) nonexpansive +mappings, regardless of the values of their discount factors, and to possess +ample degrees of design freedom to even reproduce attributes of the classical +Bellman mappings and to pave the way for novel RL designs. An approximate +policy-iteration scheme is built on the proposed class of mappings to solve the +problem of selecting online, at every time instance, the "optimal" exponent $p$ +in a $p$-norm loss to combat outliers in linear adaptive filtering, without +training data and any knowledge on the statistical properties of the outliers. +Numerical tests on synthetic data showcase the superior performance of the +proposed framework over several non-RL and kernel-based RL schemes. + +
+
+ comment: arXiv admin note: text overlap with arXiv:2210.11755 +
+
+
+
+
+ + ☆ VerilogEval: Evaluating Large Language Models for Verilog Code + Generation + + +
+ The increasing popularity of large language models (LLMs) has paved the way +for their application in diverse domains. This paper proposes a benchmarking +framework tailored specifically for evaluating LLM performance in the context +of Verilog code generation for hardware design and verification. We present a +comprehensive evaluation dataset consisting of 156 problems from the Verilog +instructional website HDLBits. The evaluation set consists of a diverse set of +Verilog code generation tasks, ranging from simple combinational circuits to +complex finite state machines. The Verilog code completions can be +automatically tested for functional correctness by comparing the transient +simulation outputs of the generated design with a golden solution. We also +demonstrate that the Verilog code generation capability of pretrained language +models could be improved with supervised fine-tuning by bootstrapping with LLM +generated synthetic problem-code pairs. + +
+
+ comment: ICCAD 2023 Invited Paper +
+
+
+
+
+ + ☆ Adaptive approximation of monotone functions + + +
+ We study the classical problem of approximating a non-decreasing function $f:
+\mathcal{X} \to \mathcal{Y}$ in $L^p(\mu)$ norm by sequentially querying its
+values, for known compact real intervals $\mathcal{X}$, $\mathcal{Y}$ and a
+known probability measure $\mu$ on $\mathcal{X}$. For any function $f$ we
+characterize the minimum number of evaluations of $f$ that algorithms need to
+guarantee an approximation $\hat{f}$ with an $L^p(\mu)$ error below $\epsilon$
+after stopping. Unlike worst-case results that hold uniformly over all $f$, our
+complexity measure is dependent on each specific function $f$. To address this
+problem, we introduce GreedyBox, a generalization of an algorithm originally
+proposed by Novak (1992) for numerical integration. We prove that GreedyBox
+achieves an optimal sample complexity for any function $f$, up to logarithmic
+factors. Additionally, we uncover results regarding piecewise-smooth functions.
+Perhaps as expected, the $L^p(\mu)$ error of GreedyBox decreases much faster
+for piecewise-$C^2$ functions than predicted by the algorithm (without any
+knowledge of the smoothness of $f$). A simple modification even achieves
+optimal minimax approximation rates for such functions, which we compute
+explicitly. In particular, our findings highlight multiple performance gaps
+between adaptive and non-adaptive algorithms, smooth and piecewise-smooth
+functions, as well as monotone or non-monotone functions. Finally, we provide
+numerical experiments to support our theoretical results.
+
+
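+ A toy GreedyBox-style loop, assuming a non-decreasing f on [a, b], the uniform
+measure, and an L^1 criterion: always bisect the box whose trivial error bound
+(height times width) is largest. This is only a sketch of the greedy idea, not
+the algorithm analyzed in the paper.
+
+    import heapq
+
+    def greedy_box_approx(f, budget, a=0.0, b=1.0):
+        # Max-heap keyed on the box error bound (f(hi) - f(lo)) * (hi - lo).
+        fa, fb = f(a), f(b)
+        heap = [(-(fb - fa) * (b - a), a, b, fa, fb)]
+        queries = {a: fa, b: fb}
+        while len(queries) < budget and heap:
+            _, lo, hi, flo, fhi = heapq.heappop(heap)
+            mid = 0.5 * (lo + hi)
+            fmid = f(mid)
+            queries[mid] = fmid
+            heapq.heappush(heap, (-(fmid - flo) * (mid - lo), lo, mid, flo, fmid))
+            heapq.heappush(heap, (-(fhi - fmid) * (hi - mid), mid, hi, fmid, fhi))
+        return sorted(queries.items())  # breakpoints and queried values
+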
+
+
+
+
+ + ☆ Learning Beyond Similarities: Incorporating Dissimilarities between + Positive Pairs in Self-Supervised Time Series Learning + + +
+ By identifying similarities between successive inputs, Self-Supervised +Learning (SSL) methods for time series analysis have demonstrated their +effectiveness in encoding the inherent static characteristics of temporal data. +However, an exclusive emphasis on similarities might result in representations +that overlook the dynamic attributes critical for modeling cardiovascular +diseases within a confined subject cohort. Introducing Distilled Encoding +Beyond Similarities (DEBS), this paper pioneers an SSL approach that transcends +mere similarities by integrating dissimilarities among positive pairs. The +framework is applied to electrocardiogram (ECG) signals, leading to a notable +enhancement of +10\% in the detection accuracy of Atrial Fibrillation (AFib) +across diverse subjects. DEBS underscores the potential of attaining a more +refined representation by encoding the dynamic characteristics of time series +data, tapping into dissimilarities during the optimization process. Broadly, +the strategy delineated in this study holds the promise of unearthing novel +avenues for advancing SSL methodologies tailored to temporal data. + +
+
+
+
+
+ + ☆ Massively-Parallel Heat Map Sorting and Applications To Explainable + Clustering + + +
+ Given a set of points labeled with $k$ labels, we introduce the heat map
+sorting problem as reordering and merging the points and dimensions while
+preserving the clusters (labels). A cluster is preserved if it remains
+connected, i.e., if it is not split into several clusters and no two clusters
+are merged.
+ We prove the problem is NP-hard and we give a fixed-parameter algorithm with
+a constant number of rounds in the massively parallel computation model, where
+each machine has sublinear memory and the total memory of the machines is
+linear. We give an approximation algorithm for an NP-hard special case of the
+problem. We empirically compare our algorithm with k-means and density-based
+clustering (DBSCAN) using a dimensionality reduction via locality-sensitive
+hashing on several directed and undirected graphs of email and computer
+networks.
+
+
+
+
+
+
+ + ☆ Improved Auto-Encoding using Deterministic Projected Belief Networks + + +
+ In this paper, we exploit the unique properties of a deterministic projected
+belief network (D-PBN) to take full advantage of trainable compound activation
+functions (TCAs). A D-PBN is a type of auto-encoder that operates by "backing
+up" through a feed-forward neural network. TCAs are activation functions with
+complex monotonic-increasing shapes that change the distribution of the data so
+that the linear transformation that follows is more effective. Because a D-PBN
+operates by "backing up", the TCAs are inverted in the reconstruction process,
+restoring the original distribution of the data, thus taking advantage of a
+given TCA in both analysis and reconstruction. We show that a D-PBN
+auto-encoder with TCAs can significantly outperform standard auto-encoders,
+including variational auto-encoders.
+
+
+
+
+
+
+ + ☆ Direct Text to Speech Translation System using Acoustic Units + + +
+ This paper proposes a direct text to speech translation system using discrete +acoustic units. This framework employs text in different source languages as +input to generate speech in the target language without the need for text +transcriptions in this language. Motivated by the success of acoustic units in +previous works for direct speech to speech translation systems, we use the same +pipeline to extract the acoustic units using a speech encoder combined with a +clustering algorithm. Once units are obtained, an encoder-decoder architecture +is trained to predict them. Then a vocoder generates speech from units. Our +approach for direct text to speech translation was tested on the new CVSS +corpus with two different text mBART models employed as initialisation. The +systems presented report competitive performance for most of the language pairs +evaluated. Besides, results show a remarkable improvement when initialising our +proposed architecture with a model pre-trained with more languages. + +
+
+ comment: 5 pages, 4 figures +
+
+
+
+
+ + ☆ Detecting Unknown Attacks in IoT Environments: An Open Set Classifier + for Enhanced Network Intrusion Detection + + +
+ The widespread integration of Internet of Things (IoT) devices across all +facets of life has ushered in an era of interconnectedness, creating new +avenues for cybersecurity challenges and underscoring the need for robust +intrusion detection systems. However, traditional security systems are designed +with a closed-world perspective and often face challenges in dealing with the +ever-evolving threat landscape, where new and unfamiliar attacks are constantly +emerging. In this paper, we introduce a framework aimed at mitigating the open +set recognition (OSR) problem in the realm of Network Intrusion Detection +Systems (NIDS) tailored for IoT environments. Our framework capitalizes on +image-based representations of packet-level data, extracting spatial and +temporal patterns from network traffic. Additionally, we integrate stacking and +sub-clustering techniques, enabling the identification of unknown attacks by +effectively modeling the complex and diverse nature of benign behavior. The +empirical results prominently underscore the framework's efficacy, boasting an +impressive 88\% detection rate for previously unseen attacks when compared +against existing approaches and recent advancements. Future work will perform +extensive experimentation across various openness levels and attack scenarios, +further strengthening the adaptability and performance of our proposed solution +in safeguarding IoT environments. + +
+
+ comment: 6 Pages, 5 figures +
+
+
+
+
+ + ☆ SC-MAD: Mixtures of Higher-order Networks for Data Augmentation + + +
+ The myriad complex systems with multiway interactions motivate the extension +of graph-based pairwise connections to higher-order relations. In particular, +the simplicial complex has inspired generalizations of graph neural networks +(GNNs) to simplicial complex-based models. Learning on such systems requires +large amounts of data, which can be expensive or impossible to obtain. We +propose data augmentation of simplicial complexes through both linear and +nonlinear mixup mechanisms that return mixtures of existing labeled samples. In +addition to traditional pairwise mixup, we present a convex clustering mixup +approach for a data-driven relationship among several simplicial complexes. We +theoretically demonstrate that the resultant synthetic simplicial complexes +interpolate among existing data with respect to homomorphism densities. Our +method is demonstrated on both synthetic and real-world datasets for simplicial +complex classification. + +
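+ For reference, the standard pairwise mixup used as the baseline mechanism is
+sketched below on aligned feature/label arrays; the convex-clustering mixup
+over several simplicial complexes is the paper's addition and is not shown.
+
+    import numpy as np
+
+    def pairwise_mixup(x1, y1, x2, y2, alpha=0.2):
+        # Draw a mixing weight from Beta(alpha, alpha) and interpolate linearly.
+        lam = np.random.beta(alpha, alpha)
+        return lam * x1 + (1 - lam) * x2, lam * y1 + (1 - lam) * y2
+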
+
+ comment: 5 pages, 1 figure, 1 table +
+
+
+
+
+ + ☆ Is Solving Graph Neural Tangent Kernel Equivalent to Training Graph + Neural Network? + + +
+ A rising trend in theoretical deep learning is to understand why deep
+learning works through the Neural Tangent Kernel (NTK) [jgh18], a kernel method
+that is equivalent to using gradient descent to train a multi-layer
+infinitely-wide neural network. NTK is a major step forward in theoretical
+deep learning because it allows researchers to use traditional mathematical
+tools to analyze properties of deep neural networks and to explain various
+neural network techniques from a theoretical view. A natural extension of NTK
+to graph learning is the \textit{Graph Neural Tangent Kernel (GNTK)}, and
+researchers have already provided a GNTK formulation for graph-level regression
+and shown empirically that this kernel method can achieve accuracy similar to
+GNNs on various bioinformatics datasets [dhs+19]. The remaining question now is
+whether solving GNTK regression is equivalent to training an infinitely-wide
+multi-layer GNN using gradient descent. In this paper, we provide three new
+theoretical results. First, we formally prove this equivalence for graph-level
+regression. Second, we present the first GNTK formulation for node-level
+regression. Finally, we prove the equivalence for node-level regression.
+
+
+
+
+
+
+ + ☆ TensorFlow Chaotic Prediction and Blow Up + + +
+ Predicting the dynamics of chaotic systems is one of the most challenging
+tasks for neural networks, and machine learning in general. Here we aim to
+predict the spatiotemporal chaotic dynamics of a high-dimensional non-linear
+system. In our attempt we use the TensorFlow library, representing the state of
+the art for deep neural network training and prediction. While our results are
+encouraging, and show that the dynamics of the considered system can be
+predicted for a short time, we also indirectly discovered an unexpected and
+undesirable behavior of the TensorFlow library. More specifically, the
+longer-term prediction of the system's chaotic behavior quickly deteriorates
+and blows up due to the nondeterministic behavior of the TensorFlow library.
+Here we provide numerical evidence of the short-time prediction ability, and of
+the longer-term predictability blow-up.
+
+
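+ When reproducing such long-horizon experiments, one way to rule out
+run-to-run nondeterminism in TensorFlow (2.9 or later) is to seed all random
+generators and force deterministic kernels; this typically costs some speed and
+is shown only as a hedge, not as the authors' setup.
+
+    import tensorflow as tf
+
+    tf.keras.utils.set_random_seed(42)               # seeds Python, NumPy and TF
+    tf.config.experimental.enable_op_determinism()   # forces deterministic kernels
+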
+
+ comment: 10 pages, 3 figures +
+
+
+
+
+ + ☆ A Fast Optimization View: Reformulating Single Layer Attention in LLM + Based on Tensor and SVM Trick, and Solving It in Matrix Multiplication Time + + +
+ Large language models (LLMs) have played a pivotal role in revolutionizing
+various facets of our daily existence. Solving attention regression is a
+fundamental task in optimizing LLMs. In this work, we focus on giving a
+provable guarantee for the one-layer attention network objective function
+$L(X,Y) = \sum_{j_0 = 1}^n \sum_{i_0 = 1}^d ( \langle \langle \exp(
+\mathsf{A}_{j_0} x ) , {\bf 1}_n \rangle^{-1} \exp( \mathsf{A}_{j_0} x ), A_{3}
+Y_{*,i_0} \rangle - b_{j_0,i_0} )^2$. Here $\mathsf{A} \in \mathbb{R}^{n^2
+\times d^2}$ is the Kronecker product of $A_1 \in \mathbb{R}^{n \times d}$ and
+$A_2 \in \mathbb{R}^{n \times d}$, $A_3$ is a matrix in $\mathbb{R}^{n \times
+d}$, and $\mathsf{A}_{j_0} \in \mathbb{R}^{n \times d^2}$ is the $j_0$-th block
+of $\mathsf{A}$. The matrices $X, Y \in \mathbb{R}^{d \times d}$ are the
+variables we want to learn. $B \in \mathbb{R}^{n \times d}$ is given,
+$b_{j_0,i_0} \in \mathbb{R}$ is the entry in the $j_0$-th row and $i_0$-th
+column of $B$, $Y_{*,i_0} \in \mathbb{R}^d$ is the $i_0$-th column vector of
+$Y$, and $x \in \mathbb{R}^{d^2}$ is the vectorization of $X$.
+ In a multi-layer LLM network, the matrix $B \in \mathbb{R}^{n \times d}$ can
+be viewed as the output of a layer, and $A_1= A_2 = A_3 \in \mathbb{R}^{n
+\times d}$ can be viewed as the input of a layer. The matrix version of $x$ can
+be viewed as $QK^\top$ and $Y$ can be viewed as $V$. We provide an iterative
+greedy algorithm to train the loss function $L(X,Y)$ up to error $\epsilon$
+that runs in $\widetilde{O}( ({\cal T}_{\mathrm{mat}}(n,n,d) + {\cal
+T}_{\mathrm{mat}}(n,d,d) + d^{2\omega}) \log(1/\epsilon) )$ time. Here ${\cal
+T}_{\mathrm{mat}}(a,b,c)$ denotes the time of multiplying an $a \times b$
+matrix by another $b \times c$ matrix, and $\omega\approx 2.37$ denotes the
+exponent of matrix multiplication.
+
+
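+ A direct numerical transcription of the stated objective, assuming
+A_blocks[j0] holds the $j_0$-th $n \times d^2$ block of $\mathsf{A}$, A3 and B
+are $n \times d$, and X, Y are $d \times d$; it only evaluates $L(X,Y)$ and
+says nothing about the paper's greedy solver.
+
+    import numpy as np
+
+    def attention_regression_loss(A_blocks, A3, B, X, Y):
+        x = X.reshape(-1)                    # vectorization of X, in R^{d^2}
+        loss = 0.0
+        for j0, A_j0 in enumerate(A_blocks):
+            u = np.exp(A_j0 @ x)             # exp(A_{j0} x) in R^n
+            s = u / u.sum()                  # <exp(.), 1_n>^{-1} exp(.)
+            preds = s @ (A3 @ Y)             # vector of <s, A3 Y[:, i0]> over i0
+            loss += np.sum((preds - B[j0]) ** 2)
+        return loss
+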
+
+
+
+
+ + ☆ Advancing Regular Language Reasoning in Linear Recurrent Neural Networks + + +
+ In recent studies, linear recurrent neural networks (LRNNs) have achieved +Transformer-level performance in natural language modeling and long-range +modeling while offering rapid parallel training and constant inference costs. +With the resurged interest in LRNNs, we study whether they can learn the hidden +rules in training sequences, such as the grammatical structures of regular +language. We theoretically analyze some existing LRNNs and discover their +limitations on regular language. Motivated by the analysis, we propose a new +LRNN equipped with a block-diagonal and input-dependent transition matrix. +Experiments suggest that the proposed model is the only LRNN that can perform +length extrapolation on regular language tasks such as Sum, Even Pair, and +Modular Arithmetic. + +
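+ A minimal sketch of a linear recurrent cell with a block-diagonal,
+input-dependent transition, h_t = A(x_t) h_{t-1} + B x_t; the exact
+parameterization and stability constraints used in the paper may differ, and
+the sizes below are placeholders.
+
+    import torch
+    import torch.nn as nn
+
+    class BlockDiagLRNNCell(nn.Module):
+        def __init__(self, in_dim, n_blocks, block_size):
+            super().__init__()
+            self.n_blocks, self.block_size = n_blocks, block_size
+            hid = n_blocks * block_size
+            # Input-dependent block-diagonal transition and input projection.
+            self.to_blocks = nn.Linear(in_dim, n_blocks * block_size * block_size)
+            self.inp = nn.Linear(in_dim, hid)
+
+        def forward(self, x, h):
+            A = self.to_blocks(x).view(-1, self.n_blocks, self.block_size, self.block_size)
+            h = h.view(-1, self.n_blocks, self.block_size, 1)
+            h = torch.matmul(A, h).view(-1, self.n_blocks * self.block_size)
+            return h + self.inp(x)
+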
+
+ comment: The first two authors contributed equally to this work +
+
+
+
+
+ + ☆ Semi-supervised Domain Adaptation on Graphs with Contrastive Learning + and Minimax Entropy + + +
+ Label scarcity in a graph is frequently encountered in real-world +applications due to the high cost of data labeling. To this end, +semi-supervised domain adaptation (SSDA) on graphs aims to leverage the +knowledge of a labeled source graph to aid in node classification on a target +graph with limited labels. SSDA tasks need to overcome the domain gap between +the source and target graphs. However, to date, this challenging research +problem has yet to be formally considered by the existing approaches designed +for cross-graph node classification. To tackle the SSDA problem on graphs, a +novel method called SemiGCL is proposed, which benefits from graph contrastive +learning and minimax entropy training. SemiGCL generates informative node +representations by contrasting the representations learned from a graph's local +and global views. Additionally, SemiGCL is adversarially optimized with the +entropy loss of unlabeled target nodes to reduce domain divergence. +Experimental results on benchmark datasets demonstrate that SemiGCL outperforms +the state-of-the-art baselines on the SSDA tasks. + +
+
+
+
+
+ + ☆ Semantic Adversarial Attacks via Diffusion Models BMVC 2023 + + +
+ Traditional adversarial attacks concentrate on manipulating clean examples in +the pixel space by adding adversarial perturbations. By contrast, semantic +adversarial attacks focus on changing semantic attributes of clean examples, +such as color, context, and features, which are more feasible in the real +world. In this paper, we propose a framework to quickly generate a semantic +adversarial attack by leveraging recent diffusion models since semantic +information is included in the latent space of well-trained diffusion models. +Then there are two variants of this framework: 1) the Semantic Transformation +(ST) approach fine-tunes the latent space of the generated image and/or the +diffusion model itself; 2) the Latent Masking (LM) approach masks the latent +space with another target image and local backpropagation-based interpretation +methods. Additionally, the ST approach can be applied in either white-box or +black-box settings. Extensive experiments are conducted on CelebA-HQ and AFHQ +datasets, and our framework demonstrates great fidelity, generalizability, and +transferability compared to other baselines. Our approaches achieve +approximately 100% attack success rate in multiple settings with the best FID +as 36.61. Code is available at +https://github.com/steven202/semantic_adv_via_dm. + +
+
+ comment: To appear in BMVC 2023 +
+
+
+
+
+ + ☆ EnCodecMAE: Leveraging neural codecs for universal audio representation + learning ICASSP 2024 + + +
+ The goal of universal audio representation learning is to obtain foundational
+models that can be used for a variety of downstream tasks involving speech,
+music or environmental sounds. To approach this problem, methods inspired by
+self-supervised models from NLP, like BERT, are often used and adapted to
+audio. These models rely on the discrete nature of text, hence adopting this
+type of approach for audio processing requires either a change in the learning
+objective or mapping the audio signal to a set of discrete classes. In this
+work, we explore the use of EnCodec, a neural audio codec, to generate discrete
+targets for learning a universal audio model based on a masked autoencoder
+(MAE). We evaluate this approach, which we call EncodecMAE, on a wide range of
+audio tasks spanning speech, music and environmental sounds, achieving
+performance comparable to or better than leading audio representation models.
+
+
+
+ comment: Submitted to ICASSP 2024 +
+
+
+
+
+ + ☆ Rates of Convergence in Certain Native Spaces of Approximations used in + Reinforcement Learning + + +
+ This paper studies convergence rates for some value function approximations
+that arise in a collection of reproducing kernel Hilbert spaces (RKHS)
+$H(\Omega)$. By casting an optimal control problem in a specific class of
+native spaces, strong rates of convergence are derived for the operator
+equation that enables offline approximations that appear in policy iteration.
+Explicit upper bounds on the error in value function approximations are derived
+in terms of the power function $\mathrm{Pow}_{H,N}$ of the space of finite
+dimensional approximants $H_N$ in the native space $H(\Omega)$. These bounds
+are geometric in nature and refine some well-known, now classical results
+concerning convergence of approximations of value functions.
+
+
+
+ comment: 7 pages, 4 figures +
+
+
+
+
+ + ☆ Beta quantile regression for robust estimation of uncertainty in the + presence of outliers + + +
+ Quantile Regression (QR) can be used to estimate aleatoric uncertainty in
+deep neural networks and can generate prediction intervals. Quantifying
+uncertainty is particularly important in critical applications such as clinical
+diagnosis, where a realistic assessment of uncertainty is essential in
+determining disease status and planning the appropriate treatment. The most
+common application of quantile regression models is in cases where the
+parametric likelihood cannot be specified. Although quantile regression is
+quite robust to outlier response observations, it can be sensitive to outlier
+covariate observations (features). Outlier features can compromise the
+performance of deep learning regression problems such as style translation,
+image reconstruction, and deep anomaly detection, potentially leading to
+misleading conclusions. To address this problem, we propose a robust solution
+for quantile regression that incorporates concepts from robust divergence. We
+compare the performance of our proposed method with (i) least trimmed quantile
+regression and (ii) robust regression based on the regularization of
+case-specific parameters on a simple real dataset in the presence of outliers.
+These methods have not been applied in a deep learning framework. We also
+demonstrate the applicability of the proposed method by applying it to a
+medical imaging translation task using diffusion models.
+
+
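+ For background, the plain quantile (pinball) loss at level tau is sketched
+below; the paper replaces this with a robust-divergence-based variant to cope
+with outlier covariates, which is not reproduced here.
+
+    import torch
+
+    def pinball_loss(pred, target, tau):
+        # Asymmetric absolute loss whose minimizer is the tau-quantile.
+        diff = target - pred
+        return torch.mean(torch.maximum(tau * diff, (tau - 1) * diff))
+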
+
+
+
+
+ + ☆ The kernel-balanced equation for deep neural networks + + +
+ Deep neural networks have shown many fruitful applications in this decade. A +network can get the generalized function through training with a finite +dataset. The degree of generalization is a realization of the proximity scale +in the data space. Specifically, the scale is not clear if the dataset is +complicated. Here we consider a network for the distribution estimation of the +dataset. We show the estimation is unstable and the instability depends on the +data density and training duration. We derive the kernel-balanced equation, +which gives a short phenomenological description of the solution. The equation +tells us the reason for the instability and the mechanism of the scale. The +network outputs a local average of the dataset as a prediction and the scale of +averaging is determined along the equation. The scale gradually decreases along +training and finally results in instability in our case. + +
+
+
+
+
+ + ☆ Hodge-Aware Contrastive Learning + + +
+ Simplicial complexes prove effective in modeling data with multiway
+dependencies, such as data defined along the edges of networks or within other
+higher-order structures. Their spectrum can be decomposed into three
+interpretable subspaces via the Hodge decomposition, a decomposition that has
+proved foundational in numerous applications. We leverage this decomposition to
+develop a contrastive self-supervised learning approach for processing
+simplicial data and generating embeddings that encapsulate specific spectral
+information. Specifically, we encode the pertinent data invariances through
+simplicial neural networks and devise augmentations that yield positive
+contrastive examples with suitable spectral properties for downstream tasks.
+Additionally, we reweight the significance of negative examples in the
+contrastive loss, considering the similarity of their Hodge components to the
+anchor. By encouraging a stronger separation among less similar instances, we
+obtain an embedding space that reflects the spectral properties of the data.
+The numerical results on two standard edge flow classification tasks show
+superior performance even when compared to supervised learning techniques. Our
+findings underscore the importance of adopting a spectral perspective for
+contrastive learning with higher-order data.
+
+
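+ For readers unfamiliar with the decomposition, a small sketch of splitting an
+edge flow into its gradient, curl and harmonic parts from the incidence
+matrices is given below (plain least squares in NumPy); the contrastive
+augmentations built on top of these subspaces are the paper's contribution and
+are not shown.
+
+    import numpy as np
+
+    def hodge_components(f, B1, B2):
+        # B1: node-edge incidence matrix, B2: edge-triangle incidence matrix.
+        phi, *_ = np.linalg.lstsq(B1.T, f, rcond=None)   # node potential
+        grad = B1.T @ phi                                 # gradient component
+        c, *_ = np.linalg.lstsq(B2, f, rcond=None)        # triangle circulation
+        curl = B2 @ c                                     # curl component
+        harmonic = f - grad - curl                        # what remains is harmonic
+        return grad, curl, harmonic
+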
+
+ comment: 4 pages, 2 figures +
+
+
+
+
+ + ♻ ☆ Contrastive Tuning: A Little Help to Make Masked Autoencoders Forget + + +
+ Masked Image Modeling (MIM) methods, like Masked Autoencoders (MAE),
+efficiently learn a rich representation of the input. However, for adapting to
+downstream tasks, they require a sufficient amount of labeled data since their
+rich features encode not only objects but also the less relevant image
+background. In contrast, Instance Discrimination (ID) methods focus on objects.
+In this work, we study how to combine the efficiency and scalability of MIM
+with the ability of ID to perform downstream classification in the absence of
+large amounts of labeled data. To this end, we introduce Masked Autoencoder
+Contrastive Tuning (MAE-CT), a sequential approach that utilizes the implicit
+clustering of the Nearest Neighbor Contrastive Learning (NNCLR) objective to
+induce abstraction in the topmost layers of a pre-trained MAE. MAE-CT tunes the
+rich features such that they form semantic clusters of objects without using
+any labels. Notably, MAE-CT does not rely on hand-crafted augmentations and
+frequently achieves its best performances while using only minimal
+augmentations (crop & flip). Further, MAE-CT is compute efficient as it
+requires at most 10% overhead compared to MAE re-training. Applied to large and
+huge Vision Transformer (ViT) models, MAE-CT excels over previous
+self-supervised methods trained on ImageNet in linear probing, k-NN and
+low-shot classification accuracy as well as in unsupervised clustering
+accuracy. With ViT-H/16, MAE-CT achieves a new state-of-the-art in linear
+probing of 82.2%.
+
+
+
+
+
+
+ + ♻ ☆ Machine Learning-Assisted Discovery of Novel Reactor Designs + + +
+ Additive manufacturing has enabled the fabrication of advanced reactor +geometries, permitting larger, more complex design spaces. Identifying +promising configurations within such spaces presents a significant challenge +for current approaches. Furthermore, existing parameterisations of reactor +geometries are low-dimensional with expensive optimisation limiting more +complex solutions. To address this challenge, we establish a machine +learning-assisted approach for the design of the next-generation of chemical +reactors, combining the application of high-dimensional parameterisations, +computational fluid dynamics, and multi-fidelity Bayesian optimisation. We +associate the development of mixing-enhancing vortical flow structures in novel +coiled reactors with performance, and use our approach to identify key +characteristics of optimal designs. By appealing to fluid mechanical +principles, we rationalise the selection of novel design features that lead to +experimental performance improvements of ~60% over conventional designs. Our +results demonstrate that coupling advanced manufacturing techniques with +`augmented-intelligence' approaches can lead to superior design performance +and, consequently, emissions-reduction and sustainability. + +
+
+ comment: 11 pages, 9 figures +
+
+
+
+
+ + ♻ ☆ Kernel Conditional Moment Constraints for Confounding Robust Inference + + +
+ We study policy evaluation of offline contextual bandits subject to +unobserved confounders. Sensitivity analysis methods are commonly used to +estimate the policy value under the worst-case confounding over a given +uncertainty set. However, existing work often resorts to some coarse relaxation +of the uncertainty set for the sake of tractability, leading to overly +conservative estimation of the policy value. In this paper, we propose a +general estimator that provides a sharp lower bound of the policy value. It can +be shown that our estimator contains the recently proposed sharp estimator by +Dorn and Guo (2022) as a special case, and our method enables a novel extension +of the classical marginal sensitivity model using f-divergence. To construct +our estimator, we leverage the kernel method to obtain a tractable +approximation to the conditional moment constraints, which traditional +non-sharp estimators failed to take into account. In the theoretical analysis, +we provide a condition for the choice of the kernel which guarantees no +specification error that biases the lower bound estimation. Furthermore, we +provide consistency guarantees of policy evaluation and learning. In the +experiments with synthetic and real-world data, we demonstrate the +effectiveness of the proposed method. + +
+
+
+
+
+ + ♻ ☆ Transferable Graph Neural Fingerprint Models for Quick Response to + Future Bio-Threats + + +
+ Fast screening of drug molecules based on the ligand binding affinity is an +important step in the drug discovery pipeline. Graph neural fingerprint is a +promising method for developing molecular docking surrogates with high +throughput and great fidelity. In this study, we built a COVID-19 drug docking +dataset of about 300,000 drug candidates on 23 coronavirus protein targets. +With this dataset, we trained graph neural fingerprint docking models for +high-throughput virtual COVID-19 drug screening. The graph neural fingerprint +models yield high prediction accuracy on docking scores with the mean squared +error lower than $0.21$ kcal/mol for most of the docking targets, showing +significant improvement over conventional circular fingerprint methods. To make +the neural fingerprints transferable for unknown targets, we also propose a +transferable graph neural fingerprint method trained on multiple targets. With +comparable accuracy to target-specific graph neural fingerprint models, the +transferable model exhibits superb training and data efficiency. We highlight +that the impact of this study extends beyond COVID-19 dataset, as our approach +for fast virtual ligand screening can be easily adapted and integrated into a +general machine learning-accelerated pipeline to battle future bio-threats. + +
+
+ comment: 8 pages, 5 figures, 2 tables, accepted by ICLMA2023 +
+
+
+
+
+ + ♻ ☆ Self-optimizing Feature Generation via Categorical Hashing + Representation and Hierarchical Reinforcement Crossing + + +
+ Feature generation aims to generate new and meaningful features to create a
+discriminative representation space. A generated feature is meaningful when it
+comes from a feature pair with an inherent feature interaction. In the real
+world, experienced data scientists can identify potentially useful
+feature-feature interactions, and generate meaningful dimensions from an
+exponentially large search space, in an optimal crossing form over an optimal
+generation path. However, machines have only limited human-like abilities. We
+generalize such learning tasks as self-optimizing feature generation.
+Self-optimizing feature generation imposes several under-addressed challenges
+on existing systems: meaningful, robust, and efficient generation. To tackle
+these challenges, we propose a principled and generic representation-crossing
+framework to solve self-optimizing feature generation. To achieve hashing
+representation, we propose a three-step approach: feature discretization,
+feature hashing, and descriptive summarization. To achieve reinforcement
+crossing, we develop a hierarchical reinforcement feature crossing approach. We
+present extensive experimental results to demonstrate the effectiveness and
+efficiency of the proposed method. The code is available at
+https://github.com/yingwangyang/HRC_feature_cross.git.
+
+
+
+
+
+
+ + ♻ ☆ SpikeCP: Delay-Adaptive Reliable Spiking Neural Networks via Conformal + Prediction + + +
+ Spiking neural networks (SNNs) process time-series data via internal +event-driven neural dynamics whose energy consumption depends on the number of +spikes exchanged between neurons over the course of the input presentation. In +typical implementations of an SNN classifier, decisions are produced after the +entire input sequence has been processed, resulting in latency and energy +consumption levels that are fairly uniform across inputs. Recently introduced +delay-adaptive SNNs tailor the inference latency -- and, with it, the energy +consumption -- to the difficulty of each example, by producing an early +decision when the SNN model is sufficiently ``confident''. In this paper, we +start by observing that, as an SNN processes input samples, its classification +decisions tend to be first under-confident and then over-confident with respect +to the decision's ground-truth, unknown, test accuracy. This makes it difficult +to determine a stopping time that ensures a desired level of accuracy. To +address this problem, we introduce a novel delay-adaptive SNN-based inference +methodology that, wrapping around any pre-trained SNN classifier, provides +guaranteed reliability for the decisions produced at input-dependent stopping +times. The approach entails minimal added complexity as compared to the +underlying SNN, requiring only thresholding and counting operations at run +time, and it leverages tools from conformal prediction (CP). + +
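+ A bare-bones split conformal set, of the kind SpikeCP evaluates at each
+candidate stopping time, is sketched below; the calibration scores, score
+convention, and stopping rule are simplifications, not the paper's exact
+procedure.
+
+    import numpy as np
+
+    def conformal_set(cal_scores, test_scores, alpha=0.1):
+        # Keep every candidate label whose nonconformity score falls below the
+        # adjusted (1 - alpha) calibration quantile (NumPy >= 1.22 for `method`).
+        n = len(cal_scores)
+        level = min(1.0, np.ceil((n + 1) * (1 - alpha)) / n)
+        q = np.quantile(cal_scores, level, method="higher")
+        return [k for k, s in enumerate(test_scores) if s <= q]
+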
+
+ comment: Under review +
+
+
+
+
+ + ♻ ☆ Pareto Adversarial Robustness: Balancing Spatial Robustness and + Sensitivity-based Robustness SC + + +
+ Adversarial robustness, which primarily comprises sensitivity-based +robustness and spatial robustness, plays an integral part in achieving robust +generalization. In this paper, we endeavor to design strategies to achieve +universal adversarial robustness. To achieve this, we first investigate the +relatively less-explored realm of spatial robustness. Then, we integrate the +existing spatial robustness methods by incorporating both local and global +spatial vulnerability into a unified spatial attack and adversarial training +approach. Furthermore, we present a comprehensive relationship between natural +accuracy, sensitivity-based robustness, and spatial robustness, supported by +strong evidence from the perspective of robust representation. Crucially, to +reconcile the interplay between the mutual impacts of various robustness +components into one unified framework, we incorporate the \textit{Pareto +criterion} into the adversarial robustness analysis, yielding a novel strategy +called Pareto Adversarial Training for achieving universal robustness. The +resulting Pareto front, which delineates the set of optimal solutions, provides +an optimal balance between natural accuracy and various adversarial robustness. +This sheds light on solutions for achieving universal robustness in the future. +To the best of our knowledge, we are the first to consider universal +adversarial robustness via multi-objective optimization. + +
+
+ comment: Published in SCIENCE CHINA Information Sciences (SCIS) 2023. Please + also refer to the published version in the Journal reference + https://www.sciengine.com/SCIS/doi/10.1007/s11432-022-3861-8 +
+
+
+
+
+ + ♻ ☆ Gaussian Process Surrogate Models for Neural Networks UAI 2023 + + +
+ Not being able to understand and predict the behavior of deep learning +systems makes it hard to decide what architecture and algorithm to use for a +given problem. In science and engineering, modeling is a methodology used to +understand complex systems whose internal processes are opaque. Modeling +replaces a complex system with a simpler, more interpretable surrogate. Drawing +inspiration from this, we construct a class of surrogate models for neural +networks using Gaussian processes. Rather than deriving kernels for infinite +neural networks, we learn kernels empirically from the naturalistic behavior of +finite neural networks. We demonstrate our approach captures existing phenomena +related to the spectral bias of neural networks, and then show that our +surrogate models can be used to solve practical problems such as identifying +which points most influence the behavior of specific neural networks and +predicting which architectures and algorithms will generalize well for specific +datasets. + +
+
+ comment: Proceedings of UAI 2023 +
+
+
+
+
+ + ♻ ☆ Effective Latent Differential Equation Models via Attention and Multiple + Shooting + + +
+ Scientific Machine Learning (SciML) is a burgeoning field that +synergistically combines domain-aware and interpretable models with agnostic +machine learning techniques. In this work, we introduce GOKU-UI, an evolution +of the SciML generative model GOKU-nets. GOKU-UI not only broadens the original +model's spectrum to incorporate other classes of differential equations, such +as Stochastic Differential Equations (SDEs), but also integrates attention +mechanisms and a novel multiple shooting training strategy in the latent space. +These modifications have led to a significant increase in its performance in +both reconstruction and forecast tasks, as demonstrated by our evaluation of +simulated and empirical data. Specifically, GOKU-UI outperformed all baseline +models on synthetic datasets even with a training set 16-fold smaller, +underscoring its remarkable data efficiency. Furthermore, when applied to +empirical human brain data, while incorporating stochastic Stuart-Landau +oscillators into its dynamical core, our proposed enhancements markedly +increased the model's effectiveness in capturing complex brain dynamics. This +augmented version not only surpassed all baseline methods in the reconstruction +task, but also demonstrated lower prediction error of future brain activity up +to 15 seconds ahead. By training GOKU-UI on resting state fMRI data, we encoded +whole-brain dynamics into a latent representation, learning a low-dimensional +dynamical system model that could offer insights into brain functionality and +open avenues for practical applications such as the classification of mental +states or psychiatric conditions. Ultimately, our research provides further +impetus for the field of Scientific Machine Learning, showcasing the potential +for advancements when established scientific insights are interwoven with +modern machine learning. + +
+
+
+
+
+ + ♻ ☆ Meta-Learning Regrasping Strategies for Physical-Agnostic Objects ICRA 2022 + + +
+ Grasping inhomogeneous objects in real-world applications remains a
+challenging task due to unknown physical properties such as mass distribution
+and coefficient of friction. In this study, we propose a meta-learning
+algorithm called ConDex, which incorporates Conditional Neural Processes (CNP)
+with DexNet-2.0 to autonomously discern the underlying physical properties of
+objects using depth images. ConDex efficiently acquires physical embeddings
+from limited trials, enabling precise grasping point estimation. Furthermore,
+ConDex is capable of updating the predicted grasping quality iteratively from
+new trials in an online fashion. To the best of our knowledge, we are the first
+to generate two object datasets focusing on inhomogeneous physical properties
+with varying mass distributions and friction coefficients. Extensive
+evaluations in simulation demonstrate ConDex's superior performance over
+DexNet-2.0 and existing meta-learning-based grasping pipelines. Furthermore,
+ConDex shows robust generalization to previously unseen real-world objects
+despite being trained solely in simulation. The synthetic and real-world
+datasets will be published as well.
+
+
+
+ comment: Accepted as spotlight in ICRA 2022 Workshop: Scaling Robot Learning +
+
+
+
+
+ + ♻ ☆ Domain Generalization for Crop Segmentation with Knowledge Distillation + + +
+ In recent years, precision agriculture has gradually oriented farming closer +to automation processes to support all the activities related to field +management. Service robotics plays a predominant role in this evolution by +deploying autonomous agents that can navigate fields while performing tasks +without human intervention, such as monitoring, spraying, and harvesting. To +execute these precise actions, mobile robots need a real-time perception system +that understands their surroundings and identifies their targets in the wild. +Generalizing to new crops and environmental conditions is critical for +practical applications, as labeled samples are rarely available. In this paper, +we investigate the problem of crop segmentation and propose a novel approach to +enhance domain generalization using knowledge distillation. In the proposed +framework, we transfer knowledge from an ensemble of models individually +trained on source domains to a student model that can adapt to unseen target +domains. To evaluate the proposed method, we present a synthetic multi-domain +dataset for crop segmentation containing plants of variegate shapes and +covering different terrain styles, weather conditions, and light scenarios for +more than 50,000 samples. We demonstrate significant improvements in +performance over state-of-the-art methods and superior sim-to-real +generalization. Our approach provides a promising solution for domain +generalization in crop segmentation and has the potential to enhance a wide +variety of precision agriculture applications. + +
+
+
+
+
+ + ♻ ☆ MAHTM: A Multi-Agent Framework for Hierarchical Transactive Microgrids ICLR 2023 + + +
+ Integrating variable renewable energy into the grid has posed challenges to +system operators in achieving optimal trade-offs among energy availability, +cost affordability, and pollution controllability. This paper proposes a +multi-agent reinforcement learning framework for managing energy transactions +in microgrids. The framework addresses the challenges above: it seeks to +optimize the usage of available resources by minimizing the carbon footprint +while benefiting all stakeholders. The proposed architecture consists of three +layers of agents, each pursuing different objectives. The first layer, +comprised of prosumers and consumers, minimizes the total energy cost. The +other two layers control the energy price to decrease the carbon impact while +balancing the consumption and production of both renewable and conventional +energy. This framework also takes into account fluctuations in energy demand +and supply. + +
+
+ comment: ICLR 2023 Workshop: Tackling Climate Change with Machine Learning +
+
+
+
+
+ + ♻ ☆ TargetCall: Eliminating the Wasted Computation in Basecalling via + Pre-Basecalling Filtering + + +
+ Basecalling is an essential step in nanopore sequencing analysis where the
+raw signals of nanopore sequencers are converted into nucleotide sequences,
+i.e., reads. State-of-the-art basecallers employ complex deep learning models
+to achieve high basecalling accuracy. This makes basecalling
+computationally inefficient and memory-hungry, bottlenecking the entire genome
+analysis pipeline. However, for many applications, the majority of reads do not
+match the reference genome of interest (i.e., target reference) and thus are
+discarded in later steps in the genomics pipeline, wasting the basecalling
+computation. To overcome this issue, we propose TargetCall, the first
+pre-basecalling filter to eliminate the wasted computation in basecalling.
+TargetCall's key idea is to discard reads that will not match the target
+reference (i.e., off-target reads) prior to basecalling. TargetCall consists of
+two main components: (1) LightCall, a lightweight neural network basecaller
+that produces noisy reads; and (2) Similarity Check, which labels each of these
+noisy reads as on-target or off-target by matching them to the target
+reference. TargetCall aims to filter out all off-target reads before
+basecalling. The highly-accurate but slow basecalling is performed only on the
+raw signals whose noisy reads are labeled as on-target. Our thorough
+experimental evaluations using both real and simulated data show that
+TargetCall 1) improves the end-to-end basecalling performance while maintaining
+high sensitivity in keeping on-target reads, 2) maintains high accuracy in
+downstream analysis, 3) precisely filters out up to 94.71% of off-target reads,
+and 4) achieves better performance, throughput, sensitivity, precision, and
+generality compared to prior works. We open-source TargetCall at
+https://github.com/CMU-SAFARI/TargetCall
+
+
+
+
+
+
+ + ♻ ☆ Scalable Bayesian optimization with high-dimensional outputs using + randomized prior networks + + +
+ Several fundamental problems in science and engineering consist of global
+optimization tasks involving unknown high-dimensional (black-box) functions
+that map a set of controllable variables to the outcomes of an expensive
+experiment. Bayesian Optimization (BO) techniques are known to be effective in
+tackling global optimization problems using a relatively small number of
+objective function evaluations, but their performance suffers when dealing with
+high-dimensional outputs. To overcome the major challenge of dimensionality,
+here we propose a deep learning framework for BO and sequential decision making
+based on bootstrapped ensembles of neural architectures with randomized priors.
+Using appropriate architecture choices, we show that the proposed framework can
+approximate functional relationships between design variables and quantities of
+interest, even in cases where the latter take values in high-dimensional vector
+spaces or even infinite-dimensional function spaces. In the context of BO, we
+augment the proposed probabilistic surrogates with re-parameterized Monte
+Carlo approximations of multiple-point (parallel) acquisition functions, as
+well as methodological extensions for accommodating black-box constraints and
+multi-fidelity information sources. We test the proposed framework against
+state-of-the-art methods for BO and demonstrate superior performance across
+several challenging tasks with high-dimensional outputs, including a
+constrained multi-fidelity optimization task involving shape optimization of
+rotor blades in turbo-machinery.
+
+
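+ One common way to build such a bootstrapped ensemble member with a randomized
+prior is sketched below in PyTorch: each member is the sum of a trainable
+network and a frozen, randomly initialized prior network. Widths, activations
+and the prior scale beta are placeholders, not the paper's architecture.
+
+    import torch
+    import torch.nn as nn
+
+    class RandomizedPriorMember(nn.Module):
+        def __init__(self, in_dim, out_dim, width=64, beta=1.0):
+            super().__init__()
+            make = lambda: nn.Sequential(nn.Linear(in_dim, width), nn.Tanh(),
+                                         nn.Linear(width, out_dim))
+            self.trainable, self.prior, self.beta = make(), make(), beta
+            for p in self.prior.parameters():
+                p.requires_grad_(False)        # the prior network stays fixed
+
+        def forward(self, x):
+            return self.trainable(x) + self.beta * self.prior(x)
+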
+
+ comment: 23 pages, 8 figures +
+
+
+
+
+ + ♻ ☆ TrojViT: Trojan Insertion in Vision Transformers + + +
+ Vision Transformers (ViTs) have demonstrated state-of-the-art performance
+in various vision-related tasks. The success of ViTs motivates adversaries to
+perform backdoor attacks on ViTs. Although the vulnerability of traditional
+CNNs to backdoor attacks is well-known, backdoor attacks on ViTs are
+seldom-studied. Compared to CNNs capturing pixel-wise local features by
+convolutions, ViTs extract global context information through patches and
+attentions. Na\"ively transplanting CNN-specific backdoor attacks to ViTs
+yields only a low clean data accuracy and a low attack success rate. In this
+paper, we propose a stealthy and practical ViT-specific backdoor attack
+$TrojViT$. Rather than an area-wise trigger used by CNN-specific backdoor
+attacks, TrojViT generates a patch-wise trigger designed to build a Trojan
+composed of some vulnerable bits on the parameters of a ViT stored in DRAM
+memory through patch salience ranking and attention-target loss. TrojViT
+further uses minimum-tuned parameter update to reduce the bit number of the
+Trojan. Once the attacker inserts the Trojan into the ViT model by flipping the
+vulnerable bits, the ViT model still produces normal inference accuracy with
+benign inputs. But when the attacker embeds a trigger into an input, the ViT
+model is forced to classify the input to a predefined target class. We show
+that flipping only a few vulnerable bits identified by TrojViT on a ViT model
+using the well-known RowHammer can transform the model into a backdoored one.
+We perform extensive experiments on multiple datasets with various ViT models.
+TrojViT can classify $99.64\%$ of test images to a target class by flipping
+$345$ bits on a ViT for ImageNet. Our code is available at
+https://github.com/mxzheng/TrojViT
+
+
+
+ comment: 10 pages, 4 figures, 11 tables +
+
+
+
+
+ + ♻ ☆ Machine Learning and Computer Vision Techniques in Continuous Beehive + Monitoring Applications: A survey + + +
+ The wide availability of machine learning and computer vision techniques
+allows the development of relatively complex monitoring systems in many
+domains. Besides the traditional industrial domain, new applications also
+appear in biology and agriculture, including the detection of infections,
+parasites and weeds, as well as automated monitoring and early warning systems.
+This is also connected with the introduction of easily accessible hardware and
+development kits such as the Arduino or the Raspberry Pi family. In this paper,
+we survey 50 existing papers on automated beehive monitoring methods that use
+computer vision techniques, particularly pollen and Varroa mite detection
+together with bee traffic monitoring. Such systems could also be used to
+monitor honeybee colonies and to inspect their health state, which could
+identify potentially dangerous situations before they become critical, or to
+better plan periodic bee colony inspections and therefore save significant
+costs. We also analyse the research trends in this application field and
+outline possible directions for new explorations. Our paper is also aimed at
+veterinary and apidology professionals and experts who might not be familiar
+with machine learning; to introduce them to its possibilities, each family of
+applications opens with a brief theoretical introduction and the motivation
+behind its base method. We hope that this paper will inspire other scientists
+to use machine learning techniques for other applications in beehive
+monitoring.
+
+
+
+
+
+
+ + ♻ ☆ GBE-MLZSL: A Group Bi-Enhancement Framework for Multi-Label Zero-Shot + Learning + + +
+ This paper investigates a challenging problem of zero-shot learning in the +multi-label scenario (MLZSL), wherein, the model is trained to recognize +multiple unseen classes within a sample (e.g., an image) based on seen classes +and auxiliary knowledge, e.g., semantic information. Existing methods usually +resort to analyzing the relationship of various seen classes residing in a +sample from the dimension of spatial or semantic characteristics, and transfer +the learned model to unseen ones. But they ignore the effective integration of +local and global features. That is, in the process of inferring unseen classes, +global features represent the principal direction of the image in the feature +space, while local features should maintain uniqueness within a certain range. +This integrated neglect will make the model lose its grasp of the main +components of the image. Relying only on the local existence of seen classes +during the inference stage introduces unavoidable bias. In this paper, we +propose a novel and effective group bi-enhancement framework for MLZSL, dubbed +GBE-MLZSL, to fully make use of such properties and enable a more accurate and +robust visual-semantic projection. Specifically, we split the feature maps into +several feature groups, of which each feature group can be trained +independently with the Local Information Distinguishing Module (LID) to ensure +uniqueness. Meanwhile, a Global Enhancement Module (GEM) is designed to +preserve the principal direction. Besides, a static graph structure is designed +to construct the correlation of local features. Experiments on large-scale +MLZSL benchmark datasets NUS-WIDE and Open-Images-v4 demonstrate that the +proposed GBE-MLZSL outperforms other state-of-the-art methods with large +margins. + +
+
+ comment: 11 pages, 8 figures +
+
+
+
+
+ + ♻ ☆ Deep reinforced learning heuristic tested on spin-glass ground states: + The larger picture + + +
+ In Changjun Fan et al. [Nature Communications +https://doi.org/10.1038/s41467-023-36363-w (2023)], the authors present a deep +reinforced learning approach to augment combinatorial optimization heuristics. +In particular, they present results for several spin glass ground state +problems, for which instances on non-planar networks are generally NP-hard, in +comparison with several Monte Carlo based methods, such as simulated annealing +(SA) or parallel tempering (PT). Indeed, those results demonstrate that the +reinforced learning improves the results over those obtained with SA or PT, or +at least allows for reduced runtimes for the heuristics before results of +comparable quality have been obtained relative to those other methods. To +facilitate the conclusion that their method is ''superior'', the authors pursue +two basic strategies: (1) A commercial GUROBI solver is called on to procure a +sample of exact ground states as a testbed to compare with, and (2) a +head-to-head comparison between the heuristics is given for a sample of larger +instances where exact ground states are hard to ascertain. Here, we put these +studies into a larger context, showing that the claimed superiority is at best +marginal for smaller samples and becomes essentially irrelevant with respect to +any sensible approximation of true ground states in the larger samples. For +example, this method becomes irrelevant as a means to determine stiffness +exponents $\theta$ in $d>2$, as mentioned by the authors, where the problem is +not only NP-hard but requires the subtraction of two almost equal ground-state +energies and systemic errors in each of $\approx 1\%$ found here are +unacceptable. This larger picture on the method arises from a straightforward +finite-size corrections study over the spin glass ensembles the authors employ, +using data that has been available for decades. + +
+
+ comment: 5 pages, 2 figures, comment on arXiv:2109.14411, related information + can be found at https://physics.emory.edu/faculty/boettcher/ +
+
+
+
+
+ + ♻ ☆ An Adaptive Federated Relevance Framework for Spatial Temporal Graph + Learning + + +
+ Spatial-temporal data contains rich information and has been widely studied
+in recent years due to the rapid development of relevant applications in many
+fields. For instance, medical institutions often use electrodes attached to
+different parts of a patient to analyse electroencephalogram (EEG) data rich in
+spatial and temporal features for health assessment and disease diagnosis.
+Existing research has mainly used deep learning techniques such as
+convolutional neural network (CNN) or recurrent neural network (RNN) to extract
+hidden spatial-temporal features. Yet, it is challenging to incorporate both
+spatial inter-dependencies and dynamic temporal changes simultaneously. In
+reality, a model that leverages these spatial-temporal features to fulfil
+complex prediction tasks often requires a colossal amount of training data in
+order to obtain satisfactory model performance. Considering the above-mentioned
+challenges, we propose an adaptive federated relevance framework, namely
+FedRel, for spatial-temporal graph learning in this paper. After transforming
+the raw spatial-temporal data into high quality features, the core Dynamic
+Inter-Intra Graph (DIIG) module in the framework is able to use these features
+to generate spatial-temporal graphs capable of capturing the hidden topological
+and long-term temporal correlation information in these graphs. To improve the
+model generalization ability and performance while preserving the local data
+privacy, we also design a relevance-driven federated learning module in our
+framework to leverage diverse data distributions from different participants
+with attentive aggregations of their models.
+
+
+
+
+
+
+ + ♻ ☆ An Optimal Control Method to Compute the Most Likely Transition Path for + Stochastic Dynamical Systems with Jumps + + +
+ Many complex real world phenomena exhibit abrupt, intermittent or jumping +behaviors, which are more suitable to be described by stochastic differential +equations under non-Gaussian L\'evy noise. Among these complex phenomena, the +most likely transition paths between metastable states are important since +these rare events may have a high impact in certain scenarios. Based on the +large deviation principle, the most likely transition path could be treated as +the minimizer of the rate function upon paths that connect two points. One of +the challenges to calculate the most likely transition path for stochastic +dynamical systems under non-Gaussian L\'evy noise is that the associated rate +function can not be explicitly expressed by paths. For this reason, we +formulate an optimal control problem to obtain the optimal state as the most +likely transition path. We then develop a neural network method to solve this +issue. Several experiments are investigated for both Gaussian and non-Gaussian +cases. + +
+
+ comment: 17 pages, 12 figures +
+
+
+
+
+ + ♻ ☆ Discrete Acoustic Space for an Efficient Sampling in Neural + Text-To-Speech SP + + +
+ We present a Split Vector Quantized Variational Autoencoder (SVQ-VAE) +architecture using a split vector quantizer for NTTS, as an enhancement to the +well-known Variational Autoencoder (VAE) and Vector Quantized Variational +Autoencoder (VQ-VAE) architectures. Compared to these previous architectures, +our proposed model retains the benefits of using an utterance-level bottleneck, +while keeping significant representation power and a discretized latent space +small enough for efficient prediction from text. We train the model on +recordings in the expressive task-oriented dialogues domain and show that +SVQ-VAE achieves a statistically significant improvement in naturalness over +the VAE and VQ-VAE models. Furthermore, we demonstrate that the SVQ-VAE latent +acoustic space is predictable from text, reducing the gap between the standard +constant vector synthesis and vocoded recordings by 32%. + +
+
+ comment: 5 pages, 5 figures, accepted at IberSPEECH 2022 +
+
+
+
+
+ + ♻ ☆ Evaluation of Parameter-based Attacks against Embedded Neural Networks + with Laser Injection + + +
+ Upcoming certification actions related to the security of machine learning +(ML) based systems raise major evaluation challenges that are amplified by the +large-scale deployment of models in many hardware platforms. Until recently, +most of research works focused on API-based attacks that consider a ML model as +a pure algorithmic abstraction. However, new implementation-based threats have +been revealed, emphasizing the urgency to propose both practical and +simulation-based methods to properly evaluate the robustness of models. A major +concern is parameter-based attacks (such as the Bit-Flip Attack, BFA) that +highlight the lack of robustness of typical deep neural network models when +confronted by accurate and optimal alterations of their internal parameters +stored in memory. Setting in a security testing purpose, this work practically +reports, for the first time, a successful variant of the BFA on a 32-bit +Cortex-M microcontroller using laser fault injection. It is a standard fault +injection means for security evaluation, that enables to inject spatially and +temporally accurate faults. To avoid unrealistic brute-force strategies, we +show how simulations help selecting the most sensitive set of bits from the +parameters taking into account the laser fault model. + +
+
+ comment: Accepted at 42nd International Conference on Computer Safety, + Reliability and Security, SafeComp 2023 +
+
+
+
+
+ + ♻ ☆ Generating Parametric BRDFs from Natural Language Descriptions + + +
+ Artistic authoring of 3D environments is a laborious enterprise that also +requires skilled content creators. There have been impressive improvements in +using machine learning to address different aspects of generating 3D content, +such as generating meshes, arranging geometry, synthesizing textures, etc. In +this paper we develop a model to generate Bidirectional Reflectance +Distribution Functions (BRDFs) from descriptive textual prompts. BRDFs are four +dimensional probability distributions that characterize the interaction of +light with surface materials. They are either represented parametrically, or by +tabulating the probability density associated with every pair of incident and +outgoing angles. The former lends itself to artistic editing while the latter +is used when measuring the appearance of real materials. Numerous works have +focused on hypothesizing BRDF models from images of materials. We learn a +mapping from textual descriptions of materials to parametric BRDFs. Our model +is first trained using a semi-supervised approach before being tuned via an +unsupervised scheme. Although our model is general, in this paper we +specifically generate parameters for MDL materials, conditioned on natural +language descriptions, within NVIDIA's Omniverse platform. This enables use +cases such as real-time text prompts to change materials of objects in 3D +environments such as "dull plastic" or "shiny iron". Since the output of our +model is a parametric BRDF, rather than an image of the material, it may be +used to render materials using any shape under arbitrarily specified viewing +and lighting conditions. + +
+
+
+
+
+ + ♻ ☆ Survival Estimation for Missing not at Random Censoring Indicators based + on Copula Models + + +
+ In the presence of right-censored data with covariates, the conditional +Kaplan-Meier estimator (also known as the Beran estimator) consistently +estimates the conditional survival function of the random follow-up for the +event of interest. However, a necessary condition is the unambiguous knowledge +of whether each individual is censored or not, which may be incomplete in +practice. We therefore propose a study of the Beran estimator when the +censoring indicators are generic random variables and discuss necessary +conditions for the efficiency of the Beran estimator. From this, we provide a +new estimator for the conditional survival function with missing not at random +(MNAR) censoring indicators based on a conditional copula model for the +missingness mechanism. In addition to the theoretical results, we illustrate +how the estimators work for small samples through a simulation study and show +their practical applicability by analyzing synthetic and real data. + +
+
+
+
+
+ + ♻ ☆ Interpretable Weighted Siamese Network to Predict the Time to Onset of + Alzheimer's Disease from MRI Images + + +
+ Alzheimer's Disease (AD) is a progressive disease preceded by Mild Cognitive +Impairment (MCI). Early detection of AD is crucial for making treatment +decisions. However, most of the literature on computer-assisted detection of AD +focuses on classifying brain images into one of three major categories: +healthy, MCI, and AD; or categorizing MCI patients into (1) progressive: those +who progress from MCI to AD at a future examination time, and (2) stable: those +who stay as MCI and never progress to AD. This misses the opportunity to +accurately identify the trajectory of progressive MCI patients. In this paper, +we revisit the brain image classification task for AD identification and +re-frame it as an ordinal classification task to predict how close a patient is +to the severe AD stage. To this end, we select progressive MCI patients from +the Alzheimer's Disease Neuroimaging Initiative (ADNI) dataset and construct an +ordinal dataset with a prediction target that indicates the time to progression +to AD. We train a Siamese network model to predict the time to onset of AD +based on MRI brain images. We also propose a Weighted variety of Siamese +network and compare its performance to a baseline model. Our evaluations show +that incorporating a weighting factor to Siamese networks brings considerable +performance gain at predicting how close input brain MRI images are to +progressing to AD. Moreover, we complement our results with an interpretation +of the learned embedding space of the Siamese networks using a model +explainability technique. + +
+
+ comment: Accepted at the Specialist Group on Artificial Intelligence, SGAI + 2023, conference +
+
+
+
+
+ + ♻ ☆ DoRA: Domain-Based Self-Supervised Learning Framework for Low-Resource + Real Estate Appraisal CIKM 2023 + + +
+ The marketplace system connecting demands and supplies has been explored to +develop unbiased decision-making in valuing properties. Real estate appraisal +serves as one of the high-cost property valuation tasks for financial +institutions since it requires domain experts to appraise the estimation based +on the corresponding knowledge and the judgment of the market. Existing +automated valuation models reducing the subjectivity of domain experts require +a large number of transactions for effective evaluation, which is predominantly +limited to not only the labeling efforts of transactions but also the +generalizability of new developing and rural areas. To learn representations +from unlabeled real estate sets, existing self-supervised learning (SSL) for +tabular data neglects various important features, and fails to incorporate +domain knowledge. In this paper, we propose DoRA, a Domain-based +self-supervised learning framework for low-resource Real estate Appraisal. DoRA +is pre-trained with an intra-sample geographic prediction as the pretext task +based on the metadata of the real estate for equipping the real estate +representations with prior domain knowledge. Furthermore, inter-sample +contrastive learning is employed to generalize the representations to be robust +for limited transactions of downstream tasks. Our benchmark results on three +property types of real-world transactions show that DoRA significantly +outperforms the SSL baselines for tabular data, the graph-based methods, and +the supervised approaches in the few-shot scenarios by at least 7.6% for MAPE, +11.59% for MAE, and 3.34% for HR10%. We expect DoRA to be useful to other +financial practitioners with similar marketplace applications who need general +models for properties that are newly built and have limited records. The source +code is available at https://github.com/wwweiwei/DoRA. + +
+
+ comment: Accepted by CIKM 2023 +
+
+
+
+
+ + ♻ ☆ Optimal transport distances for directed, weighted graphs: a case study + with cell-cell communication networks + + +
+ Comparing graphs by means of optimal transport has recently gained +significant attention, as the distances induced by optimal transport provide +both a principled metric between graphs as well as an interpretable description +of the associated changes between graphs in terms of a transport plan. As the +lack of symmetry introduces challenges in the typically considered +formulations, optimal transport distances for graphs have mostly been developed +for undirected graphs. Here, we propose two distance measures to compare +directed graphs based on variants of optimal transport: (i) an earth movers +distance (Wasserstein) and (ii) a Gromov-Wasserstein (GW) distance. We evaluate +these two distances and discuss their relative performance for both simulated +graph data and real-world directed cell-cell communication graphs, inferred +from single-cell RNA-seq data. + +
+
+ comment: 5 pages, 1 figure +
+
+
+
+
+ + ♻ ☆ Learning nonparametric DAGs with incremental information via high-order + HSIC + + +
+ Score-based methods for learning Bayesian networks (BNs) aim to maximize a global score function. However, if local variables have direct and indirect dependences simultaneously, the global optimization of the score function misses edges between variables with an indirect dependence relationship, whose scores are smaller than those of direct dependence relationships. In this paper, we present an identifiability condition based on a determined subset of parents to identify the underlying DAG. Using this identifiability condition, we develop a two-phase algorithm, namely the optimal-tuning (OT) algorithm, to locally amend the global optimization. In the optimal phase, an optimization problem based on the first-order Hilbert-Schmidt independence criterion (HSIC) gives an estimated skeleton as the initial determined parents subset. In the tuning phase, the skeleton is locally tuned by deletion, addition and DAG-formalization strategies using the theoretically proven incremental properties of high-order HSIC. Numerical experiments on different synthetic and real-world datasets show that the OT algorithm outperforms existing methods. In particular, for the Sigmoid Mix model with graph size ${\rm\bf d=40}$, the structural intervention distance (SID) of the OT algorithm is 329.7 smaller than that obtained by CAM, which indicates that the graph estimated by the OT algorithm misses fewer edges than CAM. Source code of the OT algorithm is available at https://github.com/YafeiannWang/optimal-tune-algorithm. + +
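As background for the "first-order HSIC" used in the optimal phase, the standard (biased) empirical HSIC with Gaussian kernels can be sketched in plain numpy as below. The kernel bandwidth and sample sizes are arbitrary, and the paper's high-order HSIC and the tuning-phase strategies are not shown.

```python
import numpy as np

def rbf_gram(x, sigma=1.0):
    """Gaussian (RBF) kernel Gram matrix for a 1-D sample x of shape (n,)."""
    d2 = (x[:, None] - x[None, :]) ** 2
    return np.exp(-d2 / (2 * sigma ** 2))

def hsic(x, y, sigma=1.0):
    """Biased empirical HSIC between two 1-D samples (larger = more dependent)."""
    n = len(x)
    K, L = rbf_gram(x, sigma), rbf_gram(y, sigma)
    H = np.eye(n) - np.ones((n, n)) / n          # centering matrix
    return np.trace(K @ H @ L @ H) / (n - 1) ** 2

rng = np.random.default_rng(0)
x = rng.normal(size=200)
print(hsic(x, 2 * x + 0.1 * rng.normal(size=200)))  # dependent pair -> larger value
print(hsic(x, rng.normal(size=200)))                 # independent pair -> near zero
```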
+
+
+
+
+ + ♻ ☆ Gibbs-Duhem-Informed Neural Networks for Binary Activity Coefficient + Prediction + + +
+ We propose Gibbs-Duhem-informed neural networks for the prediction of binary +activity coefficients at varying compositions. That is, we include the +Gibbs-Duhem equation explicitly in the loss function for training neural +networks, which is straightforward in standard machine learning (ML) frameworks +enabling automatic differentiation. In contrast to recent hybrid ML approaches, +our approach does not rely on embedding a specific thermodynamic model inside +the neural network and corresponding prediction limitations. Rather, +Gibbs-Duhem consistency serves as regularization, with the flexibility of ML +models being preserved. Our results show increased thermodynamic consistency +and generalization capabilities for activity coefficient predictions by +Gibbs-Duhem-informed graph neural networks and matrix completion methods. We +also find that the model architecture, particularly the activation function, +can have a strong influence on the prediction quality. The approach can be +easily extended to account for other thermodynamic consistency conditions. + +
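The paper trains graph neural networks and matrix completion models; purely as an illustration of the regularization idea, here is a minimal PyTorch sketch in which a small, hypothetical MLP maps the mole fraction x1 to (ln gamma1, ln gamma2) and the Gibbs-Duhem residual is penalized alongside a placeholder data term. The model, loss weighting, and data term are all assumptions.

```python
import torch

def gibbs_duhem_residual(model, x1):
    """Residual of the isothermal, isobaric Gibbs-Duhem relation
    x1 * d(ln gamma1)/dx1 + (1 - x1) * d(ln gamma2)/dx1
    for a network mapping composition x1 -> (ln gamma1, ln gamma2)."""
    x1 = x1.clone().requires_grad_(True)
    ln_g1, ln_g2 = model(x1).unbind(dim=-1)
    d1 = torch.autograd.grad(ln_g1.sum(), x1, create_graph=True)[0]
    d2 = torch.autograd.grad(ln_g2.sum(), x1, create_graph=True)[0]
    return x1 * d1 + (1.0 - x1) * d2

# hypothetical model: a small MLP predicting (ln gamma1, ln gamma2) from x1
model = torch.nn.Sequential(torch.nn.Linear(1, 32), torch.nn.Tanh(), torch.nn.Linear(32, 2))
x1 = torch.rand(64, 1)
ln_g = model(x1)
data_loss = torch.mean((ln_g - torch.zeros_like(ln_g)) ** 2)   # placeholder data term
gd_loss = torch.mean(gibbs_duhem_residual(model, x1) ** 2)      # consistency regularizer
loss = data_loss + 1.0 * gd_loss                                # weighting is an assumption
loss.backward()
```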
+
+
+
+
+ + ♻ ☆ PolicyCleanse: Backdoor Detection and Mitigation in Reinforcement + Learning ICCV 2023 + + +
+ While real-world applications of reinforcement learning are becoming popular, +the security and robustness of RL systems are worthy of more attention and +exploration. In particular, recent works have revealed that, in a multi-agent +RL environment, backdoor trigger actions can be injected into a victim agent +(a.k.a. Trojan agent), which can result in a catastrophic failure as soon as it +sees the backdoor trigger action. To ensure the security of RL agents against +malicious backdoors, in this work, we propose the problem of Backdoor Detection +in a multi-agent competitive reinforcement learning system, with the objective +of detecting Trojan agents as well as the corresponding potential trigger +actions, and further trying to mitigate their Trojan behavior. In order to +solve this problem, we propose PolicyCleanse that is based on the property that +the activated Trojan agents accumulated rewards degrade noticeably after +several timesteps. Along with PolicyCleanse, we also design a machine +unlearning-based approach that can effectively mitigate the detected backdoor. +Extensive experiments demonstrate that the proposed methods can accurately +detect Trojan agents, and outperform existing backdoor mitigation baseline +approaches by at least 3% in winning rate across various types of agents and +environments. + +
+
+ comment: Accepted by ICCV 2023 +
+
+
+
+
+ + ♻ ConSpec: honing in on critical steps for rapid learning and + generalization in RL + + +
+ In real life, success is often contingent upon multiple critical steps that +are distant in time from each other and from the final reward. These critical +steps are challenging to identify with traditional reinforcement learning (RL) +methods that rely on the Bellman equation for credit assignment. Here, we +present a new RL algorithm that uses offline contrastive learning to hone in on +critical steps. This algorithm, which we call contrastive introspection +(ConSpec), can be added to any existing RL algorithm. ConSpec learns a set of +prototypes for the critical steps in a task by a novel contrastive loss and +delivers an intrinsic reward when the current state matches one of these +prototypes. The prototypes in ConSpec provide two key benefits for credit +assignment: (1) They enable rapid identification of all the critical steps. (2) +They do so in a readily interpretable manner, enabling out-of-distribution +generalization when sensory features are altered. Distinct from other +contemporary RL approaches to credit assignment, ConSpec takes advantage of the +fact that it is easier to retrospectively identify the small set of steps that +success is contingent upon than it is to prospectively predict reward at every +step taken in the environment. Altogether, ConSpec improves learning in a +diverse set of RL tasks, including both those with explicit, discrete critical +steps and those with complex, continuous critical steps. + +
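A schematic rendering of how a prototype match could be turned into an intrinsic reward; the cosine similarity measure, threshold, and feature dimensions are assumptions, and the contrastive training of the prototypes themselves is not shown.

```python
import numpy as np

def intrinsic_reward(state_feat, prototypes, threshold=0.6):
    """Give an intrinsic bonus when the current state representation is close
    (cosine similarity) to any learned prototype of a critical step."""
    s = state_feat / (np.linalg.norm(state_feat) + 1e-8)
    P = prototypes / (np.linalg.norm(prototypes, axis=1, keepdims=True) + 1e-8)
    sims = P @ s
    return float(sims.max() >= threshold)

# toy usage with hypothetical 16-dim features and 4 learned prototypes
rng = np.random.default_rng(0)
protos = rng.normal(size=(4, 16))
bonus = intrinsic_reward(rng.normal(size=16), protos)
```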
+
+
+
+
+ + ♻ ☆ Conformal Regression in Calorie Prediction for Team Jumbo-Visma + + +
+ UCI WorldTour races, the premier men's elite road cycling tour, are grueling +events that put physical fitness and endurance of riders to the test. The +coaches of Team Jumbo-Visma have long been responsible for predicting the +energy needs of each rider of the Dutch team for every race on the calendar. +Those must be estimated to ensure riders have the energy and resources +necessary to maintain a high level of performance throughout a race. This task, +however, is both time-consuming and challenging, as it requires precise +estimates of race speed and power output. Traditionally, the approach to +predicting energy needs has relied on judgement and experience of coaches, but +this method has its limitations and often leads to inaccurate predictions. In +this paper, we propose a new, more effective approach to predicting energy +needs for cycling races. By predicting the speed and power with regression +models, we provide the coaches with calorie needs estimates for each individual +rider per stage instantly. In addition, we compare methods to quantify +uncertainty using conformal prediction. The empirical analysis of the +jackknife+, jackknife-minmax, jackknife-minmax-after-bootstrap, CV+, CV-minmax, +conformalized quantile regression, and inductive conformal prediction methods +in conformal prediction reveals that all methods achieve valid prediction +intervals. All but minmax-based methods also produce sufficiently narrow +prediction intervals for decision-making. Furthermore, methods computing +prediction intervals of fixed size produce tighter intervals for low +significance values. Among the methods computing intervals of varying length +across the input space, inductive conformal prediction computes narrower +prediction intervals at larger significance level. + +
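Of the compared methods, the inductive (split) conformal variant is simple enough to sketch in a few lines: calibration residuals widen a point prediction into an interval with approximately the requested marginal coverage. The calorie numbers and calibration residuals below are hypothetical stand-ins.

```python
import numpy as np

def split_conformal_interval(residuals_cal, y_pred_new, alpha=0.1):
    """Split (inductive) conformal prediction for regression: use calibration-set
    absolute residuals to turn a point prediction into a (1 - alpha) interval."""
    n = len(residuals_cal)
    k = int(np.ceil((n + 1) * (1 - alpha)))          # conformal quantile index
    q = np.sort(np.abs(residuals_cal))[min(k, n) - 1]
    return y_pred_new - q, y_pred_new + q

# toy usage with hypothetical calibration residuals (kcal) and a new point prediction
rng = np.random.default_rng(0)
res_cal = rng.normal(scale=25.0, size=500)
lo, hi = split_conformal_interval(res_cal, y_pred_new=3200.0, alpha=0.1)
```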
+
+ comment: 11 pages, 5 figures +
+
+
+
+
+ + ♻ ☆ Speeding up Learning Quantum States through Group Equivariant + Convolutional Quantum Ansätze + + +
+ We develop a theoretical framework for $S_n$-equivariant convolutional quantum circuits with SU$(d)$-symmetry, building on and significantly generalizing Jordan's Permutational Quantum Computing (PQC) formalism based on Schur-Weyl duality connecting both SU$(d)$ and $S_n$ actions on qudits. In particular, we utilize the Okounkov-Vershik approach to prove Harrow's statement (Ph.D. Thesis 2005 p.160) on the equivalence between $\operatorname{SU}(d)$ and $S_n$ irrep bases and to establish the $S_n$-equivariant Convolutional Quantum Alternating Ans\"atze ($S_n$-CQA) using Young-Jucys-Murphy (YJM) elements. We prove that $S_n$-CQA is able to generate any unitary in any given $S_n$ irrep sector, which may serve as a universal model for a wide array of quantum machine learning problems in the presence of SU($d$) symmetry. Our method provides another way to prove the universality of the Quantum Approximate Optimization Algorithm (QAOA) and verifies that 4-local SU($d$) symmetric unitaries are sufficient to build generic SU($d$) symmetric quantum circuits up to relative phase factors. We present numerical simulations to showcase the effectiveness of the ans\"atze in finding the ground state energy of the $J_1$--$J_2$ antiferromagnetic Heisenberg model on the rectangular and Kagome lattices. Our work provides the first application of the celebrated Okounkov-Vershik $S_n$ representation theory to quantum physics and machine learning, from which we propose quantum variational ans\"atze that are strongly suggested to be classically intractable and are tailored towards specific optimization problems. + +
+
+ comment: 15 pages, 11 figures +
+
+
+
+
+ + ♻ ☆ LambdaKG: A Library for Pre-trained Language Model-Based Knowledge Graph + Embeddings AACL 2023 + + +
+ Knowledge Graphs (KGs) often have two characteristics: heterogeneous graph +structure and text-rich entity/relation information. Text-based KG embeddings +can represent entities by encoding descriptions with pre-trained language +models, but no open-sourced library is specifically designed for KGs with PLMs +at present. In this paper, we present LambdaKG, a library for KGE that equips +with many pre-trained language models (e.g., BERT, BART, T5, GPT-3), and +supports various tasks (e.g., knowledge graph completion, question answering, +recommendation, and knowledge probing). LambdaKG is publicly open-sourced at +https://github.com/zjunlp/PromptKG/tree/main/lambdaKG, with a demo video at +http://deepke.zjukg.cn/lambdakg.mp4 and long-term maintenance. + +
+
+ comment: AACL 2023 System Demonstrations, the project website is + https://zjunlp.github.io/project/promptkg/ +
+
+
+
+
+ + ♻ ☆ Correcting sampling biases via importance reweighting for spatial + modeling + + +
+ In machine learning models, the estimation of errors is often complicated by distribution bias, particularly in spatial data such as those found in environmental studies. We introduce an approach based on the ideas of importance sampling to obtain an unbiased estimate of the target error. By taking into account the difference between the distribution on which the error should be assessed and the distribution of the available data, our method reweights errors at each sample point and neutralizes the shift. An importance sampling technique and kernel density estimation are used for reweighting. We validate the effectiveness of our approach using artificial data that resemble real-world spatial datasets. Our findings demonstrate the advantages of the proposed approach for the estimation of the target error, offering a solution to the distribution shift problem. The overall error of predictions dropped from 7% to just 2%, and it becomes smaller for larger samples. + +
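A minimal sketch of the idea, assuming kernel density estimates of the sampling design and of the target distribution over spatial coordinates; the paper's exact estimator, kernel choices, and normalization may differ, and all data below are synthetic placeholders.

```python
import numpy as np
from scipy.stats import gaussian_kde

def importance_weighted_error(coords_train, coords_target, errors_train):
    """Reweight per-sample errors so that their average estimates the error
    under the target spatial distribution rather than the biased training one."""
    p_train = gaussian_kde(coords_train.T)(coords_train.T)    # density of the sampling design
    p_target = gaussian_kde(coords_target.T)(coords_train.T)  # density we actually care about
    w = p_target / np.clip(p_train, 1e-12, None)
    w = w / w.mean()                                          # self-normalized weights
    return float(np.mean(w * errors_train))

# toy usage: clustered training sites vs. uniformly spread target sites (hypothetical)
rng = np.random.default_rng(0)
coords_train = rng.normal(loc=0.2, scale=0.1, size=(300, 2))
coords_target = rng.uniform(size=(1000, 2))
errors_train = rng.gamma(shape=2.0, size=300)
print(importance_weighted_error(coords_train, coords_target, errors_train))
```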
+
+
+
+
+ + ♻ ☆ Joint Community Detection and Rotational Synchronization via + Semidefinite Programming + + +
+ In the presence of heterogeneous data, where randomly rotated objects fall +into multiple underlying categories, it is challenging to simultaneously +classify them into clusters and synchronize them based on pairwise relations. +This gives rise to the joint problem of community detection and +synchronization. We propose a series of semidefinite relaxations, and prove +their exact recovery when extending the celebrated stochastic block model to +this new setting where both rotations and cluster identities are to be +determined. Numerical experiments demonstrate the efficacy of our proposed +algorithms and confirm our theoretical result which indicates a sharp phase +transition for exact recovery. + +
+
+
+
+
+ + ♻ ☆ Reasoning with Language Model Prompting: A Survey ACL 2023 + + +
+ Reasoning, as an essential ability for complex problem-solving, can provide back-end support for various real-world applications, such as medical diagnosis, negotiation, etc. This paper provides a comprehensive survey of cutting-edge research on reasoning with language model prompting. We introduce research works with comparisons and summaries and provide systematic resources to help beginners. We also discuss the potential reasons for the emergence of such reasoning abilities and highlight future research directions. Resources are available at https://github.com/zjunlp/Prompt4ReasoningPapers (updated periodically). + +
+
+ comment: ACL 2023, 24 pages, add references of theoretical analysis +
+
+
+
+
+ + ♻ ☆ On Gradient Descent Ascent for Nonconvex-Concave Minimax Problems ICML 2020 + + +
+ We consider nonconvex-concave minimax problems, $\min_{\mathbf{x}} \max_{\mathbf{y} \in \mathcal{Y}} f(\mathbf{x}, \mathbf{y})$, where $f$ is nonconvex in $\mathbf{x}$ but concave in $\mathbf{y}$ and $\mathcal{Y}$ is a convex and bounded set. One of the most popular algorithms for solving this problem is the celebrated gradient descent ascent (GDA) algorithm, which has been widely used in machine learning, control theory and economics. Despite the extensive convergence results for the convex-concave setting, GDA with equal stepsizes can converge to limit cycles or even diverge in the general setting. In this paper, we present complexity results for two-time-scale GDA applied to nonconvex-concave minimax problems, showing that the algorithm can efficiently find a stationary point of the function $\Phi(\cdot) := \max_{\mathbf{y} \in \mathcal{Y}} f(\cdot, \mathbf{y})$. To the best of our knowledge, this is the first nonasymptotic analysis for two-time-scale GDA in this setting, shedding light on its superior practical performance in training generative adversarial networks (GANs) and other real applications. + +
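The update rule itself is standard; the paper's contribution is the nonasymptotic analysis. For reference, a two-time-scale GDA step with a faster ascent player might look like the sketch below, where the stepsizes, projection, and toy bilinear example are arbitrary choices rather than the paper's settings.

```python
import numpy as np

def two_time_scale_gda(grad_x, grad_y, x0, y0, eta_x=1e-3, eta_y=1e-1,
                       project_y=lambda y: y, steps=10_000):
    """Gradient descent on x, ascent on y, with a much larger stepsize for the
    (concave) y player, as analysed for nonconvex-concave minimax problems."""
    x, y = np.asarray(x0, dtype=float), np.asarray(y0, dtype=float)
    for _ in range(steps):
        x = x - eta_x * grad_x(x, y)             # slow descent player
        y = project_y(y + eta_y * grad_y(x, y))  # fast ascent player, kept inside Y
    return x, y

# toy bilinear example f(x, y) = x * y with y constrained to [-1, 1]
x_star, y_star = two_time_scale_gda(
    grad_x=lambda x, y: y, grad_y=lambda x, y: x,
    x0=1.0, y0=0.5, project_y=lambda y: np.clip(y, -1.0, 1.0))
```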
+
+ comment: Accepted by ICML 2020; Fix an error in Proposition 4.11 and 4.12 by + modifying Definition 4.10 and some typos +
+
+
+
+
+ + ♻ ☆ TempEE: Temporal-Spatial Parallel Transformer for Radar Echo + Extrapolation Beyond Auto-Regression + + +
+ Meteorological radar reflectivity data (i.e. radar echo) significantly +influences precipitation prediction. It can facilitate accurate and expeditious +forecasting of short-term heavy rainfall bypassing the need for complex +Numerical Weather Prediction (NWP) models. In comparison to conventional +models, Deep Learning (DL)-based radar echo extrapolation algorithms exhibit +higher effectiveness and efficiency. Nevertheless, the development of reliable +and generalized echo extrapolation algorithm is impeded by three primary +challenges: cumulative error spreading, imprecise representation of sparsely +distributed echoes, and inaccurate description of non-stationary motion +processes. To tackle these challenges, this paper proposes a novel radar echo +extrapolation algorithm called Temporal-Spatial Parallel Transformer, referred +to as TempEE. TempEE avoids using auto-regression and instead employs a +one-step forward strategy to prevent cumulative error spreading during the +extrapolation process. Additionally, we propose the incorporation of a +Multi-level Temporal-Spatial Attention mechanism to improve the algorithm's +capability of capturing both global and local information while emphasizing +task-related regions, including sparse echo representations, in an efficient +manner. Furthermore, the algorithm extracts spatio-temporal representations +from continuous echo images using a parallel encoder to model the +non-stationary motion process for echo extrapolation. The superiority of our +TempEE has been demonstrated in the context of the classic radar echo +extrapolation task, utilizing a real-world dataset. Extensive experiments have +further validated the efficacy and indispensability of various components +within TempEE. + +
+
+ comment: Have been accepted by IEEE Transactions on Geoscience and Remote + Sensing, see https://ieeexplore.ieee.org/document/10238744 +
+
+
+
+
+ + ♻ ☆ Neural Categorical Priors for Physics-Based Character Control SIGGRAPH + + +
+ Recent advances in learning reusable motion priors have demonstrated their +effectiveness in generating naturalistic behaviors. In this paper, we propose a +new learning framework in this paradigm for controlling physics-based +characters with significantly improved motion quality and diversity over +existing state-of-the-art methods. The proposed method uses reinforcement +learning (RL) to initially track and imitate life-like movements from +unstructured motion clips using the discrete information bottleneck, as adopted +in the Vector Quantized Variational AutoEncoder (VQ-VAE). This structure +compresses the most relevant information from the motion clips into a compact +yet informative latent space, i.e., a discrete space over vector quantized +codes. By sampling codes in the space from a trained categorical prior +distribution, high-quality life-like behaviors can be generated, similar to the +usage of VQ-VAE in computer vision. Although this prior distribution can be +trained with the supervision of the encoder's output, it follows the original +motion clip distribution in the dataset and could lead to imbalanced behaviors +in our setting. To address the issue, we further propose a technique named +prior shifting to adjust the prior distribution using curiosity-driven RL. The +outcome distribution is demonstrated to offer sufficient behavioral diversity +and significantly facilitates upper-level policy learning for downstream tasks. +We conduct comprehensive experiments using humanoid characters on two +challenging downstream tasks, sword-shield striking and two-player boxing game. +Our results demonstrate that the proposed framework is capable of controlling +the character to perform considerably high-quality movements in terms of +behavioral strategies, diversity, and realism. Videos, codes, and data are +available at https://tencent-roboticsx.github.io/NCP/. + +
+
+ comment: Accepted to Transactions on Graphics (Proc. ACM SIGGRAPH ASIA 2023) +
+
+
+
+
+ + ♻ ☆ Model-free Learning of Regions of Attraction via Recurrent Sets + + +
+ We consider the problem of learning an inner approximation of the region of +attraction (ROA) of an asymptotically stable equilibrium point without an +explicit model of the dynamics. Rather than leveraging approximate models with +bounded uncertainty to find a (robust) invariant set contained in the ROA, we +propose to learn sets that satisfy a more relaxed notion of containment known +as recurrence. We define a set to be $\tau$-recurrent (resp. $k$-recurrent) if +every trajectory that starts within the set, returns to it after at most $\tau$ +seconds (resp. $k$ steps). We show that under mild assumptions a +$\tau$-recurrent set containing a stable equilibrium must be a subset of its +ROA. We then leverage this property to develop algorithms that compute inner +approximations of the ROA using counter-examples of recurrence that are +obtained by sampling finite-length trajectories. Our algorithms process samples +sequentially, which allow them to continue being executed even after an initial +offline training stage. We further provide an upper bound on the number of +counter-examples used by the algorithm, and almost sure convergence guarantees. + +
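A sampling-based check of the k-recurrence property defined above might look like the sketch below; the dynamics, candidate set, and horizon are hypothetical, and the paper's actual algorithms use the discovered counter-examples to refine the candidate set rather than merely returning a boolean.

```python
import numpy as np

def is_k_recurrent(step, in_set, x0s, k=50, horizon=200):
    """Empirically test k-recurrence: along every sampled trajectory, the gap
    between consecutive visits to the candidate set must never exceed k steps."""
    for x0 in x0s:
        x, last_visit = np.asarray(x0, dtype=float), 0
        for t in range(1, horizon + 1):
            x = step(x)                      # unknown dynamics, queried as a black box
            if in_set(x):
                last_visit = t
            elif t - last_visit > k:         # stayed outside longer than k steps
                return False                 # counter-example of recurrence found
    return True

# toy usage: a contracting map with candidate set = unit ball (assumed, for illustration)
rng = np.random.default_rng(0)
ok = is_k_recurrent(step=lambda x: 0.9 * x,
                    in_set=lambda x: np.linalg.norm(x) <= 1.0,
                    x0s=rng.uniform(-1, 1, size=(20, 2)), k=5)
```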
+
+
+
+
+ + ♻ ☆ Benign Overfitting without Linearity: Neural Network Classifiers Trained + by Gradient Descent for Noisy Linear Data + + +
+ Benign overfitting, the phenomenon where interpolating models generalize well +in the presence of noisy data, was first observed in neural network models +trained with gradient descent. To better understand this empirical observation, +we consider the generalization error of two-layer neural networks trained to +interpolation by gradient descent on the logistic loss following random +initialization. We assume the data comes from well-separated class-conditional +log-concave distributions and allow for a constant fraction of the training +labels to be corrupted by an adversary. We show that in this setting, neural +networks exhibit benign overfitting: they can be driven to zero training error, +perfectly fitting any noisy training labels, and simultaneously achieve minimax +optimal test error. In contrast to previous work on benign overfitting that +require linear or kernel-based predictors, our analysis holds in a setting +where both the model and learning dynamics are fundamentally nonlinear. + +
+
+ comment: 39 pages; minor corrections +
+
+
+
+
+ + ♻ ☆ Random Feature Amplification: Feature Learning and Generalization in + Neural Networks + + +
+ In this work, we provide a characterization of the feature-learning process +in two-layer ReLU networks trained by gradient descent on the logistic loss +following random initialization. We consider data with binary labels that are +generated by an XOR-like function of the input features. We permit a constant +fraction of the training labels to be corrupted by an adversary. We show that, +although linear classifiers are no better than random guessing for the +distribution we consider, two-layer ReLU networks trained by gradient descent +achieve generalization error close to the label noise rate. We develop a novel +proof technique that shows that at initialization, the vast majority of neurons +function as random features that are only weakly correlated with useful +features, and the gradient descent dynamics 'amplify' these weak, random +features to strong, useful features. + +
+
+ comment: 46 pages; JMLR camera ready revision +
+
+
+
+
+ + ♻ ☆ Deep Nonparametric Convexified Filtering for Computational Photography, + Image Synthesis and Adversarial Defense + + +
+ We aim to provide a general framework for computational photography that recovers the real scene from imperfect images, via Deep Nonparametric Convexified Filtering (DNCF). It consists of a nonparametric deep network that resembles the physical equations behind image formation, such as denoising, super-resolution, inpainting, and flash. DNCF has no parameterization dependent on training data and therefore has strong generalization and robustness to adversarial image manipulation. During inference, we also encourage the network parameters to be nonnegative and create a bi-convex function of the input and parameters, which adapts to second-order optimization algorithms under limited running time, achieving a 10X acceleration over Deep Image Prior. With these tools, we empirically verify its capability to defend image classification deep networks against adversarial attack algorithms in real time. + +
+
+
+
+
+ + ♻ ☆ Uncertainty-aware Traffic Prediction under Missing Data ICDM + + +
+ Traffic prediction is a crucial topic because of its broad scope of +applications in the transportation domain. Recently, various studies have +achieved promising results. However, most studies assume the prediction +locations have complete or at least partial historical records and cannot be +extended to non-historical recorded locations. In real-life scenarios, the +deployment of sensors could be limited due to budget limitations and +installation availability, which makes most current models not applicable. +Though few pieces of literature tried to impute traffic states at the missing +locations, these methods need the data simultaneously observed at the locations +with sensors, making them not applicable to prediction tasks. Another drawback +is the lack of measurement of uncertainty in prediction, making prior works +unsuitable for risk-sensitive tasks or involving decision-making. To fill the +gap, inspired by the previous inductive graph neural network, this work +proposed an uncertainty-aware framework with the ability to 1) extend +prediction to missing locations with no historical records and significantly +extend spatial coverage of prediction locations while reducing deployment of +sensors and 2) generate probabilistic prediction with uncertainty +quantification to help the management of risk and decision making in the +down-stream tasks. Through extensive experiments on real-life datasets, the +result shows our method achieved promising results on prediction tasks, and the +uncertainty quantification gives consistent results which highly correlated +with the locations with and without historical data. We also show that our +model could help support sensor deployment tasks in the transportation field to +achieve higher accuracy with a limited sensor deployment budget. + +
+
+ comment: 11 pages, 3 figures, Accepted as a short paper of IEEE International + Conference on Data Mining (ICDM) 2023 +
+
+
+
+
+ + ♻ ☆ On the complexity of finding a local minimizer of a quadratic function + over a polytope + + +
+ We show that unless P=NP, there cannot be a polynomial-time algorithm that +finds a point within Euclidean distance $c^n$ (for any constant $c \ge 0$) of a +local minimizer of an $n$-variate quadratic function over a polytope. This +result (even with $c=0$) answers a question of Pardalos and Vavasis that +appeared in 1992 on a list of seven open problems in complexity theory for +numerical optimization. Our proof technique also implies that the problem of +deciding whether a quadratic function has a local minimizer over an (unbounded) +polyhedron, and that of deciding if a quartic polynomial has a local minimizer +are NP-hard. + +
+
+ comment: 9 pages +
+
+
+
+
+ + ♻ ☆ BAFFLE: Backdoor Attack in Offline Reinforcement Learning + + +
+ A growing body of research has focused on the Reinforcement Learning (RL) +methods which allow the agent to learn from trial-and-error experiences +gathered during the interaction with the environment. Recently, offline RL +becomes a popular RL paradigm because it saves the interactions with +environments. In offline RL, data providers share large pre-collected datasets, +and others can train high-quality agents without interacting with the +environments. This paradigm has demonstrated effectiveness in critical tasks +like robot control, autonomous driving, etc. However, less attention is paid to +investigating the security threats to the offline RL system. This paper focuses +on backdoor attacks, where some perturbations are added to the data +(observations) such that given normal observations, the agent takes +high-rewards actions, and low-reward actions on observations injected with +triggers. In this paper, we propose Baffle (Backdoor Attack for Offline +Reinforcement Learning), an approach that automatically implants backdoors to +RL agents by poisoning the offline RL dataset, and evaluate how different +offline RL algorithms react to this attack. Our experiments conducted on four +tasks and four offline RL algorithms expose a disquieting fact: none of the +existing offline RL algorithms is immune to such a backdoor attack. Baffle +modifies $10\%$ of the datasets for four tasks. Agents trained on the poisoned +datasets perform well in normal settings. However, when triggers are presented, +the agents' performance decreases drastically by $63.2\%$, $53.9\%$, $64.7\%$, +and $47.4\%$ in the four tasks on average. The backdoor still persists after +fine-tuning poisoned agents on clean datasets. We further show that the +inserted backdoor is also hard to be detected by a popular defensive method. +This paper calls attention to developing more effective protection for the +open-source offline RL dataset. + +
+
+ comment: 18 pages, 7 figures +
+
+
+
+
+ + ♻ ☆ Hybrid Algorithm Selection and Hyperparameter Tuning on Distributed + Machine Learning Resources: A Hierarchical Agent-based Approach + + +
+ Algorithm selection and hyperparameter tuning are critical steps in both +academic and applied machine learning. On the other hand, these steps are +becoming ever increasingly delicate due to the extensive rise in the number, +diversity, and distributedness of machine learning resources. Multi-agent +systems, when applied to the design of machine learning platforms, bring about +several distinctive characteristics such as scalability, flexibility, and +robustness, just to name a few. This paper proposes a fully automatic and +collaborative agent-based mechanism for selecting distributedly organized +machine learning algorithms and simultaneously tuning their hyperparameters. +Our method builds upon an existing agent-based hierarchical machine-learning +platform and augments its query structure to support the aforementioned +functionalities without being limited to specific learning, selection, and +tuning mechanisms. We have conducted theoretical assessments, formal +verification, and analytical study to demonstrate the correctness, resource +utilization, and computational efficiency of our technique. According to the +results, our solution is totally correct and exhibits linear time and space +complexity in relation to the size of available resources. To provide concrete +examples of how the proposed methodologies can effectively adapt and perform +across a range of algorithmic options and datasets, we have also conducted a +series of experiments using a system comprised of 24 algorithms and 9 datasets. + +
+
+
+
+
+
+
+
+ + Multimedia 7 + +
+
+
+ + ☆ Usability Evaluation of Spoken Humanoid Embodied Conversational Agents + in Mobile Serious Games + + +
+ This paper presents an empirical investigation of the extent to which spoken +Humanoid Embodied Conversational Agents (HECAs) can foster usability in mobile +serious game (MSG) applications. The aim of the research is to assess the +impact of multiple agents and illusion of humanness on the quality of the +interaction. The experiment investigates two styles of agent presentation: an +agent of high human-likeness (HECA) and an agent of low human-likeness (text). +The purpose of the experiment is to assess whether and how agents of high +humanlikeness can evoke the illusion of humanness and affect usability. Agents +of high human-likeness were designed by following the ECA design model that is +a proposed guide for ECA development. The results of the experiment with 90 +participants show that users prefer to interact with the HECAs. The difference +between the two versions is statistically significant with a large effect size +(d=1.01), with many of the participants justifying their choice by saying that +the human-like characteristics of the HECA made the version more appealing. +This research provides key information on the potential effect of HECAs on +serious games, which can provide insight into the design of future mobile +serious games. + +
+
+ comment: 45 pages, 9 figures, 14 tables +
+
+
+
+
+ + ☆ MPAI-EEV: Standardization Efforts of Artificial Intelligence based + End-to-End Video Coding + + +
+ The rapid advancement of artificial intelligence (AI) technology has led to +the prioritization of standardizing the processing, coding, and transmission of +video using neural networks. To address this priority area, the Moving Picture, +Audio, and Data Coding by Artificial Intelligence (MPAI) group is developing a +suite of standards called MPAI-EEV for "end-to-end optimized neural video +coding." The aim of this AI-based video standard project is to compress the +number of bits required to represent high-fidelity video data by utilizing +data-trained neural coding technologies. This approach is not constrained by +how data coding has traditionally been applied in the context of a hybrid +framework. This paper presents an overview of recent and ongoing +standardization efforts in this area and highlights the key technologies and +design philosophy of EEV. It also provides a comparison and report on some +primary efforts such as the coding efficiency of the reference model. +Additionally, it discusses emerging activities such as learned +Unmanned-Aerial-Vehicles (UAVs) video coding which are currently planned, under +development, or in the exploration phase. With a focus on UAV video signals, +this paper addresses the current status of these preliminary efforts. It also +indicates development timelines, summarizes the main technical details, and +provides pointers to further points of reference. The exploration experiment +shows that the EEV model performs better than the state-of-the-art video coding +standard H.266/VVC in terms of perceptual evaluation metric. + +
+
+
+
+
+ + ☆ VCD: A Video Conferencing Dataset for Video Compression + + +
+ Commonly used datasets for evaluating video codecs are all of very high quality and not representative of video typically used in video conferencing scenarios. We present the Video Conferencing Dataset (VCD) for evaluating video codecs for real-time communication, the first such dataset focused on video conferencing. VCD includes a wide variety of camera qualities and spatial and temporal information. It includes both desktop and mobile scenarios and two types of video background processing. We report the compression efficiency of H.264, H.265, H.266, and AV1 in low-delay settings on VCD and compare it with the non-video-conferencing datasets UVG, MCL-JCV, and HEVC. The results show that the source quality and the scenarios have a significant effect on the compression efficiency of all the codecs. VCD enables the evaluation and tuning of codecs for this important scenario. VCD is publicly available as an open-source dataset at https://github.com/microsoft/VCD. + +
+
+
+
+
+ + ♻ ☆ Mitigating Hallucination in Large Multi-Modal Models via Robust + Instruction Tuning + + +
+ Despite the promising progress in multi-modal tasks, current large +multi-modal models (LMM) are prone to hallucinating inconsistent descriptions +with respect to the associated image and human instructions. This paper +addresses this issue by introducing the first large and diverse visual +instruction tuning dataset, named Large-scale Robust Visual (LRV)-Instruction. +Our dataset consists of 120k visual instructions generated by GPT4, covering 16 +vision-and-language tasks with open-ended instructions and answers. Unlike +existing studies that primarily focus on positive instruction samples, we +design LRV-Instruction to include both positive and negative instructions for +more robust visual instruction tuning. Our negative instructions are designed +at two semantic levels: (i) Nonexistent Element Manipulation and (ii) Existent +Element Manipulation. To efficiently measure the hallucination generated by +LMMs, we propose GPT4-Assisted Visual Instruction Evaluation (GAVIE), a novel +approach to evaluate visual instruction tuning without the need for +human-annotated groundtruth answers and can adapt to diverse instruction +formats. We conduct comprehensive experiments to investigate the hallucination +of LMMs. Our results demonstrate that existing LMMs exhibit significant +hallucination when presented with our negative instructions, particularly with +Existent Element Manipulation instructions. Moreover, by finetuning MiniGPT4 on +LRV-Instruction, we successfully mitigate hallucination while improving +performance on public datasets using less training data compared to +state-of-the-art methods. Additionally, we observed that a balanced ratio of +positive and negative instances in the training data leads to a more robust +model. Updates of our project are available at +https://fuxiaoliu.github.io/LRV/. + +
+
+ comment: 35 pages, 27 figures. Under Review +
+
+
+
+
+ + ♻ ☆ A Survey on Interpretable Cross-modal Reasoning + + +
+ In recent years, cross-modal reasoning (CMR), the process of understanding +and reasoning across different modalities, has emerged as a pivotal area with +applications spanning from multimedia analysis to healthcare diagnostics. As +the deployment of AI systems becomes more ubiquitous, the demand for +transparency and comprehensibility in these systems' decision-making processes +has intensified. This survey delves into the realm of interpretable cross-modal +reasoning (I-CMR), where the objective is not only to achieve high predictive +performance but also to provide human-understandable explanations for the +results. This survey presents a comprehensive overview of the typical methods +with a three-level taxonomy for I-CMR. Furthermore, this survey reviews the +existing CMR datasets with annotations for explanations. Finally, this survey +summarizes the challenges for I-CMR and discusses potential future directions. +In conclusion, this survey aims to catalyze the progress of this emerging +research area by providing researchers with a panoramic and comprehensive +perspective, illuminating the state of the art and discerning the +opportunities. The summarized methods, datasets, and other resources are +available at +https://github.com/ZuyiZhou/Awesome-Interpretable-Cross-modal-Reasoning. + +
+
+
+
+
+ + ♻ ☆ VEATIC: Video-based Emotion and Affect Tracking in Context Dataset + + +
+ Human affect recognition has been a significant topic in psychophysics and computer vision. However, currently published datasets have many limitations. For example, most datasets contain frames that carry information about facial expressions only. Due to these limitations, it is very hard either to understand the mechanisms of human affect recognition or to make computer vision models trained on those datasets generalize well to common cases. In this work, we introduce a brand new large dataset, the Video-based Emotion and Affect Tracking in Context Dataset (VEATIC), that overcomes the limitations of previous datasets. VEATIC has 124 video clips from Hollywood movies, documentaries, and home videos with continuous valence and arousal ratings of each frame via real-time annotation. Along with the dataset, we propose a new computer vision task to infer the affect of the selected character via both context and character information in each video frame. Additionally, we propose a simple model to benchmark this new computer vision task. We also compare the performance of the model pretrained on our dataset with models pretrained on other similar datasets. Experiments show the competitive results of the model pretrained on VEATIC, indicating the generalizability of VEATIC. Our dataset is available at https://veatic.github.io. + +
+
+
+
+
+ + ♻ ☆ Preventing Unauthorized AI Over-Analysis by Medical Image Adversarial + Watermarking + + +
+ The advancement of deep learning has facilitated the integration of +Artificial Intelligence (AI) into clinical practices, particularly in +computer-aided diagnosis. Given the pivotal role of medical images in various +diagnostic procedures, it becomes imperative to ensure the responsible and +secure utilization of AI techniques. However, the unauthorized utilization of +AI for image analysis raises significant concerns regarding patient privacy and +potential infringement on the proprietary rights of data custodians. +Consequently, the development of pragmatic and cost-effective strategies that +safeguard patient privacy and uphold medical image copyrights emerges as a +critical necessity. In direct response to this pressing demand, we present a +pioneering solution named Medical Image Adversarial watermarking (MIAD-MARK). +Our approach introduces watermarks that strategically mislead unauthorized AI +diagnostic models, inducing erroneous predictions without compromising the +integrity of the visual content. Importantly, our method integrates an +authorization protocol tailored for legitimate users, enabling the removal of +the MIAD-MARK through encryption-generated keys. Through extensive experiments, +we validate the efficacy of MIAD-MARK across three prominent medical image +datasets. The empirical outcomes demonstrate the substantial impact of our +approach, notably reducing the accuracy of standard AI diagnostic models to a +mere 8.57% under white box conditions and 45.83% in the more challenging black +box scenario. Additionally, our solution effectively mitigates unauthorized +exploitation of medical images even in the presence of sophisticated watermark +removal networks. Notably, those AI diagnosis networks exhibit a meager average +accuracy of 38.59% when applied to images protected by MIAD-MARK, underscoring +the robustness of our safeguarding mechanism. + +
+
+
+
+
+
+
+
+
+ +
+
+
+ + Computation and Language 56 + +
+
+
+ + ☆ RAIN: Your Language Models Can Align Themselves without Finetuning + + +
+ Large language models (LLMs) often demonstrate inconsistencies with human +preferences. Previous research gathered human preference data and then aligned +the pre-trained models using reinforcement learning or instruction tuning, the +so-called finetuning step. In contrast, aligning frozen LLMs without any extra +data is more appealing. This work explores the potential of the latter setting. +We discover that by integrating self-evaluation and rewind mechanisms, +unaligned LLMs can directly produce responses consistent with human preferences +via self-boosting. We introduce a novel inference method, Rewindable +Auto-regressive INference (RAIN), that allows pre-trained LLMs to evaluate +their own generation and use the evaluation results to guide backward rewind +and forward generation for AI safety. Notably, RAIN operates without the need +of extra data for model alignment and abstains from any training, gradient +computation, or parameter updates; during the self-evaluation phase, the model +receives guidance on which human preference to align with through a +fixed-template prompt, eliminating the need to modify the initial prompt. +Experimental results evaluated by GPT-4 and humans demonstrate the +effectiveness of RAIN: on the HH dataset, RAIN improves the harmlessness rate +of LLaMA 30B over vanilla inference from 82% to 97%, while maintaining the +helpfulness rate. Under the leading adversarial attack llm-attacks on Vicuna +33B, RAIN establishes a new defense baseline by reducing the attack success +rate from 94% to 19%. + +
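The abstract only outlines the mechanism, so the following is a deliberately schematic sketch of a rewind-style loop, not RAIN itself: both helper functions are hypothetical stand-ins, and the actual method maintains a search over token sets with accumulated self-evaluation scores rather than a simple accept/reject decision.

```python
def rain_style_generate(generate_step, self_evaluate, prompt, max_segments=20,
                        accept_threshold=0.5):
    """Schematic rewind-and-regenerate loop.

    generate_step(text) -> a candidate continuation segment (hypothetical helper)
    self_evaluate(text) -> a scalar harmlessness/preference score obtained from the
                           same model via a fixed-template prompt (hypothetical helper)
    """
    output = prompt
    for _ in range(max_segments):
        candidate = generate_step(output)
        if self_evaluate(output + candidate) >= accept_threshold:
            output += candidate          # keep the segment and move forward
        # otherwise rewind: drop the candidate and retry forward generation
    return output
```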
+
+
+
+
+ + ☆ Sight Beyond Text: Multi-Modal Training Enhances LLMs in Truthfulness + and Ethics + + +
+ Multi-modal large language models (MLLMs) are trained based on large language +models (LLM), with an enhanced capability to comprehend multi-modal inputs and +generate textual responses. While they excel in multi-modal tasks, the pure NLP +abilities of MLLMs are often underestimated and left untested. In this study, +we get out of the box and unveil an intriguing characteristic of MLLMs -- our +preliminary results suggest that visual instruction tuning, a prevailing +strategy for transitioning LLMs into MLLMs, unexpectedly and interestingly +helps models attain both improved truthfulness and ethical alignment in the +pure NLP context. For example, a visual-instruction-tuned LLaMA2 7B model +surpasses the performance of the LLaMA2-chat 7B model, fine-tuned with over one +million human annotations, on TruthfulQA-mc and Ethics benchmarks. Further +analysis reveals that the improved alignment can be attributed to the superior +instruction quality inherent to visual-text data. In releasing our code at +github.com/UCSC-VLAA/Sight-Beyond-Text, we aspire to foster further exploration +into the intrinsic value of visual-text synergies and, in a broader scope, +multi-modal interactions in alignment research. + +
+
+
+
+
+ + ☆ Mitigating Hallucinations and Off-target Machine Translation with + Source-Contrastive and Language-Contrastive Decoding + + +
+ Hallucinations and off-target translation remain unsolved problems in machine +translation, especially for low-resource languages and massively multilingual +models. In this paper, we introduce methods to mitigate both failure cases with +a modified decoding objective, without requiring retraining or external models. +In source-contrastive decoding, we search for a translation that is probable +given the correct input, but improbable given a random input segment, +hypothesising that hallucinations will be similarly probable given either. In +language-contrastive decoding, we search for a translation that is probable, +but improbable given the wrong language indicator token. In experiments on +M2M-100 (418M) and SMaLL-100, we find that these methods effectively suppress +hallucinations and off-target translations, improving chrF2 by 1.7 and 1.4 +points on average across 57 tested translation directions. In a proof of +concept on English--German, we also show that we can suppress off-target +translations with the Llama 2 chat models, demonstrating the applicability of +the method to machine translation with LLMs. We release our source code at +https://github.com/ZurichNLP/ContraDecode. + +
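The decoding objective can be illustrated at the level of a single next-token decision: scores under the true source are offset by scores under a random source segment, so tokens that are probable regardless of the input (a hallucination symptom) are penalized. The weighting factor and toy vocabulary below are assumptions; the released implementation operates inside full beam search.

```python
import numpy as np

def contrastive_next_token_scores(logp_given_src, logp_given_rand, lam=0.5):
    """Combine next-token log-probabilities so that tokens likely under a random
    source segment are penalized relative to the true source.

    logp_given_src  : (V,) log P(y_t | x, y_<t) under the correct input
    logp_given_rand : (V,) log P(y_t | x', y_<t) under a random input segment
    """
    return logp_given_src - lam * logp_given_rand

# toy usage over a 5-token vocabulary
src = np.log(np.array([0.50, 0.20, 0.15, 0.10, 0.05]))
rnd = np.log(np.array([0.50, 0.05, 0.15, 0.10, 0.20]))
best_token = int(np.argmax(contrastive_next_token_scores(src, rnd)))
```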
+
+
+
+
+ + ☆ Can Whisper perform speech-based in-context learning ICASSP 2024 + + +
+ This paper investigates the in-context learning abilities of the Whisper +automatic speech recognition (ASR) models released by OpenAI. A novel +speech-based in-context learning (SICL) approach is proposed for test-time +adaptation, which can reduce the word error rates (WERs) with only a small +number of labelled speech samples without gradient descent. Language-level +adaptation experiments using Chinese dialects showed that when applying SICL to +isolated word ASR, consistent and considerable relative WER reductions can be +achieved using Whisper models of any size on two dialects, which is on average +32.3%. A k-nearest-neighbours-based in-context example selection technique can +be applied to further improve the efficiency of SICL, which can increase the +average relative WER reduction to 36.4%. The findings are verified using +speaker adaptation or continuous speech recognition tasks, and both achieved +considerable relative WER reductions. Detailed quantitative analyses are also +provided to shed light on SICL's adaptability to phonological variances and +dialect-specific lexical nuances. + +
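The k-nearest-neighbour example selection could be sketched as below, assuming each utterance has already been mapped to a fixed-size embedding; the embedding model, similarity measure, and k are assumptions, and packing the selected audio-text pairs into Whisper's decoding context is not shown.

```python
import numpy as np

def knn_select_examples(query_emb, example_embs, k=3):
    """Pick the k labelled speech samples whose embeddings are closest
    (cosine similarity) to the test utterance, to use as in-context examples."""
    q = query_emb / np.linalg.norm(query_emb)
    E = example_embs / np.linalg.norm(example_embs, axis=1, keepdims=True)
    sims = E @ q
    return np.argsort(-sims)[:k]

# toy usage with hypothetical 256-dim utterance embeddings
rng = np.random.default_rng(0)
picked = knn_select_examples(rng.normal(size=256), rng.normal(size=(50, 256)), k=3)
```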
+
+ comment: Submitted to ICASSP 2024 +
+
+
+
+
+ + ☆ SafetyBench: Evaluating the Safety of Large Language Models with + Multiple Choice Questions + + +
+ With the rapid development of Large Language Models (LLMs), increasing +attention has been paid to their safety concerns. Consequently, evaluating the +safety of LLMs has become an essential task for facilitating the broad +applications of LLMs. Nevertheless, the absence of comprehensive safety +evaluation benchmarks poses a significant impediment to effectively assess and +enhance the safety of LLMs. In this work, we present SafetyBench, a +comprehensive benchmark for evaluating the safety of LLMs, which comprises +11,435 diverse multiple choice questions spanning across 7 distinct categories +of safety concerns. Notably, SafetyBench also incorporates both Chinese and +English data, facilitating the evaluation in both languages. Our extensive +tests over 25 popular Chinese and English LLMs in both zero-shot and few-shot +settings reveal a substantial performance advantage for GPT-4 over its +counterparts, and there is still significant room for improving the safety of +current LLMs. We believe SafetyBench will enable fast and comprehensive +evaluation of LLMs' safety, and foster the development of safer LLMs. Data and +evaluation guidelines are available at https://github.com/thu-coai/SafetyBench. +Submission entrance and leaderboard are available at +https://llmbench.ai/safety. + +
+
+ comment: 15 pages +
+
+
+
+
+ + ☆ How (Not) to Use Sociodemographic Information for Subjective NLP Tasks + + +
+ Annotators' sociodemographic backgrounds (i.e., the individual compositions +of their gender, age, educational background, etc.) have a strong impact on +their decisions when working on subjective NLP tasks, such as hate speech +detection. Often, heterogeneous backgrounds result in high disagreements. To +model this variation, recent work has explored sociodemographic prompting, a +technique, which steers the output of prompt-based models towards answers that +humans with specific sociodemographic profiles would give. However, the +available NLP literature disagrees on the efficacy of this technique -- it +remains unclear, for which tasks and scenarios it can help and evaluations are +limited to specific tasks only. We address this research gap by presenting the +largest and most comprehensive study of sociodemographic prompting today. +Concretely, we evaluate several prompt formulations across seven datasets and +six instruction-tuned model families. We find that (1) while sociodemographic +prompting can be beneficial for improving zero-shot learning in subjective NLP +tasks, (2) its outcomes largely vary for different model types, sizes, and +datasets, (3) are subject to large variance with regards to prompt +formulations. Thus, sociodemographic prompting is not a reliable proxy for +traditional data annotation with a sociodemographically heterogeneous group of +annotators. Instead, we propose (4) to use it for identifying ambiguous +instances resulting in more informed annotation efforts. + +
+
+
+
+
+ + ☆ Beyond original Research Articles Categorization via NLP + + +
+ This work proposes a novel approach to text categorization -- for unknown +categories -- in the context of scientific literature, using Natural Language +Processing techniques. The study leverages the power of pre-trained language +models, specifically SciBERT, to extract meaningful representations of +abstracts from the ArXiv dataset. Text categorization is performed using the +K-Means algorithm, and the optimal number of clusters is determined based on +the Silhouette score. The results demonstrate that the proposed approach +captures subject information more effectively than the traditional arXiv +labeling system, leading to improved text categorization. The approach offers +potential for better navigation and recommendation systems in the rapidly +growing landscape of scientific research literature. + +
+
+ comment: Workshop on Human-in-the-Loop Applied Machine Learning (HITLAML), + 2023 +
+
+
+
+
+ + ☆ Résumé Parsing as Hierarchical Sequence Labeling: An Empirical Study RecSys + + +
+ Extracting information from résumés is typically formulated as a two-stage
+problem, where the document is first segmented into sections and then each
+section is processed individually to extract the target entities. Instead, we
+cast the whole problem as sequence labeling at two levels -- lines and tokens
+-- and study model architectures for solving both tasks simultaneously. We
+build high-quality résumé parsing corpora in English, French, Chinese, Spanish,
+German, Portuguese, and Swedish. Based on these corpora, we present
+experimental results that demonstrate the effectiveness of the proposed models
+for the information extraction task, outperforming approaches introduced in
+previous work. We conduct an ablation study of the proposed architectures. We
+also analyze both model performance and resource efficiency, and describe the
+trade-offs for model deployment in the context of a production environment.
+
+
+ comment: RecSys in HR'23: The 3rd Workshop on Recommender Systems for Human + Resources, in conjunction with the 17th ACM Conference on Recommender + Systems, September 18--22, 2023, Singapore, Singapore +
+
+
+
+
+ + ☆ OYXOY: A Modern NLP Test Suite for Modern Greek + + +
+ This paper serves as a foundational step towards the development of a
+linguistically motivated and technically relevant evaluation suite for Greek
+NLP. We initiate this endeavor by introducing four expert-verified evaluation
+tasks, specifically targeted at natural language inference, word sense
+disambiguation (through example comparison or sense selection) and metaphor
+detection. More than language-adapted replicas of existing tasks, we contribute
+two innovations which will resonate with the broader resource and evaluation
+community. Firstly, our inference dataset is the first of its kind, marking not
+just one, but rather all possible inference labels, accounting for possible
+shifts due to e.g. ambiguity or polysemy. Secondly, we demonstrate a
+cost-efficient method to obtain datasets for under-resourced languages. Using
+ChatGPT as a language-neutral parser, we transform the Dictionary of Standard
+Modern Greek into a structured format, from which we derive the other three
+tasks through simple projections. Alongside each task, we conduct experiments
+using currently available state-of-the-art machinery. Our experimental
+baselines affirm the challenging nature of our tasks and highlight the need for
+expedited progress in order for the Greek NLP ecosystem to keep pace with
+contemporary mainstream research.
+
+
+
+
+
+ + ☆ Unsupervised Contrast-Consistent Ranking with Language Models + + +
+ Language models contain ranking-based knowledge and are powerful solvers of +in-context ranking tasks. For instance, they may have parametric knowledge +about the ordering of countries by size or may be able to rank reviews by +sentiment. Recent work focuses on pairwise, pointwise, and listwise prompting +techniques to elicit a language model's ranking knowledge. However, we find +that even with careful calibration and constrained decoding, prompting-based +techniques may not always be self-consistent in the rankings they produce. This +motivates us to explore an alternative approach that is inspired by an +unsupervised probing method called Contrast-Consistent Search (CCS). The idea +is to train a probing model guided by a logical constraint: a model's +representation of a statement and its negation must be mapped to contrastive +true-false poles consistently across multiple statements. We hypothesize that +similar constraints apply to ranking tasks where all items are related via +consistent pairwise or listwise comparisons. To this end, we extend the binary +CCS method to Contrast-Consistent Ranking (CCR) by adapting existing ranking +methods such as the Max-Margin Loss, Triplet Loss, and Ordinal Regression +objective. Our results confirm that, for the same language model, CCR probing +outperforms prompting and even performs on a par with prompting much larger +language models. + +
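+ As a concrete illustration of the probing idea described above, the following
+is a minimal sketch of a CCR-style linear probe trained with a max-margin
+ranking loss over frozen hidden states. Dimensions, hyperparameters, and names
+are placeholders rather than the paper's implementation.
+
+import torch
+import torch.nn as nn
+
+probe = nn.Linear(768, 1)                      # hidden state -> scalar rank score
+optimizer = torch.optim.Adam(probe.parameters(), lr=1e-3)
+ranking_loss = nn.MarginRankingLoss(margin=1.0)
+
+def ccr_step(h_higher, h_lower):
+    """h_higher/h_lower: [batch, 768] hidden states; the first should outrank the second."""
+    s_hi = probe(h_higher).squeeze(-1)
+    s_lo = probe(h_lower).squeeze(-1)
+    loss = ranking_loss(s_hi, s_lo, torch.ones_like(s_hi))  # target=1: s_hi > s_lo + margin
+    optimizer.zero_grad()
+    loss.backward()
+    optimizer.step()
+    return loss.item()
+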
+
+
+
+
+ + ☆ Remote Inference of Cognitive Scores in ALS Patients Using a Picture + Description + + +
+ Amyotrophic lateral sclerosis is a fatal disease that not only affects
+movement, speech, and breath but also cognition. Recent studies have focused on
+the use of language analysis techniques to detect ALS and infer scales for
+monitoring functional progression. In this paper, we focused on another
+important aspect, cognitive impairment, which affects 35-50% of the ALS
+population. In an effort to reach the ALS population, which frequently exhibits
+mobility limitations, we implemented the digital version of the Edinburgh
+Cognitive and Behavioral ALS Screen (ECAS) test for the first time. This test,
+which is designed to measure cognitive impairment, was remotely performed by 56
+participants from the EverythingALS Speech Study. As part of the study,
+participants (ALS and non-ALS) were asked to describe one picture each week
+from a pool of many pictures with complex scenes displayed on their computer at
+home. We analyze the descriptions performed within +/- 60 days from the day the
+ECAS test was administered and extract different types of linguistic and
+acoustic features. We input those features into linear regression models to
+infer 5 ECAS sub-scores and the total score. Speech samples from the picture
+description are reliable enough to predict the ECAS sub-scores, achieving
+statistically significant Spearman correlation values between 0.32 and 0.51 for
+the model's performance using 10-fold cross-validation.
+
+
+ comment: conference paper +
+
+
+
+
+ + ☆ Auto-Regressive Next-Token Predictors are Universal Learners + + +
+ Large language models display remarkable capabilities in logical and +mathematical reasoning, allowing them to solve complex tasks. Interestingly, +these abilities emerge in networks trained on the simple task of next-token +prediction. In this work, we present a theoretical framework for studying +auto-regressive next-token predictors. We demonstrate that even simple models +such as linear next-token predictors, trained on Chain-of-Thought (CoT) data, +can approximate any function efficiently computed by a Turing machine. We +introduce a new complexity measure -- length complexity -- which measures the +number of intermediate tokens in a CoT sequence required to approximate some +target function, and analyze the interplay between length complexity and other +notions of complexity. Finally, we show experimentally that simple next-token +predictors, such as linear networks and shallow Multi-Layer Perceptrons (MLPs), +display non-trivial performance on text generation and arithmetic tasks. Our +results demonstrate that the power of language models can be attributed, to a +great extent, to the auto-regressive next-token training scheme, and not +necessarily to a particular choice of architecture. + +
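+ As a toy illustration of the claim that even linear next-token predictors can
+be trained on chain-of-thought token sequences, the following is a purely
+linear auto-regressive predictor (an embedding plus a single linear layer, with
+no nonlinearity). It is a sketch under our own assumptions, not the paper's
+experimental setup.
+
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+
+vocab_size, context_len, dim = 1000, 32, 64
+model = nn.Sequential(
+    nn.Embedding(vocab_size, dim),   # linear in the one-hot token representation
+    nn.Flatten(),                    # [batch, context_len * dim]
+    nn.Linear(context_len * dim, vocab_size),
+)
+optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)
+
+def train_step(tokens, next_token):  # tokens: [batch, context_len], next_token: [batch]
+    loss = F.cross_entropy(model(tokens), next_token)
+    optimizer.zero_grad()
+    loss.backward()
+    optimizer.step()
+    return loss.item()
+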
+
+
+
+
+ + ☆ Dynamic Causal Disentanglement Model for Dialogue Emotion Detection + + +
+ Emotion detection is a critical technology extensively employed in diverse
+fields. While the incorporation of commonsense knowledge has proven beneficial
+for existing emotion detection methods, dialogue-based emotion detection
+encounters numerous difficulties and challenges due to human agency and the
+variability of dialogue content. In dialogues, human emotions tend to
+accumulate in bursts. However, they are often implicitly expressed. This
+implies that many genuine emotions remain concealed within a plethora of
+unrelated words and dialogues. In this paper, we propose a Dynamic Causal
+Disentanglement Model founded on the separation of hidden variables. This model
+effectively decomposes the content of dialogues and investigates the temporal
+accumulation of emotions, thereby enabling more precise emotion recognition.
+First, we introduce a novel Causal Directed Acyclic Graph (DAG) to establish
+the correlation between hidden emotional information and other observed
+elements. Subsequently, our approach utilizes pre-extracted personal attributes
+and utterance topics as guiding factors for the distribution of hidden
+variables, aiming to separate irrelevant ones. Specifically, we propose a
+dynamic temporal disentanglement model to infer the propagation of utterances
+and hidden variables, enabling the accumulation of emotion-related information
+throughout the conversation. To guide this disentanglement process, we leverage
+ChatGPT-4.0 and LSTM networks to extract utterance topics and personal
+attributes as observed information. Finally, we test our approach on two
+popular datasets in dialogue emotion detection, and the experimental results
+verify the model's superiority.
+
+
+
+
+
+ + ☆ Native Language Identification with Big Bird Embeddings + + +
+ Native Language Identification (NLI) intends to classify an author's native +language based on their writing in another language. Historically, the task has +heavily relied on time-consuming linguistic feature engineering, and +transformer-based NLI models have thus far failed to offer effective, practical +alternatives. The current work investigates if input size is a limiting factor, +and shows that classifiers trained using Big Bird embeddings outperform +linguistic feature engineering models by a large margin on the Reddit-L2 +dataset. Additionally, we provide further insight into input length +dependencies, show consistent out-of-sample performance, and qualitatively +analyze the embedding space. Given the effectiveness and computational +efficiency of this method, we believe it offers a promising avenue for future +NLI work. + +
+
+
+
+
+ + ☆ Continual Learning with Dirichlet Generative-based Rehearsal + + +
+ Recent advancements in data-driven task-oriented dialogue systems (ToDs)
+struggle with incremental learning due to computational constraints and
+time-consuming retraining. Continual Learning (CL) attempts to solve this by
+avoiding intensive pre-training, but it faces the problem of catastrophic
+forgetting (CF). While generative-based rehearsal CL methods have made
+significant strides, generating pseudo samples that accurately reflect the
+underlying task-specific distribution is still a challenge. In this paper, we
+present Dirichlet Continual Learning (DCL), a novel generative-based rehearsal
+strategy for CL. Unlike the traditionally used Gaussian latent variable in the
+Conditional Variational Autoencoder (CVAE), DCL leverages the flexibility and
+versatility of the Dirichlet distribution to model the latent prior variable.
+This enables it to efficiently capture sentence-level features of previous
+tasks and effectively guide the generation of pseudo samples. In addition, we
+introduce Jensen-Shannon Knowledge Distillation (JSKD), a robust logit-based
+knowledge distillation method that enhances knowledge transfer during pseudo
+sample generation. Our experiments confirm the efficacy of our approach in both
+intent detection and slot-filling tasks, outperforming state-of-the-art
+methods.
+
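+ A hedged sketch of a Jensen-Shannon distillation loss of the kind JSKD
+describes, computed between teacher and student logits; the exact formulation
+used in the paper may differ.
+
+import torch
+import torch.nn.functional as F
+
+def js_distillation_loss(student_logits, teacher_logits, temperature=1.0):
+    p = F.softmax(student_logits / temperature, dim=-1)
+    q = F.softmax(teacher_logits / temperature, dim=-1)
+    m = 0.5 * (p + q)                              # mixture distribution
+    kl = lambda a, b: torch.sum(a * (a.clamp_min(1e-12).log() - b.clamp_min(1e-12).log()), dim=-1)
+    return (0.5 * kl(p, m) + 0.5 * kl(q, m)).mean()  # JS = average of the two KLs to the mixture
+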
+
+
+
+
+ + ☆ Towards the TopMost: A Topic Modeling System Toolkit + + +
+ Topic models have been proposed for decades with various applications and
+have recently been refreshed by neural variational inference. However, these
+topic models adopt entirely distinct dataset, implementation, and evaluation
+settings, which hinders their quick utilization and fair comparison, and in
+turn slows the research progress of topic models. To address these issues, in
+this paper we propose a Topic Modeling System Toolkit (TopMost). Compared to
+existing toolkits, TopMost stands out by covering a wider range of topic
+modeling scenarios, including complete lifecycles with dataset pre-processing,
+model training, testing, and evaluation. The highly cohesive and decoupled
+modular design of TopMost enables quick utilization, fair comparisons, and
+flexible extensions of different topic models. This can facilitate the research
+and applications of topic models. Our code, tutorials, and documentation are
+available at https://github.com/bobxwu/topmost.
+
+
+
+
+
+ + ☆ Gpachov at CheckThat! 2023: A Diverse Multi-Approach Ensemble for + Subjectivity Detection in News Articles + + +
+ The widespread use of social networks has given rise to subjective,
+misleading, and even false information on the Internet. Thus, subjectivity
+detection can play an important role in ensuring the objectivity and quality of
+a piece of information. This paper presents the solution built by the Gpachov
+team for the CLEF-2023 CheckThat! lab Task 2 on subjectivity detection. Three
+different research directions are explored. The first one is based on
+fine-tuning a sentence embeddings encoder model and dimensionality reduction.
+The second one explores a sample-efficient few-shot learning model. The third
+one evaluates fine-tuning a multilingual transformer on an altered dataset,
+using data from multiple languages. Finally, the three approaches are combined
+in a simple majority-voting ensemble, resulting in 0.77 macro F1 on the test
+set and achieving 2nd place on the English subtask.
+
+
+
+
+
+ + ☆ Comparative Analysis of Contextual Relation Extraction based on Deep + Learning Models + + +
+ Contextual Relation Extraction (CRE) is mainly used for constructing a
+knowledge graph with the help of an ontology. It supports tasks such as
+semantic search, query answering, and textual entailment. Relation extraction
+identifies the entities from raw texts and the relations among them. An
+efficient and accurate CRE system is essential for creating domain knowledge in
+the biomedical industry. Existing Machine Learning and Natural Language
+Processing (NLP) techniques are not suitable for efficiently predicting complex
+relations from sentences that consist of more than two relations and
+unspecified entities. In this work, deep learning techniques have been used to
+identify the appropriate semantic relation based on the context from multiple
+sentences. Even though various machine learning models have been used for
+relation extraction, they provide better results only for binary relations,
+i.e., relations occurring between exactly two entities in a sentence. Machine
+learning models are not suited for complex sentences that consist of words
+with multiple meanings. To address these issues, hybrid deep learning models
+have been used to extract relations from complex sentences effectively. This
+paper explores the analysis of various deep learning models that are used for
+relation extraction.
+
+
+ comment: This paper was presented at the International Conference on FOSS
+ Approaches towards Computational Intelligence and Language Technology,
+ February 2023, Thiruvananthapuram
+
+
+
+
+
+ + ☆ Cognitive Mirage: A Review of Hallucinations in Large Language Models + + +
+ As large language models continue to develop in the field of AI, text
+generation systems are susceptible to a worrisome phenomenon known as
+hallucination. In this study, we summarize recent compelling insights into
+hallucinations in LLMs. We present a novel taxonomy of hallucinations across
+various text generation tasks, thus providing theoretical insights, detection
+methods, and improvement approaches. Based on this, future research directions
+are proposed. Our contributions are threefold: (1) We provide a detailed and
+complete taxonomy for hallucinations appearing in text generation tasks; (2) We
+provide theoretical analyses of hallucinations in LLMs and summarize existing
+detection and improvement methods; (3) We propose several research directions
+that can be developed in the future. As hallucinations garner significant
+attention from the community, we will maintain updates on relevant research
+progress.
+
+
+ comment: work in progress; 21 pages +
+
+
+
+
+ + ☆ Scaled Prompt-Tuning for Few-Shot Natural Language Generation + + +
+ Increasingly large language models (LLMs) demonstrate stronger language
+understanding and generation capabilities, while the memory demand and
+computation cost of fine-tuning LLMs on downstream tasks are non-negligible.
+Besides, fine-tuning generally requires a certain amount of data from
+individual tasks, while data collection cost is another issue to consider in
+real-world applications. In this work, we focus on Parameter-Efficient
+Fine-Tuning (PEFT) methods for few-shot Natural Language Generation (NLG),
+which freeze most parameters in LLMs and tune a small subset of parameters in
+few-shot cases so that memory footprint, training cost, and labeling cost are
+reduced while maintaining or even improving performance. We propose a Scaled
+Prompt-Tuning (SPT) method which surpasses conventional PT with better
+performance and generalization ability but without an obvious increase in
+training cost. A further study on intermediate SPT suggests the superior
+transferability of SPT in few-shot scenarios, providing a recipe for
+data-deficient and computation-limited circumstances. Moreover, a comprehensive
+comparison of existing PEFT methods reveals that certain approaches exhibiting
+decent performance with modest training cost, such as Prefix-Tuning in prior
+studies, could struggle in few-shot NLG tasks, especially on challenging
+datasets.
+
+
+
+
+
+ + ☆ CONVERSER: Few-Shot Conversational Dense Retrieval with Synthetic Data + Generation SIGDIAL 2023 + + +
+ Conversational search provides a natural interface for information retrieval +(IR). Recent approaches have demonstrated promising results in applying dense +retrieval to conversational IR. However, training dense retrievers requires +large amounts of in-domain paired data. This hinders the development of +conversational dense retrievers, as abundant in-domain conversations are +expensive to collect. In this paper, we propose CONVERSER, a framework for +training conversational dense retrievers with at most 6 examples of in-domain +dialogues. Specifically, we utilize the in-context learning capability of large +language models to generate conversational queries given a passage in the +retrieval corpus. Experimental results on conversational retrieval benchmarks +OR-QuAC and TREC CAsT 19 show that the proposed CONVERSER achieves comparable +performance to fully-supervised models, demonstrating the effectiveness of our +proposed framework in few-shot conversational dense retrieval. All source code +and generated datasets are available at https://github.com/MiuLab/CONVERSER + +
+
+ comment: Accepted to SIGDIAL 2023 +
+
+
+
+
+ + ☆ Enhancing Keyphrase Generation by BART Finetuning with Splitting and + Shuffling + + +
+ Keyphrase generation is the task of identifying a set of phrases that best
+represent the main topics or themes of a given text. Keyphrases are divided
+into present and absent keyphrases. Recent approaches utilizing
+sequence-to-sequence models show effectiveness on absent keyphrase generation.
+However, the performance is still limited due to the difficulty of finding
+absent keyphrases. In this paper, we propose Keyphrase-Focused BART, which
+exploits the differences between present and absent keyphrase generation, and
+fine-tunes two separate BART models for present and absent keyphrases. We
+further show effective approaches of shuffling keyphrases and candidate
+keyphrase ranking. For absent keyphrases, our Keyphrase-Focused BART achieved a
+new state-of-the-art score on F1@5 in two out of five keyphrase generation
+benchmark datasets.
+
+
+
+
+
+ + ☆ Simultaneous Machine Translation with Large Language Models + + +
+ Large language models (LLMs) have demonstrated their abilities to solve
+various natural language processing tasks through dialogue-based interactions.
+For instance, research indicates that LLMs can achieve competitive performance
+in offline machine translation tasks for high-resource languages. However,
+applying LLMs to simultaneous machine translation (SimulMT) poses many
+challenges, including issues related to the training-inference mismatch arising
+from different decoding patterns. In this paper, we explore the feasibility of
+utilizing LLMs for SimulMT. Building upon conventional approaches, we introduce
+a simple yet effective mixture policy that enables LLMs to engage in SimulMT
+without requiring additional training. Furthermore, after Supervised
+Fine-Tuning (SFT) on a mixture of full and prefix sentences, the model exhibits
+significant performance improvements. Our experiments, conducted with
+Llama2-7B-chat on nine language pairs from the MUST-C dataset, demonstrate that
+LLMs can achieve translation quality and latency comparable to dedicated
+SimulMT models.
+
+
+
+
+
+ + ☆ VLSlice: Interactive Vision-and-Language Slice Discovery ICCV 2023 + + +
+ Recent work in vision-and-language demonstrates that large-scale pretraining +can learn generalizable models that are efficiently transferable to downstream +tasks. While this may improve dataset-scale aggregate metrics, analyzing +performance around hand-crafted subgroups targeting specific bias dimensions +reveals systemic undesirable behaviors. However, this subgroup analysis is +frequently stalled by annotation efforts, which require extensive time and +resources to collect the necessary data. Prior art attempts to automatically +discover subgroups to circumvent these constraints but typically leverages +model behavior on existing task-specific annotations and rapidly degrades on +more complex inputs beyond "tabular" data, none of which study +vision-and-language models. This paper presents VLSlice, an interactive system +enabling user-guided discovery of coherent representation-level subgroups with +consistent visiolinguistic behavior, denoted as vision-and-language slices, +from unlabeled image sets. We show that VLSlice enables users to quickly +generate diverse high-coherency slices in a user study (n=22) and release the +tool publicly. + +
+
+ comment: Conference paper at ICCV 2023. 17 pages, 11 figures. + https://ericslyman.com/vlslice/ +
+
+
+
+
+ + ☆ Benchmarking Procedural Language Understanding for Low-Resource + Languages: A Case Study on Turkish + + +
+ Understanding procedural natural language (e.g., step-by-step instructions)
+is a crucial step towards execution and planning. However, while there are
+ample corpora and downstream tasks available in English, the field lacks such
+resources for most languages. To address this gap, we conduct a case study on
+Turkish procedural texts. We first expand the number of tutorials in Turkish
+wikiHow from 2,000 to 52,000 using automated translation tools, where the
+translation quality and loyalty to the original meaning are validated by a team
+of experts on a random set. Then, we generate several downstream tasks on the
+corpus, such as linking actions, goal inference, and summarization. To tackle
+these tasks, we implement strong baseline models via fine-tuning large
+language-specific models such as TR-BART and BERTurk, as well as multilingual
+models such as mBART, mT5, and XLM. We find that language-specific models
+consistently outperform their multilingual counterparts by a significant margin
+across most procedural language understanding (PLU) tasks. We release our
+corpus, downstream tasks, and the baseline models at
+https://github.com/GGLAB-KU/turkish-plu.
+
+
+ comment: 9 pages +
+
+
+
+
+ + ☆ Offline Prompt Evaluation and Optimization with Inverse Reinforcement + Learning + + +
+ The recent advances in the development of Large Language Models (LLMs) like
+ChatGPT have achieved remarkable performance by leveraging human expertise.
+Yet, fully eliciting LLMs' potential for complex tasks requires navigating the
+vast search space of natural language prompts. While prompt engineering has
+shown promise, the requisite human-crafted prompts in trial-and-error attempts
+and the associated costs pose significant challenges. Crucially, the efficiency
+of prompt optimization hinges on the costly procedure of prompt evaluation.
+This work introduces Prompt-OIRL, an approach rooted in offline inverse
+reinforcement learning that seeks to bridge the gap between effective prompt
+evaluation and affordability. Our method draws on offline datasets from expert
+evaluations, employing Inverse-RL to derive a reward model for offline,
+query-dependent prompt evaluation. The advantages of Prompt-OIRL are manifold:
+it predicts prompt performance, is cost-efficient, produces human-readable
+results, and efficiently navigates the prompt space. We validate our method
+across four LLMs and three arithmetic datasets, highlighting its potential as a
+robust and effective tool for offline prompt evaluation and optimization. Our
+code and the offline datasets are released, and we highlight that Prompt-OIRL
+can be reproduced within a few hours on a single laptop using only a CPU.
+
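+ A loose sketch of the offline, query-dependent prompt-evaluation idea: fit a
+reward model on logged (query, prompt, success) data, then rank candidate
+prompts for a new query without calling the LLM. The feature construction and
+the classifier choice below are our assumptions, not the paper's method.
+
+import numpy as np
+from sklearn.linear_model import LogisticRegression
+
+def train_prompt_reward_model(query_feats, prompt_feats, successes):
+    X = np.concatenate([query_feats, prompt_feats], axis=1)
+    return LogisticRegression(max_iter=1000).fit(X, successes)
+
+def pick_best_prompt(model, query_feat, candidate_prompt_feats):
+    n = len(candidate_prompt_feats)
+    X = np.concatenate([np.tile(query_feat, (n, 1)), candidate_prompt_feats], axis=1)
+    return int(np.argmax(model.predict_proba(X)[:, 1]))  # index of the highest-scoring prompt
+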
+
+
+
+
+ + ☆ Statistical Rejection Sampling Improves Preference Optimization + + +
+ Improving the alignment of language models with human preferences remains an +active research challenge. Previous approaches have primarily utilized +Reinforcement Learning from Human Feedback (RLHF) via online RL methods such as +Proximal Policy Optimization (PPO). Recently, offline methods such as Sequence +Likelihood Calibration (SLiC) and Direct Preference Optimization (DPO) have +emerged as attractive alternatives, offering improvements in stability and +scalability while maintaining competitive performance. SLiC refines its loss +function using sequence pairs sampled from a supervised fine-tuned (SFT) +policy, while DPO directly optimizes language models based on preference data, +foregoing the need for a separate reward model. However, the maximum likelihood +estimator (MLE) of the target optimal policy requires labeled preference pairs +sampled from that policy. DPO's lack of a reward model constrains its ability +to sample preference pairs from the optimal policy, and SLiC is restricted to +sampling preference pairs only from the SFT policy. To address these +limitations, we introduce a novel approach called Statistical Rejection +Sampling Optimization (RSO) that aims to source preference data from the target +optimal policy using rejection sampling, enabling a more accurate estimation of +the optimal policy. We also propose a unified framework that enhances the loss +functions used in both SLiC and DPO from a preference modeling standpoint. +Through extensive experiments across three diverse tasks, we demonstrate that +RSO consistently outperforms both SLiC and DPO on evaluations from both Large +Language Model (LLM) and human raters. + +
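+ The core sampling step described above can be sketched as follows: candidates
+drawn from the SFT policy are accepted with probability
+exp((reward - max_reward) / beta), so the accepted set approximates the
+reward-tilted target policy used to build preference pairs. This is our reading
+of the idea, not the authors' implementation; names and the beta value are
+placeholders.
+
+import math
+import random
+
+def rejection_sample(candidates, rewards, beta=0.5, num_keep=8):
+    r_max = max(rewards)
+    kept = []
+    for y, r in zip(candidates, rewards):
+        if random.random() < math.exp((r - r_max) / beta):  # acceptance test
+            kept.append(y)
+        if len(kept) >= num_keep:
+            break
+    return kept
+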
+
+
+
+
+ + ☆ Learning from Auxiliary Sources in Argumentative Revision Classification + + +
+ We develop models to classify desirable reasoning revisions in argumentative +writing. We explore two approaches -- multi-task learning and transfer learning +-- to take advantage of auxiliary sources of revision data for similar tasks. +Results of intrinsic and extrinsic evaluations show that both approaches can +indeed improve classifier performance over baselines. While multi-task learning +shows that training on different sources of data at the same time may improve +performance, transfer-learning better represents the relationship between the +data. + +
+
+
+
+
+ + ☆ Traveling Words: A Geometric Interpretation of Transformers + + +
+ Transformers have significantly advanced the field of natural language +processing, but comprehending their internal mechanisms remains a challenge. In +this paper, we introduce a novel geometric perspective that elucidates the +inner mechanisms of transformer operations. Our primary contribution is +illustrating how layer normalization confines the latent features to a +hyper-sphere, subsequently enabling attention to mold the semantic +representation of words on this surface. This geometric viewpoint seamlessly +connects established properties such as iterative refinement and contextual +embeddings. We validate our insights by probing a pre-trained 124M parameter +GPT-2 model. Our findings reveal clear query-key attention patterns in early +layers and build upon prior observations regarding the subject-specific nature +of attention heads at deeper layers. Harnessing these geometric insights, we +present an intuitive understanding of transformers, depicting them as processes +that model the trajectory of word particles along the hyper-sphere. + +
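+ The hyper-sphere claim above can be checked numerically: standard layer
+normalization (without a learned scale and shift) maps every vector to roughly
+the same Euclidean norm, about sqrt(d). The toy check below is our own
+illustration, not code from the paper.
+
+import torch
+
+d = 768
+x = torch.randn(5, d)                                  # five arbitrary hidden states
+layer_norm = torch.nn.LayerNorm(d, elementwise_affine=False)
+print(layer_norm(x).norm(dim=-1))                      # all close to sqrt(768) ~ 27.7
+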
+
+
+
+
+ + ☆ Sudden Drops in the Loss: Syntax Acquisition, Phase Transitions, and + Simplicity Bias in MLMs + + +
+ Most interpretability research in NLP focuses on understanding the behavior +and features of a fully trained model. However, certain insights into model +behavior may only be accessible by observing the trajectory of the training +process. In this paper, we present a case study of syntax acquisition in masked +language models (MLMs). Our findings demonstrate how analyzing the evolution of +interpretable artifacts throughout training deepens our understanding of +emergent behavior. In particular, we study Syntactic Attention Structure (SAS), +a naturally emerging property of MLMs wherein specific Transformer heads tend +to focus on specific syntactic relations. We identify a brief window in +training when models abruptly acquire SAS and find that this window is +concurrent with a steep drop in loss. Moreover, SAS precipitates the subsequent +acquisition of linguistic capabilities. We then examine the causal role of SAS +by introducing a regularizer to manipulate SAS during training, and demonstrate +that SAS is necessary for the development of grammatical capabilities. We +further find that SAS competes with other beneficial traits and capabilities +during training, and that briefly suppressing SAS can improve model quality. +These findings reveal a real-world example of the relationship between +disadvantageous simplicity bias and interpretable breakthrough training +dynamics. + +
+
+
+
+
+ + ☆ In-Contextual Bias Suppression for Large Language Models + + +
+ Despite their impressive performance in a wide range of NLP tasks, Large
+Language Models (LLMs) have been reported to encode worrying levels of gender
+bias. Prior work has proposed debiasing methods that require human-labelled
+examples, data augmentation, and fine-tuning of the LLMs, which are
+computationally costly. Moreover, one might not even have access to the
+internal parameters needed for debiasing, as in the case of commercially
+available LLMs such as GPT-4. To address this challenge, we propose bias
+suppression, a novel alternative to debiasing that does not require access to
+model parameters. We show that text-based preambles, generated from manually
+designed templates covering counterfactual statements, can accurately suppress
+gender biases in LLMs. Moreover, we find that descriptive sentences for
+occupations can further suppress gender biases. Interestingly, we find that
+bias suppression has a minimal adverse effect on downstream task performance,
+while effectively mitigating gender biases.
+
+
+ comment: 13 pages +
+
+
+
+
+ + ♻ ☆ NExT-GPT: Any-to-Any Multimodal LLM + + +
+ While Multimodal Large Language Models (MM-LLMs) have recently made exciting
+strides, they mostly fall prey to the limitation of input-side multimodal
+understanding only, without the ability to produce content in multiple
+modalities. As we humans always perceive the world and communicate with people
+through various modalities, developing any-to-any MM-LLMs capable of accepting
+and delivering content in any modality becomes essential to human-level AI. To
+fill the gap, we present an end-to-end general-purpose any-to-any MM-LLM
+system, NExT-GPT. We connect an LLM with multimodal adaptors and different
+diffusion decoders, enabling NExT-GPT to perceive inputs and generate outputs
+in arbitrary combinations of text, images, videos, and audio. By leveraging
+existing well-trained, highly performing encoders and decoders, NExT-GPT is
+tuned with only a small number of parameters (1%) in certain projection layers,
+which not only enables low-cost training but also facilitates convenient
+expansion to more potential modalities. Moreover, we introduce
+modality-switching instruction tuning (MosIT) and manually curate a
+high-quality dataset for MosIT, based on which NExT-GPT is empowered with
+complex cross-modal semantic understanding and content generation. Overall, our
+research showcases the promising possibility of building an AI agent capable of
+modeling universal modalities, paving the way for more human-like AI research
+in the community. Project page: https://next-gpt.github.io/
+
+
+ comment: work in progress +
+
+
+
+
+ + ♻ ☆ Spaiche: Extending State-of-the-Art ASR Models to Swiss German Dialects + + +
+ Recent breakthroughs in NLP largely increased the presence of ASR systems in +our daily lives. However, for many low-resource languages, ASR models still +need to be improved due in part to the difficulty of acquiring pertinent data. +This project aims to help advance research in ASR models for Swiss German +dialects, by providing insights about the performance of state-of-the-art ASR +models on recently published Swiss German speech datasets. We propose a novel +loss that takes into account the semantic distance between the predicted and +the ground-truth labels. We outperform current state-of-the-art results by +fine-tuning OpenAI's Whisper model on Swiss-German datasets. + +
+
+ comment: 8 pages, SwissText conference +
+
+
+
+
+ + ♻ ☆ Stochastic LLMs do not Understand Language: Towards Symbolic, + Explainable and Ontologically Based LLMs + + +
+ In our opinion, the exuberance surrounding the relative success of
+data-driven large language models (LLMs) is slightly misguided, for several
+reasons: (i) LLMs cannot be relied upon for factual information since, for
+LLMs, all ingested text (factual or non-factual) was created equal; (ii) due to
+their subsymbolic nature, whatever 'knowledge' these models acquire about
+language will always be buried in billions of microfeatures (weights), none of
+which is meaningful on its own; and (iii) LLMs will often fail to make the
+correct inferences in several linguistic contexts (e.g., nominal compounds,
+copredication, quantifier scope ambiguities, intensional contexts). Since we
+believe the relative success of data-driven large language models (LLMs) is not
+a reflection on the symbolic vs. subsymbolic debate but a reflection of
+applying the successful strategy of a bottom-up reverse engineering of language
+at scale, we suggest in this paper applying the effective bottom-up strategy in
+a symbolic setting, resulting in symbolic, explainable, and ontologically
+grounded language models.
+
+
+ comment: 17 pages +
+
+
+
+
+ + ♻ ☆ A Methodology for Generative Spelling Correction via Natural Spelling + Errors Emulation across Multiple Domains and Languages EACL 2024 + + +
+ Modern large language models demonstrate impressive capabilities in text
+generation and generalization. However, they often struggle with solving text
+editing tasks, particularly when it comes to correcting spelling errors and
+mistypings. In this paper, we present a methodology for generative spelling
+correction (SC), which was tested on the English and Russian languages and can
+potentially be extended to any language with minor changes. Our research mainly
+focuses on exploring natural spelling errors and mistypings in texts and
+studying the ways those errors can be emulated in correct sentences to
+effectively enrich generative models' pre-training procedure. We investigate
+the impact of such emulations and the models' abilities across different text
+domains. In this work, we investigate two spelling corruption techniques: 1)
+the first mimics human behavior when making a mistake by leveraging error
+statistics from a particular dataset, and 2) the second adds the most common
+spelling errors, keyboard misclicks, and some heuristics within the texts. We
+conducted experiments employing various corruption strategies, model
+architectures, and sizes at the pre-training and fine-tuning stages, and
+evaluated the models using single-domain and multi-domain test sets. As a
+practical outcome of our work, we introduce SAGE (Spell checking via
+Augmentation and Generative distribution Emulation), a library for automatic
+generative SC that includes a family of pre-trained generative models and
+built-in augmentation algorithms.
+
+
+ comment: to appear in EACL 2024 +
+
+
+
+
+ + ♻ ☆ ColD Fusion: Collaborative Descent for Distributed Multitask Finetuning ACL 23 + + +
+ We propose a new paradigm to continually evolve pretrained models, denoted
+ColD Fusion. It provides the benefits of multitask learning but leverages
+distributed computation with limited communication and eliminates the need for
+shared data. Consequently, ColD Fusion can give rise to a synergistic loop,
+where finetuned models can be recycled to continually improve the pretrained
+model they are based upon. We show that ColD Fusion yields comparable benefits
+to multitask training by producing a model that (a) attains strong performance
+on all of the datasets it was trained on; and (b) is a better starting point
+for finetuning on unseen datasets. We show that ColD Fusion outperforms RoBERTa
+and even previous multitask models. Specifically, when training and testing on
+35 diverse datasets, a ColD Fusion-based model outperforms RoBERTa by 2.33
+points on average without any changes to the architecture.
+
+
+ comment: ACL 23 +
+
+
+
+
+ + ♻ ☆ Event and Entity Extraction from Generated Video Captions + + +
+ Annotation of multimedia data by humans is time-consuming and costly, while +reliable automatic generation of semantic metadata is a major challenge. We +propose a framework to extract semantic metadata from automatically generated +video captions. As metadata, we consider entities, the entities' properties, +relations between entities, and the video category. We employ two +state-of-the-art dense video captioning models with masked transformer (MT) and +parallel decoding (PVDC) to generate captions for videos of the ActivityNet +Captions dataset. Our experiments show that it is possible to extract entities, +their properties, relations between entities, and the video category from the +generated captions. We observe that the quality of the extracted information is +mainly influenced by the quality of the event localization in the video as well +as the performance of the event caption generation. + +
+
+ comment: Paper accepted at CD-MAKE 2023 +
+
+
+
+
+ + ♻ ☆ GRDD: A Dataset for Greek Dialectal NLP + + +
+ In this paper, we present a dataset for the computational study of a number
+of Modern Greek dialects. It consists of raw text data from four dialects of
+Modern Greek: Cretan, Pontic, Northern Greek, and Cypriot Greek. The dataset is
+of considerable size, albeit imbalanced, and represents the first attempt to
+create large-scale dialectal resources of this type for Modern Greek dialects.
+We then use the dataset to perform dialect identification. We experiment with
+traditional ML algorithms, as well as simple DL architectures. The results show
+very good performance on the task, potentially revealing that the dialects in
+question have distinct enough characteristics to allow even simple ML models to
+perform well on the task. Error analysis is performed for the top-performing
+algorithms, showing that in a number of cases the errors are due to
+insufficient dataset cleaning.
+
+
+
+
+
+ + ♻ ☆ Do Language Models Know When They're Hallucinating References? + + +
+ State-of-the-art language models (LMs) are famous for "hallucinating"
+references. These fabricated article and book titles lead to harms, obstacles
+to their use, and public backlash. While other types of LM hallucinations are
+also important, we propose hallucinated references as the "drosophila" of
+research on hallucination in large language models (LLMs), as they are
+particularly easy to study. We show that simple search engine queries reliably
+identify such hallucinations, which facilitates evaluation. To begin to dissect
+the nature of hallucinated LM references, we attempt to classify them using
+black-box queries to the same LM, without consulting any external resources.
+Consistency checks done with "direct" queries about whether the generated
+reference title is real (inspired by Kadavath et al. 2022, Lin et al. 2022,
+Manakul et al. 2023) are compared to consistency checks with "indirect" queries
+which ask for ancillary details such as the authors of the work. These
+consistency checks are found to be partially reliable indicators of whether or
+not the reference is a hallucination. In particular, we find that LMs often
+hallucinate differing authors of hallucinated references when queried in
+independent sessions, while consistently identifying the authors of real
+references. This suggests that hallucination may be more of a generation issue
+than something inherent to current training techniques or representations.
+
+
+
+
+
+ + ♻ ☆ ChatRule: Mining Logical Rules with Large Language Models for Knowledge + Graph Reasoning + + +
+ Logical rules are essential for uncovering the logical connections between +relations, which could improve the reasoning performance and provide +interpretable results on knowledge graphs (KGs). Although there have been many +efforts to mine meaningful logical rules over KGs, existing methods suffer from +the computationally intensive searches over the rule space and a lack of +scalability for large-scale KGs. Besides, they often ignore the semantics of +relations which is crucial for uncovering logical connections. Recently, large +language models (LLMs) have shown impressive performance in the field of +natural language processing and various applications, owing to their emergent +ability and generalizability. In this paper, we propose a novel framework, +ChatRule, unleashing the power of large language models for mining logical +rules over knowledge graphs. Specifically, the framework is initiated with an +LLM-based rule generator, leveraging both the semantic and structural +information of KGs to prompt LLMs to generate logical rules. To refine the +generated rules, a rule ranking module estimates the rule quality by +incorporating facts from existing KGs. Last, a rule validator harnesses the +reasoning ability of LLMs to validate the logical correctness of ranked rules +through chain-of-thought reasoning. ChatRule is evaluated on four large-scale +KGs, w.r.t. different rule quality metrics and downstream tasks, showing the +effectiveness and scalability of our method. + +
+
+ comment: 11 pages, 4 figures +
+
+
+
+
+ + ♻ ☆ A Comprehensive Overview of Large Language Models + + +
+ Large Language Models (LLMs) have recently demonstrated remarkable
+capabilities in natural language processing tasks and beyond. This success of
+LLMs has led to a large influx of research contributions in this direction.
+These works encompass diverse topics such as architectural innovations of the
+underlying neural networks, context length improvements, model alignment,
+training datasets, benchmarking, efficiency, and more. With the rapid
+development of techniques and regular breakthroughs in LLM research, it has
+become considerably challenging to perceive the bigger picture of the advances
+in this direction. Considering the rapidly emerging plethora of literature on
+LLMs, it is imperative that the research community is able to benefit from a
+concise yet comprehensive overview of the recent developments in this field.
+This article provides that overview to the research community. It not only
+focuses on a systematic treatment of the existing literature on a broad range
+of LLM-related concepts, but also pays special attention to providing
+comprehensive summaries with extensive details about the individual existing
+models, datasets, and major insights. We also pay heed to aligning our overview
+with the emerging outlook of this research direction by accounting for the
+other recently materializing reviews of the broader research direction of LLMs.
+Our self-contained comprehensive overview of LLMs discusses relevant background
+concepts along with covering the advanced topics at the frontier of this
+research direction. This review article is intended to provide not only a
+systematic survey but also a quick, comprehensive reference for researchers and
+practitioners to draw insights from extensive informative summaries of the
+existing works to advance the LLM research direction.
+
+
+ comment: Work in-progress +
+
+
+
+
+ + ♻ ☆ Does ChatGPT have Theory of Mind? + + +
+ Theory of Mind (ToM) is the ability to understand human thinking and +decision-making, an ability that plays a crucial role in social interaction +between people, including linguistic communication. This paper investigates to +what extent recent Large Language Models in the ChatGPT tradition possess ToM. +We posed six well-known problems that address biases in human reasoning and +decision making to two versions of ChatGPT and we compared the results under a +range of prompting strategies. While the results concerning ChatGPT-3 were +somewhat inconclusive, ChatGPT-4 was shown to arrive at the correct answers +more often than would be expected based on chance, although correct answers +were often arrived at on the basis of false assumptions or invalid reasoning. + +
+
+
+
+
+ + ♻ ☆ CoMoSpeech: One-Step Speech and Singing Voice Synthesis via Consistency + Model ACM MM 2023 + + +
+ Denoising diffusion probabilistic models (DDPMs) have shown promising
+performance for speech synthesis. However, a large number of iterative steps
+are required to achieve high sample quality, which restricts the inference
+speed. Maintaining sample quality while increasing sampling speed has become a
+challenging task. In this paper, we propose a "Co"nsistency "Mo"del-based
+"Speech" synthesis method, CoMoSpeech, which achieves speech synthesis through
+a single diffusion sampling step while maintaining high audio quality. The
+consistency constraint is applied to distill a consistency model from a
+well-designed diffusion-based teacher model, which ultimately yields superior
+performance in the distilled CoMoSpeech. Our experiments show that by
+generating audio recordings with a single sampling step, CoMoSpeech achieves an
+inference speed more than 150 times faster than real-time on a single NVIDIA
+A100 GPU, which is comparable to FastSpeech2, making diffusion-sampling-based
+speech synthesis truly practical. Meanwhile, objective and subjective
+evaluations on text-to-speech and singing voice synthesis show that the
+proposed teacher models yield the best audio quality, and the one-step-sampling
+CoMoSpeech achieves the best inference speed with better or comparable audio
+quality to other conventional multi-step diffusion model baselines. Audio
+samples are available at https://comospeech.github.io/.
+
+
+ comment: Accepted to ACM MM 2023 +
+
+
+
+
+ + ♻ ☆ Cross-corpus Readability Compatibility Assessment for English Texts + + +
+ Text readability assessment has gained significant attention from researchers +in various domains. However, the lack of exploration into corpus compatibility +poses a challenge as different research groups utilize different corpora. In +this study, we propose a novel evaluation framework, Cross-corpus text +Readability Compatibility Assessment (CRCA), to address this issue. The +framework encompasses three key components: (1) Corpus: CEFR, CLEC, CLOTH, NES, +OSP, and RACE. Linguistic features, GloVe word vector representations, and +their fusion features were extracted. (2) Classification models: Machine +learning methods (XGBoost, SVM) and deep learning methods (BiLSTM, +Attention-BiLSTM) were employed. (3) Compatibility metrics: RJSD, RRNSS, and +NDCG metrics. Our findings revealed: (1) Validated corpus compatibility, with +OSP standing out as significantly different from other datasets. (2) An +adaptation effect among corpora, feature representations, and classification +methods. (3) Consistent outcomes across the three metrics, validating the +robustness of the compatibility assessment framework. The outcomes of this +study offer valuable insights into corpus selection, feature representation, +and classification methods, and it can also serve as a beginning effort for +cross-corpus transfer learning. + +
+
+ comment: 14 pages,17 figures +
+
+
+
+
+ + ♻ ☆ Diminished Diversity-of-Thought in a Standard Large Language Model + + +
+ We test whether Large Language Models (LLMs) can be used to simulate human +participants in social-science studies. To do this, we run replications of 14 +studies from the Many Labs 2 replication project with OpenAI's text-davinci-003 +model, colloquially known as GPT3.5. Based on our pre-registered analyses, we +find that among the eight studies we could analyse, our GPT sample replicated +37.5% of the original results and 37.5% of the Many Labs 2 results. However, we +were unable to analyse the remaining six studies due to an unexpected +phenomenon we call the "correct answer" effect. Different runs of GPT3.5 +answered nuanced questions probing political orientation, economic preference, +judgement, and moral philosophy with zero or near-zero variation in responses: +with the supposedly "correct answer." In one exploratory follow-up study, we +found that a "correct answer" was robust to changing the demographic details +that precede the prompt. In another, we found that most but not all "correct +answers" were robust to changing the order of answer choices. One of our most +striking findings occurred in our replication of the Moral Foundations Theory +survey results, where we found GPT3.5 identifying as a political conservative +in 99.6% of the cases, and as a liberal in 99.3% of the cases in the +reverse-order condition. However, both self-reported 'GPT conservatives' and +'GPT liberals' showed right-leaning moral foundations. Our results cast doubts +on the validity of using LLMs as a general replacement for human participants +in the social sciences. Our results also raise concerns that a hypothetical +AI-led future may be subject to a diminished diversity-of-thought. + +
+
+ comment: 67 pages (42-page main text, 25-page SI); 12 visualizations (four + tables and three figures in the main text, five figures in the SI); + additional exploratory follow-up study varied the demographic details + preceding the prompt; preregistered OSF database is available at + https://osf.io/dzp8t/ +
+
+
+
+
+ + ♻ ☆ GEmo-CLAP: Gender-Attribute-Enhanced Contrastive Language-Audio + Pretraining for Accurate Speech Emotion Recognition + + +
+ Contrastive cross-modality pretraining has recently exhibited impressive +success in diverse fields, whereas there is limited research on their merits in +speech emotion recognition (SER). In this paper, we propose GEmo-CLAP, a kind +of gender-attribute-enhanced contrastive language-audio pretraining (CLAP) +method for SER. Specifically, we first construct an effective emotion CLAP +(Emo-CLAP) for SER, using pre-trained text and audio encoders. Second, given +the significance of gender information in SER, two novel multi-task learning +based GEmo-CLAP (ML-GEmo-CLAP) and soft label based GEmo-CLAP (SL-GEmo-CLAP) +models are further proposed to incorporate gender information of speech +signals, forming more reasonable objectives. Experiments on IEMOCAP indicate +that our proposed two GEmo-CLAPs consistently outperform Emo-CLAP with +different pre-trained models. Remarkably, the proposed WavLM-based SL-GEmo-CLAP +obtains the best UAR of 81.43% and WAR of 83.16%, which performs better than +state-of-the-art SER methods by at least 3%. Our system is open-sourced on +Github. + +
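+ For reference, the following is a hedged sketch of a plain CLAP-style
+symmetric contrastive loss between paired audio and text embeddings; the
+gender-attribute extensions proposed above (the multi-task and soft-label
+objectives) are omitted, and all names are placeholders.
+
+import torch
+import torch.nn.functional as F
+
+def clap_contrastive_loss(audio_emb, text_emb, temperature=0.07):
+    a = F.normalize(audio_emb, dim=-1)
+    t = F.normalize(text_emb, dim=-1)
+    logits = a @ t.T / temperature                 # pairwise cosine similarities
+    targets = torch.arange(a.size(0))              # the i-th audio matches the i-th text
+    return 0.5 * (F.cross_entropy(logits, targets) + F.cross_entropy(logits.T, targets))
+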
+
+ comment: 5 pages +
+
+
+
+
+ + ♻ ☆ Scaling Relationship on Learning Mathematical Reasoning with Large + Language Models + + +
+ Mathematical reasoning is a challenging task for large language models
+(LLMs), while its scaling relationship with respect to LLM capacity is
+under-explored. In this paper, we investigate how the pre-training loss,
+supervised data amount, and augmented data amount influence the reasoning
+performance of a supervised LLM. We find that pre-training loss is a better
+indicator of the model's performance than the model's parameter count. We apply
+supervised fine-tuning (SFT) with different amounts of supervised data and
+empirically find a log-linear relation between data amount and model
+performance, and we find that better models improve less with enlarged
+supervised datasets. To augment more data samples for improving model
+performance without any human effort, we propose to apply Rejection sampling
+Fine-Tuning (RFT). RFT uses supervised models to generate and collect correct
+reasoning paths as augmented fine-tuning datasets. We find that with augmented
+samples containing more distinct reasoning paths, RFT improves mathematical
+reasoning performance more for LLMs. We also find that RFT brings more
+improvement for less performant LLMs. Furthermore, we combine rejection samples
+from multiple models, which pushes LLaMA-7B to an accuracy of 49.3\% on GSM8K,
+significantly outperforming the supervised fine-tuning (SFT) accuracy of
+35.9\%.
+
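+ The RFT data-collection loop described above can be sketched as follows:
+sample several reasoning paths per question from a supervised model and keep
+the distinct paths whose final answer is correct. The `sample_fn` hook and the
+GSM8K-style answer marker are placeholders, not the paper's API.
+
+def collect_rft_data(questions, gold_answers, sample_fn, n_samples=16):
+    dataset = []
+    for question, gold in zip(questions, gold_answers):
+        seen = set()
+        for _ in range(n_samples):
+            path = sample_fn(question)                # one sampled chain of thought
+            answer = path.split("####")[-1].strip()   # GSM8K-style final-answer marker
+            if answer == gold and path not in seen:   # keep correct, distinct paths
+                seen.add(path)
+                dataset.append({"question": question, "reasoning": path})
+    return dataset
+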
+
+ comment: Working in Progress +
+
+
+
+
+ + ♻ ☆ AKEM: Aligning Knowledge Base to Queries with Ensemble Model for Entity + Recognition and Linking + + +
+ This paper presents a novel approach to address the Entity Recognition and +Linking Challenge at NLPCC 2015. The task involves extracting named entity +mentions from short search queries and linking them to entities within a +reference Chinese knowledge base. To tackle this problem, we first expand the +existing knowledge base and utilize external knowledge to identify candidate +entities, thereby improving the recall rate. Next, we extract features from the +candidate entities and utilize Support Vector Regression and Multiple Additive +Regression Tree as scoring functions to filter the results. Additionally, we +apply rules to further refine the results and enhance precision. Our method is +computationally efficient and achieves an F1 score of 0.535. + +
+
+
+
+
+ + ♻ ☆ SeACo-Paraformer: A Non-Autoregressive ASR System with Flexible and + Effective Hotword Customization Ability ICASSP2024 + + +
+ Hotword customization is one of the remaining concerns in the ASR field: it
+is valuable to enable users of ASR systems to customize names of entities,
+persons, and other phrases to obtain a better experience. The past few years
+have seen effective modeling strategies for ASR contextualization developed,
+but they still leave room for improvement in training stability and in the
+opacity of the activation process. In this paper we propose Semantic-Augmented
+Contextual-Paraformer (SeACo-Paraformer), a novel NAR-based ASR system with
+flexible and effective hotword customization ability. It possesses the
+advantages of an AED-based model's accuracy, a NAR model's efficiency, and an
+explicit customization capacity with superior performance. Through extensive
+experiments with 50,000 hours of industrial big data, our proposed model
+outperforms strong baselines in customization. Besides, we explore an efficient
+way to filter large-scale incoming hotwords for further improvement. The
+compared industrial models, source code, and two hotword test sets are all open
+source.
+
+
+ comment: submitted to ICASSP2024 +
+
+
+
+
+ + ♻ ☆ Understanding the Impact of Post-Training Quantization on Large Language + Models + + +
+ Large language models (LLMs) are rapidly increasing in size, with the number +of parameters becoming a key factor in the success of many commercial models, +such as ChatGPT, Claude, and Bard. Even the recently released publicly +accessible models for commercial usage, such as Falcon and Llama2, come +equipped with billions of parameters. This significant increase in the number +of parameters makes deployment and operation very costly. The remarkable +progress in the field of quantization for large neural networks in general and +LLMs in particular, has made these models more accessible by enabling them to +be deployed on consumer-grade GPUs. Quantized models generally demonstrate +comparable performance levels to their unquantized base counterparts. +Nonetheless, there exists a notable gap in our comprehensive understanding of +how these quantized models respond to hyperparameters, such as temperature, max +new tokens, and topk, particularly for next word prediction. The present +analysis reveals that nf4 and fp4 are equally proficient 4-bit quantization +techniques, characterized by similar attributes such as inference speed, memory +consumption, and the quality of generated content. Nevertheless, these +quantization methods exhibit distinct behaviors at varying temperature +settings, both in the context of smaller and larger models. It is noteworthy +that, in general, 4-bit quantized models of varying sizes exhibit heightened +sensitivity to lower temperature settings, unlike their unquantized +counterparts. Additionally, int8 quantization is associated with significantly +slower inference speeds, whereas unquantized fp16 models consistently yield the +fastest inference speeds across models of all sizes. + +
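As a concrete illustration of the kind of setup compared above, the sketch below loads a causal LM with nf4 or fp4 4-bit quantization via Hugging Face transformers and bitsandbytes and samples with temperature and top-k. This is a generic recipe under stated assumptions, not the authors' exact configuration; the model name and generation settings are illustrative.

```python
# A common way to compare nf4 and fp4 4-bit quantization (a sketch, not the
# paper's exact setup), using Hugging Face transformers + bitsandbytes.
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig

def load_4bit(model_name: str, quant_type: str):
    cfg = BitsAndBytesConfig(
        load_in_4bit=True,
        bnb_4bit_quant_type=quant_type,        # "nf4" or "fp4"
        bnb_4bit_compute_dtype=torch.float16,
    )
    model = AutoModelForCausalLM.from_pretrained(
        model_name, quantization_config=cfg, device_map="auto"
    )
    tok = AutoTokenizer.from_pretrained(model_name)
    return model, tok

model, tok = load_4bit("meta-llama/Llama-2-7b-hf", "nf4")  # illustrative model choice
inputs = tok("The capital of France is", return_tensors="pt").to(model.device)
out = model.generate(**inputs, max_new_tokens=20, do_sample=True,
                     temperature=0.7, top_k=50)
print(tok.decode(out[0], skip_special_tokens=True))
```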
+
+
+
+
+ + ♻ ☆ Personality Traits in Large Language Models + + +
+ The advent of large language models (LLMs) has revolutionized natural +language processing, enabling the generation of coherent and contextually +relevant human-like text. As LLMs increasingly power conversational agents used +by the general public world-wide, the synthetic personality embedded in these +models, by virtue of training on large amounts of human data, is becoming +increasingly important. Since personality is a key factor determining the +effectiveness of communication, we present a comprehensive method for +administering and validating personality tests on widely-used LLMs, as well as +for shaping personality in the generated text of such LLMs. Applying this +method, we found: 1) personality measurements in the outputs of some LLMs under +specific prompting configurations are reliable and valid; 2) evidence of +reliability and validity of synthetic LLM personality is stronger for larger +and instruction fine-tuned models; and 3) personality in LLM outputs can be +shaped along desired dimensions to mimic specific human personality profiles. +We discuss application and ethical implications of the measurement and shaping +method, in particular regarding responsible AI. + +
+
+
+
+
+ + ♻ ☆ PaLM 2 Technical Report + + +
+ We introduce PaLM 2, a new state-of-the-art language model that has better +multilingual and reasoning capabilities and is more compute-efficient than its +predecessor PaLM. PaLM 2 is a Transformer-based model trained using a mixture +of objectives. Through extensive evaluations on English and multilingual +language, and reasoning tasks, we demonstrate that PaLM 2 has significantly +improved quality on downstream tasks across different model sizes, while +simultaneously exhibiting faster and more efficient inference compared to PaLM. +This improved efficiency enables broader deployment while also allowing the +model to respond faster, for a more natural pace of interaction. PaLM 2 +demonstrates robust reasoning capabilities exemplified by large improvements +over PaLM on BIG-Bench and other reasoning tasks. PaLM 2 exhibits stable +performance on a suite of responsible AI evaluations, and enables +inference-time control over toxicity without additional overhead or impact on +other capabilities. Overall, PaLM 2 achieves state-of-the-art performance +across a diverse set of tasks and capabilities. + When discussing the PaLM 2 family, it is important to distinguish between +pre-trained models (of various sizes), fine-tuned variants of these models, and +the user-facing products that use these models. In particular, user-facing +products typically include additional pre- and post-processing steps. +Additionally, the underlying models may evolve over time. Therefore, one should +not expect the performance of user-facing products to exactly match the results +reported in this report. + +
+
+
+
+
+ + ♻ ☆ Zero-shot Learning with Minimum Instruction to Extract Social + Determinants and Family History from Clinical Notes using GPT Model + + +
+ Demographics, social determinants of health, and family history documented in the unstructured text of electronic health records are increasingly being studied to understand how this information can be combined with structured data to improve healthcare outcomes. Since the release of the GPT models, many studies have applied them to extract this information from narrative clinical notes. Different from existing work, our research focuses on zero-shot learning for extracting this information jointly while providing only minimal information to the GPT model. We utilize de-identified real-world clinical notes annotated for demographics, various social determinants, and family history information. Given that the GPT model may return text that differs from the text in the original data, we explore two sets of evaluation metrics, the traditional NER evaluation metrics and semantic similarity evaluation metrics, to fully characterize the performance. Our results show that the GPT-3.5 method achieved an average of 0.975 F1 on demographics extraction, 0.615 F1 on social determinants extraction, and 0.722 F1 on family history extraction. We believe these results can be further improved through model fine-tuning or few-shot learning. Through the case studies, we also identified limitations of the GPT models, which need to be addressed in future research.
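An illustrative zero-shot extraction call in the spirit of the setup above, using the OpenAI chat completions API. The prompt wording, output format, and model choice are assumptions for illustration, not the study's exact protocol.

```python
# Illustrative zero-shot extraction call (not the study's exact prompt or settings).
from openai import OpenAI

client = OpenAI()  # assumes OPENAI_API_KEY is set in the environment

def extract_sdoh(note_text: str) -> str:
    prompt = (
        "Extract demographics, social determinants of health, and family history "
        "mentions from the clinical note below. Return one mention per line as "
        "category: text span.\n\nNote:\n" + note_text
    )
    resp = client.chat.completions.create(
        model="gpt-3.5-turbo",
        messages=[{"role": "user", "content": prompt}],
        temperature=0,
    )
    return resp.choices[0].message.content
```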
+
+ comment: 5 pages, 4 figures +
+
+
+
+
+ + ♻ ☆ K2: A Foundation Language Model for Geoscience Knowledge Understanding + and Utilization + + +
+ Large language models (LLMs) have achieved great success in general domains of natural language processing. In this paper, we bring LLMs to the realm of geoscience with the objective of advancing research and applications in this field. To this end, we present the first-ever LLM in geoscience, K2, alongside a suite of resources developed to further promote LLM research within geoscience. For instance, we have curated the first geoscience instruction tuning dataset, GeoSignal, which aims to align LLM responses to geoscience-related user queries. Additionally, we have established the first geoscience benchmark, GeoBench, to evaluate LLMs in the context of geoscience. In this work, we experiment with a complete recipe to adapt a pre-trained general-domain LLM to the geoscience domain. Specifically, we further train the LLaMA-7B model on 5.5B tokens of geoscience text corpus, including over 1 million pieces of geoscience literature, and utilize GeoSignal's supervised data to fine-tune the model. Moreover, we share a protocol that can efficiently gather domain-specific data and construct domain-supervised data, even in situations where manpower is scarce. Meanwhile, we equip K2 with the ability to use tools so that it can serve as a basic geoscience aide. Experiments conducted on GeoBench demonstrate the effectiveness of our approach and datasets on geoscience knowledge understanding and utilization. We open-source all the training data and K2 model checkpoints at https://github.com/davendw49/k2.
+
+
+
+
+ + ♻ ☆ A Latent Space Theory for Emergent Abilities in Large Language Models + + +
+ Languages are not created randomly but rather to communicate information. +There is a strong association between languages and their underlying meanings, +resulting in a sparse joint distribution that is heavily peaked according to +their correlations. Moreover, these peak values happen to match with the +marginal distribution of languages due to the sparsity. With the advent of LLMs +trained on big data and large models, we can now precisely assess the marginal +distribution of languages, providing a convenient means of exploring the sparse +structures in the joint distribution for effective inferences. In this paper, +we categorize languages as either unambiguous or {\epsilon}-ambiguous and +present quantitative results to demonstrate that the emergent abilities of +LLMs, such as language understanding, in-context learning, chain-of-thought +prompting, and effective instruction fine-tuning, can all be attributed to +Bayesian inference on the sparse joint distribution of languages. + +
+
+ comment: 17 pages, 3 figures +
+
+
+
+
+ + ♻ ☆ Halo: Estimation and Reduction of Hallucinations in Open-Source Weak + Large Language Models + + +
+ Large Language Models (LLMs) have revolutionized Natural Language Processing +(NLP). Although convenient for research and practical applications, open-source +LLMs with fewer parameters often suffer from severe hallucinations compared to +their larger counterparts. This paper focuses on measuring and reducing +hallucinations in BLOOM 7B, a representative of such weaker open-source LLMs +that are publicly available for research and commercial applications. We +introduce HaloCheck, a lightweight BlackBox knowledge-free framework designed +to quantify the severity of hallucinations in LLMs. Additionally, we explore +techniques like knowledge injection and teacher-student approaches to alleviate +hallucinations in low-parameter LLMs. Our experiments effectively demonstrate +the reduction of hallucinations in challenging domains for these LLMs. + +
+
+
+
+
+
+
+
+ + Computer Vision and Pattern Recognition 105 + +
+
+
+ + ☆ Text-Guided Generation and Editing of Compositional 3D Avatars + + +
+ Our goal is to create a realistic 3D facial avatar with hair and accessories +using only a text description. While this challenge has attracted significant +recent interest, existing methods either lack realism, produce unrealistic +shapes, or do not support editing, such as modifications to the hairstyle. We +argue that existing methods are limited because they employ a monolithic +modeling approach, using a single representation for the head, face, hair, and +accessories. Our observation is that the hair and face, for example, have very +different structural qualities that benefit from different representations. +Building on this insight, we generate avatars with a compositional model, in +which the head, face, and upper body are represented with traditional 3D +meshes, and the hair, clothing, and accessories with neural radiance fields +(NeRF). The model-based mesh representation provides a strong geometric prior +for the face region, improving realism while enabling editing of the person's +appearance. By using NeRFs to represent the remaining components, our method is +able to model and synthesize parts with complex geometry and appearance, such +as curly hair and fluffy scarves. Our novel system synthesizes these +high-quality compositional avatars from text descriptions. The experimental +results demonstrate that our method, Text-guided generation and Editing of +Compositional Avatars (TECA), produces avatars that are more realistic than +those of recent methods while being editable because of their compositional +nature. For example, our TECA enables the seamless transfer of compositional +features like hairstyles, scarves, and other accessories between avatars. This +capability supports applications such as virtual try-on. + +
+
+ comment: Home page: https://yfeng95.github.io/teca +
+
+
+
+
+ + ☆ Tree-Structured Shading Decomposition ICCV 2023 + + +
+ We study inferring a tree-structured representation from a single image for +object shading. Prior work typically uses the parametric or measured +representation to model shading, which is neither interpretable nor easily +editable. We propose using the shade tree representation, which combines basic +shading nodes and compositing methods to factorize object surface shading. The +shade tree representation enables novice users who are unfamiliar with the +physical shading process to edit object shading in an efficient and intuitive +manner. A main challenge in inferring the shade tree is that the inference +problem involves both the discrete tree structure and the continuous parameters +of the tree nodes. We propose a hybrid approach to address this issue. We +introduce an auto-regressive inference model to generate a rough estimation of +the tree structure and node parameters, and then we fine-tune the inferred +shade tree through an optimization algorithm. We show experiments on synthetic +images, captured reflectance, real images, and non-realistic vector drawings, +allowing downstream applications such as material editing, vectorized shading, +and relighting. Project website: https://chen-geng.com/inv-shade-trees + +
+
+ comment: Accepted at ICCV 2023. Project website: + https://chen-geng.com/inv-shade-trees +
+
+
+
+
+ + ☆ Sight Beyond Text: Multi-Modal Training Enhances LLMs in Truthfulness + and Ethics + + +
+ Multi-modal large language models (MLLMs) are trained based on large language +models (LLM), with an enhanced capability to comprehend multi-modal inputs and +generate textual responses. While they excel in multi-modal tasks, the pure NLP +abilities of MLLMs are often underestimated and left untested. In this study, +we get out of the box and unveil an intriguing characteristic of MLLMs -- our +preliminary results suggest that visual instruction tuning, a prevailing +strategy for transitioning LLMs into MLLMs, unexpectedly and interestingly +helps models attain both improved truthfulness and ethical alignment in the +pure NLP context. For example, a visual-instruction-tuned LLaMA2 7B model +surpasses the performance of the LLaMA2-chat 7B model, fine-tuned with over one +million human annotations, on TruthfulQA-mc and Ethics benchmarks. Further +analysis reveals that the improved alignment can be attributed to the superior +instruction quality inherent to visual-text data. In releasing our code at +github.com/UCSC-VLAA/Sight-Beyond-Text, we aspire to foster further exploration +into the intrinsic value of visual-text synergies and, in a broader scope, +multi-modal interactions in alignment research. + +
+
+
+
+
+ + ☆ PILOT: A Pre-Trained Model-Based Continual Learning Toolbox + + +
+ While traditional machine learning can effectively tackle a wide range of +problems, it primarily operates within a closed-world setting, which presents +limitations when dealing with streaming data. As a solution, incremental +learning emerges to address real-world scenarios involving new data's arrival. +Recently, pre-training has made significant advancements and garnered the +attention of numerous researchers. The strong performance of these pre-trained +models (PTMs) presents a promising avenue for developing continual learning +algorithms that can effectively adapt to real-world scenarios. Consequently, +exploring the utilization of PTMs in incremental learning has become essential. +This paper introduces a pre-trained model-based continual learning toolbox +known as PILOT. On the one hand, PILOT implements some state-of-the-art +class-incremental learning algorithms based on pre-trained models, such as L2P, +DualPrompt, and CODA-Prompt. On the other hand, PILOT also fits typical +class-incremental learning algorithms (e.g., DER, FOSTER, and MEMO) within the +context of pre-trained models to evaluate their effectiveness. + +
+
+ comment: Code is available at https://github.com/sun-hailong/LAMDA-PILOT +
+
+
+
+
+ + ☆ Weakly-Supervised Multi-Task Learning for Audio-Visual Speaker + Verification + + +
+ In this paper, we present a methodology for achieving robust multimodal person representations optimized for open-set audio-visual speaker verification. Distance Metric Learning (DML) approaches have typically dominated this problem space, owing to strong performance on new and unseen classes. In our work, we explore multitask learning techniques to further boost the performance of the DML approach and show that an auxiliary task with weak labels can increase the compactness of the learned speaker representation. We also extend the Generalized End-to-End loss (GE2E) to multimodal inputs and demonstrate that it can achieve competitive performance in an audio-visual space. Finally, we introduce a random, non-synchronous audio-visual sampling strategy during training that is shown to improve generalization. Our network achieves state-of-the-art performance for speaker verification, reporting 0.244%, 0.252%, 0.441% Equal Error Rate (EER) on the three official trial lists of VoxCeleb1-O/E/H, which are, to our knowledge, the best published results on VoxCeleb1-E and VoxCeleb1-H.
+
+
+
+
+ + ☆ Contrastive Deep Encoding Enables Uncertainty-aware + Machine-learning-assisted Histopathology + + +
+ Deep neural network models can learn clinically relevant features from +millions of histopathology images. However generating high-quality annotations +to train such models for each hospital, each cancer type, and each diagnostic +task is prohibitively laborious. On the other hand, terabytes of training data +-- while lacking reliable annotations -- are readily available in the public +domain in some cases. In this work, we explore how these large datasets can be +consciously utilized to pre-train deep networks to encode informative +representations. We then fine-tune our pre-trained models on a fraction of +annotated training data to perform specific downstream tasks. We show that our +approach can reach the state-of-the-art (SOTA) for patch-level classification +with only 1-10% randomly selected annotations compared to other SOTA +approaches. Moreover, we propose an uncertainty-aware loss function, to +quantify the model confidence during inference. Quantified uncertainty helps +experts select the best instances to label for further training. Our +uncertainty-aware labeling reaches the SOTA with significantly fewer +annotations compared to random labeling. Last, we demonstrate how our +pre-trained encoders can surpass current SOTA for whole-slide image +classification with weak supervision. Our work lays the foundation for data and +task-agnostic pre-trained deep networks with quantified uncertainty. + +
+
+ comment: 18 pages, 8 figures +
+
+
+
+
+ + ☆ Hardening RGB-D Object Recognition Systems against Adversarial Patch + Attacks + + +
+ RGB-D object recognition systems improve their predictive performance by fusing color and depth information, outperforming neural network architectures that rely solely on colors. While RGB-D systems are expected to be more robust to adversarial examples than RGB-only systems, they have also been proven to be highly vulnerable. Their robustness is similar even when the adversarial examples are generated by altering only the original images' colors. Different works have highlighted the vulnerability of RGB-D systems; however, technical explanations for this weakness are lacking. Hence, in our work, we bridge this gap by investigating the learned deep representation of RGB-D systems, discovering that color features make the function learned by the network more complex and, thus, more sensitive to small perturbations. To mitigate this problem, we propose a defense based on a detection mechanism that makes RGB-D systems more robust against adversarial examples. We empirically show that this defense improves the performance of RGB-D systems against adversarial examples even when they are computed ad hoc to circumvent the detection mechanism, and that it is also more effective than adversarial training.
+
+ comment: Accepted for publication in the Information Sciences journal +
+
+
+
+
+ + ☆ Polygon Intersection-over-Union Loss for Viewpoint-Agnostic Monocular 3D + Vehicle Detection + + +
+ Monocular 3D object detection is a challenging task because depth information +is difficult to obtain from 2D images. A subset of viewpoint-agnostic monocular +3D detection methods also do not explicitly leverage scene homography or +geometry during training, meaning that a model trained thusly can detect +objects in images from arbitrary viewpoints. Such works predict the projections +of the 3D bounding boxes on the image plane to estimate the location of the 3D +boxes, but these projections are not rectangular so the calculation of IoU +between these projected polygons is not straightforward. This work proposes an +efficient, fully differentiable algorithm for the calculation of IoU between +two convex polygons, which can be utilized to compute the IoU between two 3D +bounding box footprints viewed from an arbitrary angle. We test the performance +of the proposed polygon IoU loss (PIoU loss) on three state-of-the-art +viewpoint-agnostic 3D detection models. Experiments demonstrate that the +proposed PIoU loss converges faster than L1 loss and that in 3D detection +models, a combination of PIoU loss and L1 loss gives better results than L1 +loss alone (+1.64% AP70 for MonoCon on cars, +0.18% AP70 for RTM3D on cars, and ++0.83%/+2.46% AP50/AP25 for MonoRCNN on cyclists). + +
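To make the quantity being optimized concrete, the sketch below computes the IoU of two convex polygons with Sutherland-Hodgman clipping and the shoelace area formula. It illustrates the geometric operation only; the paper's batched, fully differentiable loss implementation is not reproduced here.

```python
# Convex-polygon IoU via Sutherland-Hodgman clipping + shoelace area (a sketch,
# not the paper's differentiable implementation). Polygons are (N, 2) arrays in CCW order.
import numpy as np

def cross(u, v):
    return u[0] * v[1] - u[1] * v[0]

def shoelace_area(poly):
    x, y = poly[:, 0], poly[:, 1]
    return 0.5 * abs(np.dot(x, np.roll(y, -1)) - np.dot(y, np.roll(x, -1)))

def clip_by_edge(poly, a, b):
    """Keep the part of `poly` on the left of the directed edge a->b."""
    out = []
    n = len(poly)
    for i in range(n):
        p, q = poly[i], poly[(i + 1) % n]
        p_in = cross(b - a, p - a) >= 0
        q_in = cross(b - a, q - a) >= 0
        if p_in != q_in:                       # segment crosses the clipping line
            t = cross(b - a, a - p) / cross(b - a, q - p)
            out.append(p + t * (q - p))
        if q_in:
            out.append(q)
    return np.array(out)

def polygon_iou(poly1, poly2):
    inter = poly1
    n = len(poly2)
    for i in range(n):
        if len(inter) == 0:
            return 0.0
        inter = clip_by_edge(inter, poly2[i], poly2[(i + 1) % n])
    if len(inter) < 3:
        return 0.0
    ai = shoelace_area(inter)
    return ai / (shoelace_area(poly1) + shoelace_area(poly2) - ai)

# Two unit squares overlapping by half give IoU = 1/3.
sq1 = np.array([[0., 0.], [1., 0.], [1., 1.], [0., 1.]])
sq2 = sq1 + np.array([0.5, 0.0])
print(polygon_iou(sq1, sq2))  # ~0.3333
```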
+
+
+
+
+ + ☆ RadarLCD: Learnable Radar-based Loop Closure Detection Pipeline + + +
+ Loop Closure Detection (LCD) is an essential task in robotics and computer +vision, serving as a fundamental component for various applications across +diverse domains. These applications encompass object recognition, image +retrieval, and video analysis. LCD consists in identifying whether a robot has +returned to a previously visited location, referred to as a loop, and then +estimating the related roto-translation with respect to the analyzed location. +Despite the numerous advantages of radar sensors, such as their ability to +operate under diverse weather conditions and provide a wider range of view +compared to other commonly used sensors (e.g., cameras or LiDARs), integrating +radar data remains an arduous task due to intrinsic noise and distortion. To +address this challenge, this research introduces RadarLCD, a novel supervised +deep learning pipeline specifically designed for Loop Closure Detection using +the FMCW Radar (Frequency Modulated Continuous Wave) sensor. RadarLCD, a +learning-based LCD methodology explicitly designed for radar systems, makes a +significant contribution by leveraging the pre-trained HERO (Hybrid Estimation +Radar Odometry) model. Being originally developed for radar odometry, HERO's +features are used to select key points crucial for LCD tasks. The methodology +undergoes evaluation across a variety of FMCW Radar dataset scenes, and it is +compared to state-of-the-art systems such as Scan Context for Place Recognition +and ICP for Loop Closure. The results demonstrate that RadarLCD surpasses the +alternatives in multiple aspects of Loop Closure Detection. + +
+
+ comment: 7 pages, 2 figures +
+
+
+
+
+ + ☆ Developing a Novel Image Marker to Predict the Responses of Neoadjuvant + Chemotherapy (NACT) for Ovarian Cancer Patients + + +
+ Objective: Neoadjuvant chemotherapy (NACT) is one treatment option for patients with advanced-stage ovarian cancer. However, due to tumor heterogeneity, patients' responses to NACT vary significantly among different subgroups. To address this clinical challenge, the purpose of this study is to develop a novel image marker that achieves highly accurate prediction of the response to NACT at an early stage. Methods: For this purpose, we first computed a total of 1373 radiomics features to quantify the tumor characteristics, which can be grouped into three categories: geometric, intensity, and texture features. Second, all these features were optimized by a principal component analysis algorithm to generate a compact and informative feature cluster. Using this cluster as the input, an SVM-based classifier was developed and optimized to create the final marker, indicating the likelihood of the patient being responsive to the NACT treatment. To validate this scheme, data from a total of 42 ovarian cancer patients were retrospectively collected. A nested leave-one-out cross-validation was adopted for model performance assessment. Results: The results demonstrate that the new method yielded an AUC (area under the receiver operating characteristic curve) of 0.745. Meanwhile, the model achieved an overall accuracy of 76.2%, a positive predictive value of 70%, and a negative predictive value of 78.1%. Conclusion: This study provides meaningful information for the development of radiomics-based image markers for NACT response prediction.
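The pipeline structure described above (radiomics features, PCA compression, SVM classifier, leave-one-out evaluation) can be sketched with scikit-learn as below. The file paths, number of retained components, and SVM settings are assumptions; the paper's exact feature extraction and nested optimization are not shown.

```python
# Hedged sketch of a radiomics -> PCA -> SVM pipeline with leave-one-out CV.
import numpy as np
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.svm import SVC
from sklearn.model_selection import LeaveOneOut, cross_val_predict
from sklearn.metrics import roc_auc_score, accuracy_score

X = np.load("radiomics_features.npy")    # (n_patients, 1373), placeholder path
y = np.load("nact_response_labels.npy")  # 1 = responder, 0 = non-responder, placeholder

clf = make_pipeline(StandardScaler(),
                    PCA(n_components=0.95),          # keep 95% variance (assumption)
                    SVC(kernel="rbf", probability=True))
proba = cross_val_predict(clf, X, y, cv=LeaveOneOut(), method="predict_proba")[:, 1]
print("AUC:", roc_auc_score(y, proba), "accuracy:", accuracy_score(y, proba > 0.5))
```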
+
+
+
+
+ + ☆ Mitigating Group Bias in Federated Learning for Heterogeneous Devices + + +
+ Federated Learning is emerging as a privacy-preserving model training +approach in distributed edge applications. As such, most edge deployments are +heterogeneous in nature i.e., their sensing capabilities and environments vary +across deployments. This edge heterogeneity violates the independence and +identical distribution (IID) property of local data across clients and produces +biased global models i.e. models that contribute to unfair decision-making and +discrimination against a particular community or a group. Existing bias +mitigation techniques only focus on bias generated from label heterogeneity in +non-IID data without accounting for domain variations due to feature +heterogeneity and do not address global group-fairness property. + Our work proposes a group-fair FL framework that minimizes group-bias while +preserving privacy and without resource utilization overhead. Our main idea is +to leverage average conditional probabilities to compute a cross-domain group +\textit{importance weights} derived from heterogeneous training data to +optimize the performance of the worst-performing group using a modified +multiplicative weights update method. Additionally, we propose regularization +techniques to minimize the difference between the worst and best-performing +groups while making sure through our thresholding mechanism to strike a balance +between bias reduction and group performance degradation. Our evaluation of +human emotion recognition and image classification benchmarks assesses the fair +decision-making of our framework in real-world heterogeneous settings. + +
+
+
+
+
+ + ☆ SupFusion: Supervised LiDAR-Camera Fusion for 3D Object Detection ICCV2023 + + +
+ In this paper, we propose a novel training strategy called SupFusion, which provides auxiliary feature-level supervision for effective LiDAR-Camera fusion and significantly boosts detection performance. Our strategy involves a data enhancement method named Polar Sampling, which densifies sparse objects and trains an assistant model to generate high-quality features as the supervision. These features are then used to train the LiDAR-Camera fusion model, where the fusion feature is optimized to simulate the generated high-quality features. Furthermore, we propose a simple yet effective deep fusion module, which consistently achieves superior performance compared with previous fusion methods under the SupFusion strategy. Our proposal thus has the following advantages. First, SupFusion introduces auxiliary feature-level supervision that can boost LiDAR-Camera detection performance without introducing extra inference cost. Second, the proposed deep fusion continuously improves the detector's abilities. Our proposed SupFusion and deep fusion module are plug-and-play, and we conduct extensive experiments to demonstrate their effectiveness. Specifically, we gain around 2% 3D mAP improvement on the KITTI benchmark across multiple LiDAR-Camera 3D detectors.
+
+ comment: Accepted to ICCV2023 +
+
+
+
+
+ + ☆ FAIR: Frequency-aware Image Restoration for Industrial Visual Anomaly + Detection + + +
+ Image reconstruction-based anomaly detection models are widely explored in +industrial visual inspection. However, existing models usually suffer from the +trade-off between normal reconstruction fidelity and abnormal reconstruction +distinguishability, which damages the performance. In this paper, we find that +the above trade-off can be better mitigated by leveraging the distinct +frequency biases between normal and abnormal reconstruction errors. To this +end, we propose Frequency-aware Image Restoration (FAIR), a novel +self-supervised image restoration task that restores images from their +high-frequency components. It enables precise reconstruction of normal patterns +while mitigating unfavorable generalization to anomalies. Using only a simple +vanilla UNet, FAIR achieves state-of-the-art performance with higher efficiency +on various defect detection datasets. Code: https://github.com/liutongkun/FAIR. + +
+
+ comment: 12 pages, 10 figures +
+
+
+
+
+ + ☆ Aggregating Long-term Sharp Features via Hybrid Transformers for Video + Deblurring + + +
+ Video deblurring methods, aiming at recovering consecutive sharp frames from +a given blurry video, usually assume that the input video suffers from +consecutively blurry frames. However, in real-world blurry videos taken by +modern imaging devices, sharp frames usually appear in the given video, thus +making temporal long-term sharp features available for facilitating the +restoration of a blurry frame. In this work, we propose a video deblurring +method that leverages both neighboring frames and present sharp frames using +hybrid Transformers for feature aggregation. Specifically, we first train a +blur-aware detector to distinguish between sharp and blurry frames. Then, a +window-based local Transformer is employed for exploiting features from +neighboring frames, where cross attention is beneficial for aggregating +features from neighboring frames without explicit spatial alignment. To +aggregate long-term sharp features from detected sharp frames, we utilize a +global Transformer with multi-scale matching capability. Moreover, our method +can easily be extended to event-driven video deblurring by incorporating an +event fusion module into the global Transformer. Extensive experiments on +benchmark datasets demonstrate that our proposed method outperforms +state-of-the-art video deblurring methods as well as event-driven video +deblurring methods in terms of quantitative metrics and visual quality. The +source code and trained models are available at +https://github.com/shangwei5/STGTN. + +
+
+ comment: 13 pages, 11 figures, and the code is available at + https://github.com/shangwei5/STGTN +
+
+
+
+
+ + ☆ Exploiting Multiple Priors for Neural 3D Indoor Reconstruction BMVC + + +
+ Neural implicit modeling permits to achieve impressive 3D reconstruction +results on small objects, while it exhibits significant limitations in large +indoor scenes. In this work, we propose a novel neural implicit modeling method +that leverages multiple regularization strategies to achieve better +reconstructions of large indoor environments, while relying only on images. A +sparse but accurate depth prior is used to anchor the scene to the initial +model. A dense but less accurate depth prior is also introduced, flexible +enough to still let the model diverge from it to improve the estimated +geometry. Then, a novel self-supervised strategy to regularize the estimated +surface normals is presented. Finally, a learnable exposure compensation scheme +permits to cope with challenging lighting conditions. Experimental results show +that our approach produces state-of-the-art 3D reconstructions in challenging +indoor scenarios. + +
+
+ comment: Accepted at the British Machine Vision Conference (BMVC) 2023 +
+
+
+
+
+ + ☆ Instance Adaptive Prototypical Contrastive Embedding for Generalized + Zero Shot Learning IJCAI 2023 + + +
+ Generalized zero-shot learning(GZSL) aims to classify samples from seen and +unseen labels, assuming unseen labels are not accessible during training. +Recent advancements in GZSL have been expedited by incorporating +contrastive-learning-based (instance-based) embedding in generative networks +and leveraging the semantic relationship between data points. However, existing +embedding architectures suffer from two limitations: (1) limited +discriminability of synthetic features' embedding without considering +fine-grained cluster structures; (2) inflexible optimization due to restricted +scaling mechanisms on existing contrastive embedding networks, leading to +overlapped representations in the embedding space. To enhance the quality of +representations in the embedding space, as mentioned in (1), we propose a +margin-based prototypical contrastive learning embedding network that reaps the +benefits of prototype-data (cluster quality enhancement) and implicit data-data +(fine-grained representations) interaction while providing substantial cluster +supervision to the embedding network and the generator. To tackle (2), we +propose an instance adaptive contrastive loss that leads to generalized +representations for unseen labels with increased inter-class margin. Through +comprehensive experimental evaluation, we show that our method can outperform +the current state-of-the-art on three benchmark datasets. Our approach also +consistently achieves the best unseen performance in the GZSL setting. + +
+
+ comment: 7 pages, 4 figures. Accepted in IJCAI 2023 Workshop on Generalizing + from Limited Resources in the Open World +
+
+
+
+
+ + ☆ Differentiable JPEG: The Devil is in the Details WACV 2024 + + +
+ JPEG remains one of the most widespread lossy image coding methods. However, +the non-differentiable nature of JPEG restricts the application in deep +learning pipelines. Several differentiable approximations of JPEG have recently +been proposed to address this issue. This paper conducts a comprehensive review +of existing diff. JPEG approaches and identifies critical details that have +been missed by previous methods. To this end, we propose a novel diff. JPEG +approach, overcoming previous limitations. Our approach is differentiable +w.r.t. the input image, the JPEG quality, the quantization tables, and the +color conversion parameters. We evaluate the forward and backward performance +of our diff. JPEG approach against existing methods. Additionally, extensive +ablations are performed to evaluate crucial design choices. Our proposed diff. +JPEG resembles the (non-diff.) reference implementation best, significantly +surpassing the recent-best diff. approach by $3.47$dB (PSNR) on average. For +strong compression rates, we can even improve PSNR by $9.51$dB. Strong +adversarial attack results are yielded by our diff. JPEG, demonstrating the +effective gradient approximation. Our code is available at +https://github.com/necla-ml/Diff-JPEG. + +
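One standard ingredient of differentiable JPEG pipelines is a surrogate for the non-differentiable rounding in coefficient quantization. The straight-through version below is a generic trick shown for illustration only; it is not necessarily the approximation used in this paper.

```python
# Straight-through rounding surrogate often used in differentiable compression sketches.
import torch

def ste_round(x: torch.Tensor) -> torch.Tensor:
    """Round in the forward pass, pass gradients through unchanged."""
    return x + (torch.round(x) - x).detach()

coeffs = torch.randn(8, 8, requires_grad=True)      # toy DCT coefficients
qtable = torch.full((8, 8), 16.0)                    # toy quantization table
quantized = ste_round(coeffs * 50.0 / qtable) * qtable  # quantize/dequantize, still differentiable
quantized.sum().backward()                           # gradients flow back to `coeffs`
```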
+
+ comment: Accepted at WACV 2024. Project page: + https://christophreich1996.github.io/differentiable_jpeg/ +
+
+
+
+
+ + ☆ Towards Reliable Dermatology Evaluation Benchmarks + + +
+ Benchmark datasets for digital dermatology unwittingly contain inaccuracies +that reduce trust in model performance estimates. We propose a +resource-efficient data cleaning protocol to identify issues that escaped +previous curation. The protocol leverages an existing algorithmic cleaning +strategy and is followed by a confirmation process terminated by an intuitive +stopping criterion. Based on confirmation by multiple dermatologists, we remove +irrelevant samples and near duplicates and estimate the percentage of label +errors in six dermatology image datasets for model evaluation promoted by the +International Skin Imaging Collaboration. Along with this paper, we publish +revised file lists for each dataset which should be used for model evaluation. +Our work paves the way for more trustworthy performance assessment in digital +dermatology. + +
+
+ comment: Link to the revised file lists: + https://github.com/Digital-Dermatology/SelfClean-Revised-Benchmarks +
+
+
+
+
+ + ☆ Neural network-based coronary dominance classification of RCA angiograms + + +
+ Background. Cardiac dominance classification is essential for SYNTAX score estimation, a tool used to determine the complexity of coronary artery disease and guide patient selection toward an optimal revascularization strategy. Objectives. To develop a cardiac dominance classification algorithm based on the analysis of right coronary artery (RCA) angiograms using a neural network. Method. We employed the convolutional neural network ConvNeXt and the Swin transformer for 2D image (frame) classification, along with a majority vote for cardioangiographic view classification. An auxiliary network was also used to detect irrelevant images, which were then excluded from the data set. Our data set consisted of 828 angiographic studies, 192 of them from patients with left dominance. Results. 5-fold cross-validation gave the following dominance classification metrics (p=95%): macro recall=93.1%, accuracy=93.5%, macro F1=89.2%. The most common case in which the model regularly failed was RCA occlusion, as it requires utilization of LCA information. Another cause of false predictions is a small diameter combined with a poor-quality cardioangiographic view. In such cases, cardiac dominance classification can be complex and may require discussion among specialists to reach an accurate conclusion. Conclusion. The use of machine learning approaches to classify cardiac dominance based on the RCA alone has been shown to be successful with satisfactory accuracy. However, for higher accuracy, it is necessary to utilize LCA information in the case of an occluded RCA and to detect cases where there is high uncertainty.
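A minimal sketch of the per-frame classification plus majority-vote step described above. The backbone models and the irrelevant-frame filter are passed in as opaque callables; everything here is illustrative rather than the study's implementation.

```python
# Per-frame prediction + majority vote over a study (a sketch under assumptions).
from collections import Counter

def classify_study(frame_model, relevance_model, frames):
    votes = []
    for frame in frames:
        if not relevance_model(frame):      # auxiliary network drops irrelevant frames
            continue
        votes.append(frame_model(frame))    # e.g. "left" or "right" dominance per frame
    if not votes:
        return "uncertain"
    label, _count = Counter(votes).most_common(1)[0]
    return label
```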
+
+
+
+
+ + ☆ TransNet: A Transfer Learning-Based Network for Human Action Recognition + + +
+ Human action recognition (HAR) is a high-level and significant research area +in computer vision due to its ubiquitous applications. The main limitations of +the current HAR models are their complex structures and lengthy training time. +In this paper, we propose a simple yet versatile and effective end-to-end deep +learning architecture, coined as TransNet, for HAR. TransNet decomposes the +complex 3D-CNNs into 2D- and 1D-CNNs, where the 2D- and 1D-CNN components +extract spatial features and temporal patterns in videos, respectively. +Benefiting from its concise architecture, TransNet is ideally compatible with +any pretrained state-of-the-art 2D-CNN models in other fields, being +transferred to serve the HAR task. In other words, it naturally leverages the +power and success of transfer learning for HAR, bringing huge advantages in +terms of efficiency and effectiveness. Extensive experimental results and the +comparison with the state-of-the-art models demonstrate the superior +performance of the proposed TransNet in HAR in terms of flexibility, model +complexity, training speed and classification accuracy. + +
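The decomposition described above (a pretrained 2D CNN for per-frame spatial features followed by 1D temporal convolution) can be sketched in PyTorch as below. The choice of ResNet-18 backbone, layer sizes, and pooling are illustrative assumptions, not the paper's exact architecture.

```python
# Minimal 2D + 1D decomposition sketch for video classification (assumptions noted above).
import torch
import torch.nn as nn
from torchvision.models import resnet18

class TransNetSketch(nn.Module):
    def __init__(self, num_classes, temporal_channels=256):
        super().__init__()
        backbone = resnet18(weights="IMAGENET1K_V1")      # any pretrained 2D CNN could be used
        self.cnn2d = nn.Sequential(*list(backbone.children())[:-1])  # -> (B*T, 512, 1, 1)
        self.cnn1d = nn.Sequential(
            nn.Conv1d(512, temporal_channels, kernel_size=3, padding=1),
            nn.ReLU(),
            nn.AdaptiveAvgPool1d(1),
        )
        self.fc = nn.Linear(temporal_channels, num_classes)

    def forward(self, video):                              # video: (B, T, 3, H, W)
        b, t = video.shape[:2]
        feats = self.cnn2d(video.flatten(0, 1)).flatten(1) # per-frame spatial features
        feats = feats.view(b, t, -1).transpose(1, 2)       # (B, 512, T) temporal sequence
        return self.fc(self.cnn1d(feats).squeeze(-1))      # (B, num_classes)

logits = TransNetSketch(num_classes=101)(torch.randn(2, 16, 3, 224, 224))
```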
+
+
+
+
+ + ☆ Limited-Angle Tomography Reconstruction via Deep End-To-End Learning on + Synthetic Data + + +
+ Computed tomography (CT) has become an essential part of modern science and +medicine. A CT scanner consists of an X-ray source that is spun around an +object of interest. On the opposite end of the X-ray source, a detector +captures X-rays that are not absorbed by the object. The reconstruction of an +image is a linear inverse problem, which is usually solved by filtered back +projection. However, when the number of measurements is small, the +reconstruction problem is ill-posed. This is for example the case when the +X-ray source is not spun completely around the object, but rather irradiates +the object only from a limited angle. To tackle this problem, we present a deep +neural network that is trained on a large amount of carefully-crafted synthetic +data and can perform limited-angle tomography reconstruction even for only +30{\deg} or 40{\deg} sinograms. With our approach we won the first place in the +Helsinki Tomography Challenge 2022. + +
+
+
+
+
+ + ☆ DEFormer: DCT-driven Enhancement Transformer for Low-light Image and + Dark Vision ICRA2024 + + +
+ The goal of low-light image enhancement is to restore the color and details of the image, and it is of great significance for high-level visual tasks in autonomous driving. However, it is difficult to restore the lost details in dark areas by relying only on the RGB domain. In this paper we introduce frequency as a new clue for the network and propose a novel DCT-driven enhancement transformer (DEFormer). First, we propose a learnable frequency branch (LFB) for frequency enhancement, which contains DCT processing and curvature-based frequency enhancement (CFE). CFE calculates the curvature of each channel to represent the detail richness of different frequency bands; we then divide the frequency features so as to focus on frequency bands with richer textures. In addition, we propose a cross-domain fusion (CDF) module for reducing the differences between the RGB domain and the frequency domain. We also adopt DEFormer as a preprocessing step for detection in the dark; it effectively improves detector performance, bringing 2.1% and 3.4% mAP improvements on the ExDark and DARK FACE datasets, respectively.
+
+ comment: submit to ICRA2024 +
+
+
+
+
+ + ☆ DreamStyler: Paint by Style Inversion with Text-to-Image Diffusion + Models + + +
+ Recent progress in large-scale text-to-image models has yielded remarkable accomplishments and found various applications in the art domain. However, expressing the unique characteristics of an artwork (e.g. brushwork, colortone, or composition) with text prompts alone may encounter limitations due to the inherent constraints of verbal description. To this end, we introduce DreamStyler, a novel framework designed for artistic image synthesis, proficient in both text-to-image synthesis and style transfer. DreamStyler optimizes a multi-stage textual embedding with a context-aware text prompt, resulting in prominent image quality. In addition, with content and style guidance, DreamStyler exhibits the flexibility to accommodate a range of style references. Experimental results demonstrate its superior performance across multiple scenarios, suggesting its promising potential in artistic product creation.
+
+
+
+
+ + ☆ Dynamic Causal Disentanglement Model for Dialogue Emotion Detection + + +
+ Emotion detection is a critical technology extensively employed in diverse fields. While the incorporation of commonsense knowledge has proven beneficial for existing emotion detection methods, dialogue-based emotion detection encounters numerous difficulties and challenges due to human agency and the variability of dialogue content. In dialogues, human emotions tend to accumulate in bursts. However, they are often implicitly expressed. This implies that many genuine emotions remain concealed within a plethora of unrelated words and dialogues. In this paper, we propose a Dynamic Causal Disentanglement Model based on hidden variable separation. This model effectively decomposes the content of dialogues and investigates the temporal accumulation of emotions, thereby enabling more precise emotion recognition. First, we introduce a novel Causal Directed Acyclic Graph (DAG) to establish the correlation between hidden emotional information and other observed elements. Subsequently, our approach utilizes pre-extracted personal attributes and utterance topics as guiding factors for the distribution of hidden variables, aiming to separate irrelevant ones. Specifically, we propose a dynamic temporal disentanglement model to infer the propagation of utterances and hidden variables, enabling the accumulation of emotion-related information throughout the conversation. To guide this disentanglement process, we leverage ChatGPT-4.0 and LSTM networks to extract utterance topics and personal attributes as observed information. Finally, we test our approach on two popular datasets for dialogue emotion detection, and the experimental results verify the model's superiority.
+
+
+
+
+ + ☆ Contrast-Phys+: Unsupervised and Weakly-supervised Video-based Remote + Physiological Measurement via Spatiotemporal Contrast + + +
+ Video-based remote physiological measurement utilizes facial videos to +measure the blood volume change signal, which is also called remote +photoplethysmography (rPPG). Supervised methods for rPPG measurements have been +shown to achieve good performance. However, the drawback of these methods is +that they require facial videos with ground truth (GT) physiological signals, +which are often costly and difficult to obtain. In this paper, we propose +Contrast-Phys+, a method that can be trained in both unsupervised and +weakly-supervised settings. We employ a 3DCNN model to generate multiple +spatiotemporal rPPG signals and incorporate prior knowledge of rPPG into a +contrastive loss function. We further incorporate the GT signals into +contrastive learning to adapt to partial or misaligned labels. The contrastive +loss encourages rPPG/GT signals from the same video to be grouped together, +while pushing those from different videos apart. We evaluate our methods on +five publicly available datasets that include both RGB and Near-infrared +videos. Contrast-Phys+ outperforms the state-of-the-art supervised methods, +even when using partially available or misaligned GT signals, or no labels at +all. Additionally, we highlight the advantages of our methods in terms of +computational efficiency, noise robustness, and generalization. + +
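To illustrate the contrastive intuition stated above (signals from the same video should agree, signals from different videos should differ), here is a heavily simplified loss sketch. The real Contrast-Phys+ loss operates on power spectral densities with specific positive/negative pair construction; the similarity measure, temperature, and batching below are my assumptions.

```python
# Hedged sketch of a same-video-attract / different-video-repel contrastive loss
# over rPPG signal spectra (illustrative only, not the paper's loss).
import torch
import torch.nn.functional as F

def contrastive_rppg_loss(signals, video_ids, temperature=0.1):
    """signals: (N, T) rPPG signals; video_ids: (N,) source-video index.
    Assumes each video contributes at least two signals in the batch."""
    spectra = torch.fft.rfft(signals, dim=-1).abs()     # compare frequency content
    z = F.normalize(spectra, dim=-1)
    sim = z @ z.t() / temperature                        # (N, N) pairwise similarities
    same = video_ids.unsqueeze(0) == video_ids.unsqueeze(1)
    eye = torch.eye(len(signals), dtype=torch.bool, device=signals.device)
    pos = sim[same & ~eye].mean()                        # pull same-video pairs together
    neg = sim[~same].mean()                              # push different-video pairs apart
    return neg - pos
```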
+
+
+
+
+ + ☆ Hydra: Multi-head Low-rank Adaptation for Parameter Efficient + Fine-tuning + + +
+ The recent surge in large-scale foundation models has spurred the development +of efficient methods for adapting these models to various downstream tasks. +Low-rank adaptation methods, such as LoRA, have gained significant attention +due to their outstanding parameter efficiency and no additional inference +latency. This paper investigates a more general form of adapter module based on +the analysis that parallel and sequential adaptation branches learn novel and +general features during fine-tuning, respectively. The proposed method, named +Hydra, due to its multi-head computational branches, combines parallel and +sequential branch to integrate capabilities, which is more expressive than +existing single branch methods and enables the exploration of a broader range +of optimal points in the fine-tuning process. In addition, the proposed +adaptation method explicitly leverages the pre-trained weights by performing a +linear combination of the pre-trained features. It allows the learned features +to have better generalization performance across diverse downstream tasks. +Furthermore, we perform a comprehensive analysis of the characteristics of each +adaptation branch with empirical evidence. Through an extensive range of +experiments, encompassing comparisons and ablation studies, we substantiate the +efficiency and demonstrate the superior performance of Hydra. This +comprehensive evaluation underscores the potential impact and effectiveness of +Hydra in a variety of applications. Our code is available on +\url{https://github.com/extremebird/Hydra} + +
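A minimal sketch of the idea of combining a parallel and a sequential low-rank branch around a frozen pre-trained linear layer, as the abstract describes. The ranks, zero-initialization, and absence of scaling factors are assumptions, not the paper's exact recipe.

```python
# Parallel + sequential low-rank adaptation around a frozen linear layer (a sketch).
import torch
import torch.nn as nn

class HydraLinear(nn.Module):
    def __init__(self, linear: nn.Linear, rank: int = 8):
        super().__init__()
        self.base = linear
        for p in self.base.parameters():
            p.requires_grad = False                      # pre-trained weights stay frozen
        d_in, d_out = linear.in_features, linear.out_features
        self.par_a = nn.Linear(d_in, rank, bias=False)   # parallel branch acts on the input
        self.par_b = nn.Linear(rank, d_out, bias=False)
        self.seq_a = nn.Linear(d_out, rank, bias=False)  # sequential branch acts on W0 x
        self.seq_b = nn.Linear(rank, d_out, bias=False)
        nn.init.zeros_(self.par_b.weight)                # start as an identity adaptation
        nn.init.zeros_(self.seq_b.weight)

    def forward(self, x):
        h = self.base(x)
        h = h + self.par_b(self.par_a(x))    # parallel (LoRA-style) adaptation
        h = h + self.seq_b(self.seq_a(h))    # sequential adaptation of pre-trained features
        return h

out = HydraLinear(nn.Linear(768, 768))(torch.randn(4, 768))
```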
+
+
+
+
+ ☆ CCSPNet-Joint: Efficient Joint Training Method for Traffic Sign Detection Under Extreme Conditions
+ Traffic sign detection is an important research direction in intelligent +driving. Unfortunately, existing methods often overlook extreme conditions such +as fog, rain, and motion blur. Moreover, the end-to-end training strategy for +image denoising and object detection models fails to utilize inter-model +information effectively. To address these issues, we propose CCSPNet, an +efficient feature extraction module based on Transformers and CNNs, which +effectively leverages contextual information, achieves faster inference speed +and provides stronger feature enhancement capabilities. Furthermore, we +establish the correlation between object detection and image denoising tasks +and propose a joint training model, CCSPNet-Joint, to improve data efficiency +and generalization. Finally, to validate our approach, we create the CCTSDB-AUG +dataset for traffic sign detection in extreme scenarios. Extensive experiments +have shown that CCSPNet achieves state-of-the-art performance in traffic sign +detection under extreme conditions. Compared to end-to-end methods, +CCSPNet-Joint achieves a 5.32% improvement in precision and an 18.09% +improvement in mAP@.5. + +
+
+
+
+
+ + ☆ MagiCapture: High-Resolution Multi-Concept Portrait Customization + + +
+ Large-scale text-to-image models including Stable Diffusion are capable of +generating high-fidelity photorealistic portrait images. There is an active +research area dedicated to personalizing these models, aiming to synthesize +specific subjects or styles using provided sets of reference images. However, +despite the plausible results from these personalization methods, they tend to +produce images that often fall short of realism and are not yet on a +commercially viable level. This is particularly noticeable in portrait image +generation, where any unnatural artifact in human faces is easily discernible +due to our inherent human bias. To address this, we introduce MagiCapture, a +personalization method for integrating subject and style concepts to generate +high-resolution portrait images using just a few subject and style references. +For instance, given a handful of random selfies, our fine-tuned model can +generate high-quality portrait images in specific styles, such as passport or +profile photos. The main challenge with this task is the absence of ground +truth for the composed concepts, leading to a reduction in the quality of the +final output and an identity shift of the source subject. To address these +issues, we present a novel Attention Refocusing loss coupled with auxiliary +priors, both of which facilitate robust learning within this weakly supervised +learning setting. Our pipeline also includes additional post-processing steps +to ensure the creation of highly realistic outputs. MagiCapture outperforms +other baselines in both quantitative and qualitative evaluations and can also +be generalized to other non-human objects. + +
+
+ comment: 8 pages, 7 figures +
+
+
+
+
+ + ☆ Keep It SimPool: Who Said Supervised Transformers Suffer from Attention + Deficit? ICCV 2023 + + +
+ Convolutional networks and vision transformers have different forms of +pairwise interactions, pooling across layers and pooling at the end of the +network. Does the latter really need to be different? As a by-product of +pooling, vision transformers provide spatial attention for free, but this is +most often of low quality unless self-supervised, which is not well studied. Is +supervision really the problem? + In this work, we develop a generic pooling framework and then we formulate a +number of existing methods as instantiations. By discussing the properties of +each group of methods, we derive SimPool, a simple attention-based pooling +mechanism as a replacement of the default one for both convolutional and +transformer encoders. We find that, whether supervised or self-supervised, this +improves performance on pre-training and downstream tasks and provides +attention maps delineating object boundaries in all cases. One could thus call +SimPool universal. To our knowledge, we are the first to obtain attention maps +in supervised transformers of at least as good quality as self-supervised, +without explicit losses or modifying the architecture. Code at: +https://github.com/billpsomas/simpool. + +
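A generic attention-based pooling sketch in the spirit of what the abstract describes: the global average vector queries the patch tokens and produces a weighted pooled representation. The projections and scaling below are assumptions for illustration, not SimPool's exact formulation.

```python
# Attention pooling over patch tokens with a GAP query (illustrative sketch).
import torch
import torch.nn as nn

class AttentionPool(nn.Module):
    def __init__(self, dim):
        super().__init__()
        self.q = nn.Linear(dim, dim, bias=False)
        self.k = nn.Linear(dim, dim, bias=False)

    def forward(self, tokens):                            # tokens: (B, N, D) patch features
        query = self.q(tokens.mean(dim=1, keepdim=True))  # (B, 1, D) global-average query
        keys = self.k(tokens)                             # (B, N, D)
        attn = (query @ keys.transpose(1, 2)) / keys.shape[-1] ** 0.5
        attn = attn.softmax(dim=-1)                       # (B, 1, N) attention map
        return (attn @ tokens).squeeze(1)                 # (B, D) pooled representation

pooled = AttentionPool(dim=384)(torch.randn(2, 196, 384))
```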
+
+ comment: ICCV 2023. Code and models: https://github.com/billpsomas/simpool +
+
+
+
+
+ + ☆ Manufacturing Quality Control with Autoencoder-Based Defect Localization + and Unsupervised Class Selection + + +
+ Manufacturing industries require efficient and voluminous production of high-quality finished goods. In the context of Industry 4.0, visual anomaly detection offers a promising way to control product quality automatically and with high precision, and computer-vision-based automation can prevent bottlenecks at the product quality checkpoint. We considered recent advancements in machine learning to improve visual defect localization, but challenges persist in obtaining a balanced feature set and a database covering the wide variety of defects occurring in the production line. This paper proposes a defect-localizing autoencoder with unsupervised class selection, obtained by clustering features extracted from a pre-trained VGG-16 network with k-means. The selected classes of defects are augmented with natural wild textures to simulate artificial defects. The study demonstrates the effectiveness of the defect-localizing autoencoder with unsupervised class selection for improving defect detection in manufacturing industries. The proposed methodology shows promising results with precise and accurate localization of quality defects on melamine-faced boards for the furniture industry. Incorporating artificial defects into the training data shows significant potential for practical implementation in real-world quality control scenarios.
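The unsupervised class-selection step mentioned above can be sketched as: embed images with a pre-trained VGG-16 and cluster the embeddings with k-means. The pooling choice and number of clusters are assumptions; the defect-localizing autoencoder itself is not shown.

```python
# VGG-16 feature embedding + k-means class selection (a hedged sketch).
import torch
import torch.nn as nn
from torchvision.models import vgg16
from sklearn.cluster import KMeans

encoder = nn.Sequential(vgg16(weights="IMAGENET1K_V1").features,
                        nn.AdaptiveAvgPool2d(1), nn.Flatten())
encoder.eval()

@torch.no_grad()
def embed(images):                        # images: (N, 3, H, W) normalized tensor
    return encoder(images).numpy()        # (N, 512) pooled VGG-16 features

def select_classes(images, n_clusters=5): # number of clusters is an assumption
    features = embed(images)
    return KMeans(n_clusters=n_clusters, n_init=10).fit_predict(features)
```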
+
+
+
+
+ + ☆ ProMap: Datasets for Product Mapping in E-commerce + + +
+ The goal of product mapping is to decide whether two listings from two different e-shops describe the same products. Existing datasets of matching and non-matching pairs of products, however, often suffer from incomplete product information or contain only very distant non-matching products. Therefore, while predictive models trained on these datasets achieve good results on them, in practice they are unusable, as they cannot distinguish very similar but non-matching pairs of products. This paper introduces two new datasets for product mapping: ProMapCz, consisting of 1,495 Czech product pairs, and ProMapEn, consisting of 1,555 English product pairs of matching and non-matching products manually scraped from two pairs of e-shops. The datasets contain both images and textual descriptions of the products, including their specifications, making them one of the most complete datasets for product mapping. Additionally, the non-matching products were selected in two phases, creating two types of non-matches: close non-matches and medium non-matches. Even the medium non-matches are pairs of products that are much more similar than non-matches in other datasets; for example, they still need to have the same brand and a similar name and price. After simple data preprocessing, several machine learning algorithms were trained on these and two other datasets to demonstrate the complexity and completeness of the ProMap datasets. The ProMap datasets are presented as a gold standard for further research on product mapping, filling the gaps in existing ones.
+
+
+
+
+ + ☆ Video Infringement Detection via Feature Disentanglement and Mutual + Information Maximization ACM MM 2023 + + +
+ The self-media era provides us with a tremendous volume of high-quality videos. Unfortunately, frequent video copyright infringements are now seriously damaging the interests and enthusiasm of video creators. Identifying infringing videos is therefore a compelling task. Current state-of-the-art methods tend to simply feed high-dimensional mixed video features into deep neural networks and count on the networks to extract useful representations. Despite its simplicity, this paradigm heavily relies on the original entangled features and lacks constraints guaranteeing that useful task-relevant semantics are extracted from the features. In this paper, we seek to tackle the above challenges from two aspects: (1) We propose to disentangle an original high-dimensional feature into multiple sub-features, explicitly decomposing the feature into exclusive lower-dimensional components. We expect the sub-features to encode non-overlapping semantics of the original feature and remove redundant information. (2) On top of the disentangled sub-features, we further learn an auxiliary feature to enhance the sub-features. We theoretically analyze the mutual information between the label and the disentangled features, arriving at a loss that maximizes the extraction of task-relevant information from the original feature. Extensive experiments on two large-scale benchmark datasets (i.e., SVD and VCSL) demonstrate that our method achieves 90.1% TOP-100 mAP on the large-scale SVD dataset and also sets the new state-of-the-art on the VCSL benchmark dataset. Our code and model have been released at https://github.com/yyyooooo/DMI/, hoping to contribute to the community.
+
+ comment: This paper is accepted by ACM MM 2023 +
+
+
+
+
+ + ☆ UniBrain: Universal Brain MRI Diagnosis with Hierarchical + Knowledge-enhanced Pre-training + + +
+ Magnetic resonance imaging (MRI) has played a crucial role in brain disease +diagnosis, for which a range of computer-aided artificial intelligence methods +have been proposed. However, early explorations usually focus on a limited set +of brain diseases in a single study and train the model on small-scale data, +leading to a generalization bottleneck. Towards a more effective and scalable +paradigm, we propose a hierarchical knowledge-enhanced pre-training framework +for universal brain MRI diagnosis, termed UniBrain. Specifically, UniBrain +leverages a large-scale dataset of 24,770 imaging-report pairs from routine +diagnostics. Unlike previous pre-training techniques that target a single +visual or textual feature, or rely on brute-force alignment between vision and +language information, we leverage the structure of report information at +different granularities to build a hierarchical alignment mechanism, which +improves the efficiency of feature learning. Our UniBrain is validated on three +real-world datasets with severe class imbalance and the public BraTS2019 +dataset. It not only consistently outperforms all state-of-the-art diagnostic +methods by a large margin and provides superior grounding performance, but also +shows performance comparable to expert radiologists on certain disease types. + +
+
+
+
+
+ + ☆ Topology-inspired Cross-domain Network for Developmental Cervical + Stenosis Quantification + + +
+ Developmental Canal Stenosis (DCS) quantification is crucial in cervical +spondylosis screening. Compared with quantifying DCS manually, a more efficient +and time-saving manner is provided by deep keypoint localization networks, +which can be implemented in either the coordinate or the image domain. However, +the vertebral visualization features often lead to abnormal topological +structures during keypoint localization, including keypoint distortion with +edges and weakly connected structures, which cannot be fully suppressed in +either the coordinate or image domain alone. To overcome this limitation, a +keypoint-edge constraint module and a reparameterization module are utilized to +restrict these abnormal structures in a cross-domain manner. The keypoint-edge +constraint module restricts the keypoints to the edges of vertebrae, which +ensures that the distribution pattern of keypoint coordinates is consistent +with that required for DCS quantification. The reparameterization module +constrains weakly connected structures in image-domain heatmaps by +incorporating coordinate information. Moreover, the cross-domain network +improves spatial generalization by utilizing heatmaps and incorporating +coordinates for accurate localization, which avoids the trade-off between these +two properties in an individual domain. Comprehensive results on distinct +quantification tasks show the superiority and generalizability of the proposed +Topology-inspired Cross-domain Network (TCN) compared with other competing +localization methods. + +
+
+
+
+
+ + ☆ SAMUS: Adapting Segment Anything Model for Clinically-Friendly and + Generalizable Ultrasound Image Segmentation + + +
+ Segment anything model (SAM), an eminent universal image segmentation model, +has recently gathered considerable attention within the domain of medical image +segmentation. Despite the remarkable performance of SAM on natural images, it +grapples with significant performance degradation and limited generalization +when confronted with medical images, particularly with those involving objects +of low contrast, faint boundaries, intricate shapes, and diminutive sizes. In +this paper, we propose SAMUS, a universal model tailored for ultrasound image +segmentation. In contrast to previous SAM-based universal models, SAMUS pursues +not only better generalization but also lower deployment cost, rendering it +more suitable for clinical applications. Specifically, based on SAM, a parallel +CNN branch is introduced to inject local features into the ViT encoder through +cross-branch attention for better medical image segmentation. Then, a position +adapter and a feature adapter are developed to adapt SAM from natural to +medical domains and from requiring large-size inputs (1024x1024) to small-size +inputs (256x256) for more clinical-friendly deployment. A comprehensive +ultrasound dataset, comprising about 30k images and 69k masks and covering six +object categories, is collected for verification. Extensive comparison +experiments demonstrate SAMUS's superiority against the state-of-the-art +task-specific models and universal foundation models under both task-specific +evaluation and generalization evaluation. Moreover, SAMUS is deployable on +entry-level GPUs, as it has been liberated from the constraints of long +sequence encoding. The code, data, and models will be released at +https://github.com/xianlin7/SAMUS. + +
+
+
+
+
+ + ☆ Tracking Particles Ejected From Active Asteroid Bennu With Event-Based + Vision + + +
+ Early detection and tracking of ejecta in the vicinity of small solar system +bodies is crucial to guarantee spacecraft safety and support scientific +observation. During the visit of active asteroid Bennu, the OSIRIS-REx +spacecraft relied on the analysis of images captured by onboard navigation +cameras to detect particle ejection events, which ultimately became one of the +mission's scientific highlights. To increase the scientific return of similar +time-constrained missions, this work proposes an event-based solution that is +dedicated to the detection and tracking of centimetre-sized particles. Unlike a +standard frame-based camera, the pixels of an event-based camera independently +trigger events indicating whether the scene brightness has increased or +decreased at that time and location in the sensor plane. As a result of the +sparse and asynchronous spatiotemporal output, event cameras combine very high +dynamic range and temporal resolution with low-power consumption, which could +complement existing onboard imaging techniques. This paper motivates the use of +a scientific event camera by reconstructing the particle ejection episodes +reported by the OSIRIS-REx mission in a photorealistic scene generator and in +turn, simulating event-based observations. The resulting streams of +spatiotemporal data support future work on event-based multi-object tracking. + +
+
+ comment: 6 pages, 3 figures, presented at the XXVII Italian Association of + Aeronautics and Astronautics (AIDAA) Congress, 4-7 September 2023, Padova, + Italy +
+
+
+
+
+ + ☆ Leveraging SE(3) Equivariance for Learning 3D Geometric Shape Assembly ICCV 2023 + + +
+ Shape assembly aims to reassemble parts (or fragments) into a complete +object, which is a common task in our daily life. Different from semantic part +assembly (e.g., assembling a chair's semantic parts like legs into a whole +chair), geometric part assembly (e.g., assembling bowl fragments into a +complete bowl) is an emerging task in computer vision and robotics. Instead of +semantic information, this task focuses on geometric information of parts. As +both the geometric and pose spaces of fractured parts are exceptionally large, +shape pose disentanglement of part representations is beneficial to geometric +shape assembly. In our paper, we propose to leverage SE(3) equivariance for +such shape pose disentanglement. Moreover, while previous works in vision and +robotics only consider SE(3) equivariance for the representations of single +objects, we move a step forward and propose leveraging SE(3) equivariance for +representations considering multi-part correlations, which further boosts the +performance of the multi-part assembly. Experiments demonstrate the +significance of SE(3) equivariance and our proposed method for geometric shape +assembly. Project page: https://crtie.github.io/SE-3-part-assembly/ + +
+
+ comment: ICCV 2023, Project page: https://crtie.github.io/SE-3-part-assembly/ + , Code: + https://github.com/crtie/Leveraging-SE-3-Equivariance-for-Learning-3D-Geometric-Shape-Assembly +
+
+
+
+
+ + ☆ TAP: Targeted Prompting for Task Adaptive Generation of Textual Training + Instances for Visual Classification + + +
+ Vision and Language Models (VLMs), such as CLIP, have enabled visual +recognition of a potentially unlimited set of categories described by text +prompts. However, for the best visual recognition performance, these models +still require tuning to better fit the data distributions of the downstream +tasks, in order to overcome the domain shift from the web-based pre-training +data. Recently, it has been shown that it is possible to effectively tune VLMs +without any paired data, and in particular to effectively improve VLMs visual +recognition performance using text-only training data generated by Large +Language Models (LLMs). In this paper, we dive deeper into this exciting +text-only VLM training approach and explore ways it can be significantly +further improved taking the specifics of the downstream task into account when +sampling text data from LLMs. In particular, compared to the SOTA text-only VLM +training approach, we demonstrate up to 8.4% performance improvement in (cross) +domain-specific adaptation, up to 8.7% improvement in fine-grained recognition, +and 3.1% overall average improvement in zero-shot classification compared to +strong baselines. + +
+
+ comment: Code is available at: https://github.com/jmiemirza/TAP +
+
+
+
+
+ + ☆ Bayesian uncertainty-weighted loss for improved generalisability on + polyp segmentation task MICCAI 2023 + + +
+ While several previous studies have devised methods for segmentation of +polyps, most of these methods are not rigorously assessed on multi-center +datasets. Variability due to appearance of polyps from one center to another, +difference in endoscopic instrument grades, and acquisition quality result in +methods with good performance on in-distribution test data, and poor +performance on out-of-distribution or underrepresented samples. Unfair models +have serious implications and pose a critical challenge to clinical +applications. We adapt an implicit bias mitigation method which leverages +Bayesian epistemic uncertainties during training to encourage the model to +focus on underrepresented sample regions. We demonstrate the potential of this +approach to improve generalisability without sacrificing state-of-the-art +performance on a challenging multi-center polyp segmentation dataset (PolypGen) +with different centers and image modalities. + +
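+ A rough sketch of the general idea (not the authors' exact formulation):
+ estimate per-pixel epistemic uncertainty with Monte-Carlo dropout and
+ up-weight uncertain pixels in the segmentation loss. The weighting scheme
+ and hyper-parameters below are assumptions.
+
+   import torch
+   import torch.nn.functional as F
+
+   def mc_dropout_uncertainty(model, x, n_samples=8):
+       model.train()                       # keep dropout layers active
+       with torch.no_grad():
+           probs = torch.stack([torch.sigmoid(model(x)) for _ in range(n_samples)])
+       return probs.var(dim=0)             # per-pixel variance as epistemic proxy
+
+   def uncertainty_weighted_bce(model, x, y, beta=1.0):
+       u = mc_dropout_uncertainty(model, x)
+       w = 1.0 + beta * u / (u.max() + 1e-8)   # up-weight uncertain regions
+       loss = F.binary_cross_entropy_with_logits(model(x), y, reduction="none")
+       return (w.detach() * loss).mean()
+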
+
+ comment: To be presented at the Fairness of AI in Medical Imaging (FAIMI) + MICCAI 2023 Workshop and published in volumes of the Springer Lecture Notes + Computer Science (LNCS) series +
+
+
+
+
+ + ☆ Dynamic NeRFs for Soccer Scenes + + +
+ The long-standing problem of novel view synthesis has many applications, +notably in sports broadcasting. Photorealistic novel view synthesis of soccer +actions, in particular, is of enormous interest to the broadcast industry. Yet +only a few industrial solutions have been proposed, and even fewer that achieve +near-broadcast quality of the synthetic replays. Except for their setup of +multiple static cameras around the playfield, the best proprietary systems +disclose close to no information about their inner workings. Leveraging +multiple static cameras for such a task indeed presents a challenge rarely +tackled in the literature, for a lack of public datasets: the reconstruction of +a large-scale, mostly static environment, with small, fast-moving elements. +Recently, the emergence of neural radiance fields has induced stunning progress +in many novel view synthesis applications, leveraging deep learning principles +to produce photorealistic results in the most challenging settings. In this +work, we investigate the feasibility of basing a solution to the task on +dynamic NeRFs, i.e., neural models purposed to reconstruct general dynamic +content. We compose synthetic soccer environments and conduct multiple +experiments using them, identifying key components that help reconstruct soccer +scenes with dynamic NeRFs. We show that, although this approach cannot fully +meet the quality requirements for the target application, it suggests promising +avenues toward a cost-efficient, automatic solution. We also make our work +dataset and code publicly available, with the goal to encourage further efforts +from the research community on the task of novel view synthesis for dynamic +soccer scenes. For code, data, and video results, please see +https://soccernerfs.isach.be. + +
+
+ comment: Accepted at the 6th International ACM Workshop on Multimedia Content + Analysis in Sports. 8 pages, 9 figures. Project page: + https://soccernerfs.isach.be +
+
+
+
+
+ + ☆ Motion-Bias-Free Feature-Based SLAM BMVC 2023 + + +
+ For SLAM to be safely deployed in unstructured real-world environments, it +must possess several key properties that are not encompassed by conventional +benchmarks. In this paper we show that SLAM commutativity, that is, consistency +in trajectory estimates on forward and reverse traverses of the same route, is +a significant issue for the state of the art. Current pipelines show a +significant bias between forward and reverse directions of travel and are, in +addition, inconsistent regarding which direction of travel exhibits better +performance. In this paper we propose several contributions to feature-based +SLAM pipelines that remedy the motion bias problem. In a comprehensive +evaluation across four datasets, we show that our contributions implemented in +ORB-SLAM2 substantially reduce the bias between forward and backward motion and +additionally improve the aggregated trajectory error. Removing the SLAM motion +bias has significant relevance for the wide range of robotics and computer +vision applications where performance consistency is important. + +
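+ One simple way to quantify such a forward/backward bias (an evaluation sketch
+ under our own assumptions, not the paper's protocol) is to compare the
+ trajectory error of the forward and reverse traverses of the same route:
+
+   import numpy as np
+
+   def ate_rmse(est, gt):
+       """est, gt: (N, 3) aligned position trajectories."""
+       return float(np.sqrt(np.mean(np.sum((est - gt) ** 2, axis=1))))
+
+   def motion_bias(est_fwd, gt_fwd, est_rev, gt_rev):
+       e_f, e_r = ate_rmse(est_fwd, gt_fwd), ate_rmse(est_rev, gt_rev)
+       return abs(e_f - e_r) / max(e_f, e_r)   # 0 means no direction bias
+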
+
+ comment: BMVC 2023 +
+
+
+
+
+ + ☆ Remote Sensing Object Detection Meets Deep Learning: A Meta-review of + Challenges and Advances + + +
+ Remote sensing object detection (RSOD), one of the most fundamental and +challenging tasks in the remote sensing field, has received longstanding +attention. In recent years, deep learning techniques have demonstrated robust +feature representation capabilities and led to a big leap in the development of +RSOD techniques. In this era of rapid technical evolution, this review aims to +present a comprehensive review of the recent achievements in deep learning +based RSOD methods. More than 300 papers are covered in this review. We +identify five main challenges in RSOD, including multi-scale object detection, +rotated object detection, weak object detection, tiny object detection, and +object detection with limited supervision, and systematically review the +corresponding methods developed in a hierarchical division manner. We also +review the widely used benchmark datasets and evaluation metrics within the +field of RSOD, as well as the application scenarios for RSOD. Future research +directions are provided for further promoting the research in RSOD. + +
+
+ comment: Accepted by IEEE Geoscience and Remote Sensing Magazine. More than + 300 papers relevant to the RSOD field were reviewed in this survey +
+
+
+
+
+ + ☆ MFL-YOLO: An Object Detection Model for Damaged Traffic Signs + + +
+ Traffic signs are important facilities to ensure traffic safety and smooth +flow, but may be damaged for many reasons, which poses a great safety hazard. +Therefore, it is important to study methods for detecting damaged traffic +signs. Object detection techniques dedicated to damaged traffic signs are still +absent. Since damaged traffic signs are similar in appearance to normal ones, +it is difficult to capture the detailed local damage features of damaged +traffic signs using traditional object detection methods. In this paper, we +propose an improved object detection method based on YOLOv5s, namely MFL-YOLO +(Mutual Feature Levels Loss enhanced YOLO). We design a simple cross-level loss +function so that each level of the model has its own role, which helps the +model learn more diverse features at a finer granularity. The method can be +applied as a plug-and-play module and improves accuracy without increasing the +structural or computational complexity. We also replace the traditional +convolution and CSP with GSConv and VoVGSCSP in the neck of YOLOv5s to reduce +the scale and computational complexity. Compared with YOLOv5s, our MFL-YOLO +improves the F1 score and mAP by 4.3 and 5.1 points, respectively, while +reducing the FLOPs by 8.9%. The Grad-CAM heat map visualization shows that our +model can better focus on the local details of the damaged traffic signs. In +addition, we also conducted experiments on CCTSDB2021 and TT100K to further +validate the generalization of our model. + +
+
+ comment: 11 pages, 8 figures, 4 tables +
+
+
+
+
+ + ☆ Integrating GAN and Texture Synthesis for Enhanced Road Damage Detection + + +
+ In the domain of traffic safety and road maintenance, precise detection of +road damage is crucial for ensuring safe driving and prolonging road +durability. However, current methods often fall short due to limited data. +Prior attempts have used Generative Adversarial Networks to generate damage +with diverse shapes and manually integrate it into appropriate positions. +However, the problem has not been well explored and faces two challenges. +First, they only enrich the location and shape of damage while neglecting the +diversity of severity levels, and the realism still needs further improvement. +Second, they require a significant amount of manual effort. To address these +challenges, we propose an innovative approach. In addition to using a GAN to +generate damage with various shapes, we further employ texture synthesis +techniques to extract road textures. These two elements are then mixed with +different weights, allowing us to control the severity of the synthesized +damage, which is then embedded back into the original images via Poisson +blending. Our method ensures both richness of damage severity and better +alignment with the background. To save labor costs, we leverage structural +similarity for automated sample selection during embedding. Each original image +is augmented into versions with varying severity levels. We implement a +straightforward screening strategy to mitigate distribution drift. Experiments +are conducted on a public road damage dataset. The proposed method not only +eliminates the need for manual labor but also achieves remarkable enhancements, +improving the mAP by 4.1% and the F1-score by 4.5%. + +
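+ The blending step can be pictured with a toy sketch (function and variable
+ names are illustrative; this is not the paper's implementation): mix a
+ generated damage patch with an extracted road-texture patch at weight alpha
+ to control severity, then paste it back with OpenCV's Poisson blending.
+
+   import cv2
+   import numpy as np
+
+   def embed_damage(road_img, damage_patch, texture_patch, center, alpha=0.6):
+       # patches must share size and dtype; alpha near 1 = severe damage
+       mixed = cv2.addWeighted(damage_patch, alpha, texture_patch, 1.0 - alpha, 0)
+       mask = np.full(mixed.shape[:2], 255, dtype=np.uint8)
+       return cv2.seamlessClone(mixed, road_img, mask, center, cv2.NORMAL_CLONE)
+
+   # augmented = embed_damage(img, crack_patch, texture_patch, (x, y), alpha=0.8)
+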
+
+ comment: 10 pages, 13 figures, 2 Tables +
+
+
+
+
+ + ☆ VEATIC: Video-based Emotion and Affect Tracking in Context Dataset + + +
+ Human affect recognition has been a significant topic in psychophysics and +computer vision. However, the currently published datasets have many +limitations. For example, most datasets contain frames that convey information +only about facial expressions. Due to the limitations of previous datasets, it +is very hard either to understand the mechanisms of human affect recognition or +for computer vision models trained on those datasets to generalize well to +common cases. In this work, we introduce a brand new large dataset, the +Video-based Emotion and Affect Tracking in Context Dataset (VEATIC), that +overcomes the limitations of the previous datasets. VEATIC has 124 video clips +from Hollywood movies, documentaries, and home videos with continuous valence +and arousal ratings of each frame via real-time annotation. Along with the +dataset, we propose a new computer vision task to infer the affect of the +selected character via both context and character information in each video +frame. Additionally, we propose a simple model to benchmark this new computer +vision task. We also compare the performance of the pretrained model using our +dataset with other similar datasets. Experiments show competitive results for +our model pretrained on VEATIC, indicating the generalizability of VEATIC. Our +dataset is available at https://veatic.github.io. + +
+
+
+
+
+ + ☆ MTD: Multi-Timestep Detector for Delayed Streaming Perception + + +
+ Autonomous driving systems require real-time environmental perception to +ensure user safety and experience. Streaming perception is a task of reporting +the current state of the world, which is used to evaluate the delay and +accuracy of autonomous driving systems. In real-world applications, factors +such as hardware limitations and high temperatures inevitably cause delays in +autonomous driving systems, resulting in an offset between the model output and +the world state. In order to solve this problem, this paper proposes the +Multi-Timestep Detector (MTD), an end-to-end detector which uses dynamic +routing for multi-branch future prediction, giving the model the ability to +resist delay fluctuations. A Delay Analysis Module (DAM) is proposed to +optimize the existing delay sensing method, continuously monitoring the model +inference stack and calculating the delay trend. Moreover, a novel Timestep +Branch Module (TBM) is constructed, which includes static flow and adaptive +flow to adaptively predict specific timesteps according to the delay trend. The +proposed method has been evaluated on the Argoverse-HD dataset, and the +experimental results show that it achieves state-of-the-art performance across +various delay settings. + +
+
+ comment: 12 pages, accepted by PRCV 2023 (The 6th Chinese Conference on + Pattern Recognition and Computer Vision) +
+
+
+
+
+ + ☆ GelFlow: Self-supervised Learning of Optical Flow for Vision-Based + Tactile Sensor Displacement Measurement + + +
+ High-resolution multi-modality information acquired by vision-based tactile +sensors can support more dexterous manipulations for robot fingers. Optical +flow is low-level information directly obtained by vision-based tactile +sensors, which can be transformed into other modalities like force, geometry +and depth. Current vision-tactile sensors employ optical flow methods from +OpenCV to estimate the deformation of markers in gels. However, these methods +are not precise enough to accurately measure the displacement of markers during +large elastic deformations of the gel, which can significantly impact the +accuracy of downstream tasks. This study proposes a self-supervised optical +flow method based on deep learning to achieve high accuracy in displacement +measurement for vision-based tactile sensors. The proposed method employs a +coarse-to-fine strategy to handle large deformations by constructing a +multi-scale feature pyramid from the input image. To better deal with the +elastic deformation caused by the gel, the Helmholtz velocity decomposition +constraint and the elastic deformation constraint are adopted to address the +distortion rate and the area change rate, respectively. A local flow fusion +module is designed to smooth the optical flow, taking into account prior +knowledge of the blurring effect of gel deformation. We trained the proposed +self-supervised network using an open-source dataset and compared it with +traditional and deep learning-based optical flow methods. The results show that +the proposed method achieves the highest displacement measurement accuracy, +thereby demonstrating its potential to enable more precise measurement in +downstream tasks using vision-based tactile sensors. + +
+
+
+
+
+ + ☆ Leveraging Foundation models for Unsupervised Audio-Visual Segmentation + + +
+ Audio-Visual Segmentation (AVS) aims to precisely outline audible objects in +a visual scene at the pixel level. Existing AVS methods require fine-grained +annotations of audio-mask pairs in supervised learning fashion. This limits +their scalability since it is time consuming and tedious to acquire such +cross-modality pixel level labels. To overcome this obstacle, in this work we +introduce unsupervised audio-visual segmentation with no need for task-specific +data annotations and model training. For tackling this newly proposed problem, +we formulate a novel Cross-Modality Semantic Filtering (CMSF) approach to +accurately associate the underlying audio-mask pairs by leveraging the +off-the-shelf multi-modal foundation models (e.g., detection [1], open-world +segmentation [2] and multi-modal alignment [3]). Guiding the proposal +generation by either audio or visual cues, we design two training-free +variants: AT-GDINO-SAM and OWOD-BIND. Extensive experiments on the AVS-Bench +dataset show that our unsupervised approach can perform well in comparison to +prior art supervised counterparts across complex scenarios with multiple +auditory objects. Particularly, in situations where existing supervised AVS +methods struggle with overlapping foreground objects, our models still excel in +accurately segmenting overlapped auditory objects. Our code will be publicly +released. + +
+
+
+
+
+ + ☆ Deep Nonparametric Convexified Filtering for Computational Photography, + Image Synthesis and Adversarial Defense + + +
+ We aim to provide a general framework for computational photography that +recovers the real scene from imperfect images, via Deep Nonparametric +Convexified Filtering (DNCF). It consists of a nonparametric deep network that +resembles the physical equations behind image formation tasks such as +denoising, super-resolution, inpainting, and flash. DNCF has no +parameterization dependent on training data, and therefore has strong +generalization and robustness to adversarial image manipulation. During +inference, we also encourage the network parameters to be nonnegative and +create a bi-convex function of the input and parameters, which is amenable to +second-order optimization algorithms under limited running time, yielding a 10X +acceleration over Deep Image Prior. With these tools, we empirically verify its +capability to defend image classification deep networks against adversarial +attack algorithms in real time. + +
+
+
+
+
+ + ☆ Dynamic Spectrum Mixer for Visual Recognition + + +
+ Recently, MLP-based vision backbones have achieved promising performance in +several visual recognition tasks. However, the existing MLP-based methods +directly aggregate tokens with static weights, leaving the adaptability to +different images untouched. Moreover, Recent research demonstrates that +MLP-Transformer is great at creating long-range dependencies but ineffective at +catching high frequencies that primarily transmit local information, which +prevents it from applying to the downstream dense prediction tasks, such as +semantic segmentation. To address these challenges, we propose a +content-adaptive yet computationally efficient structure, dubbed Dynamic +Spectrum Mixer (DSM). The DSM represents token interactions in the frequency +domain by employing the Discrete Cosine Transform, which can learn long-term +spatial dependencies with log-linear complexity. Furthermore, a dynamic +spectrum weight generation layer is proposed as the spectrum bands selector, +which could emphasize the informative frequency bands while diminishing others. +To this end, the technique can efficiently learn detailed features from visual +input that contains both high- and low-frequency information. Extensive +experiments show that DSM is a powerful and adaptable backbone for a range of +visual recognition tasks. Particularly, DSM outperforms previous +transformer-based and MLP-based models, on image classification, object +detection, and semantic segmentation tasks, such as 83.8 \% top-1 accuracy on +ImageNet, and 49.9 \% mIoU on ADE20K. + +
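+ A much-simplified sketch of frequency-domain token mixing (NumPy/SciPy, with
+ made-up shapes and a random gate standing in for the learned spectrum-weight
+ layer) conveys the mechanism:
+
+   import numpy as np
+   from scipy.fft import dctn, idctn
+
+   def spectrum_mix(tokens, band_gates):
+       """tokens: (N, D) token matrix; band_gates: (N, D) weights in [0, 1]."""
+       spec = dctn(tokens, norm="ortho")    # to the DCT frequency domain
+       spec = spec * band_gates             # emphasize / suppress bands
+       return idctn(spec, norm="ortho")     # back to token space
+
+   tokens = np.random.randn(196, 384)
+   gates = 1.0 / (1.0 + np.exp(-np.random.randn(196, 384)))  # e.g. sigmoid output
+   mixed = spectrum_mix(tokens, gates)
+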
+
+
+
+
+ + ☆ Deep Attentive Time Warping + + +
+ Similarity measures for time series are important problems for time series +classification. To handle the nonlinear time distortions, Dynamic Time Warping +(DTW) has been widely used. However, DTW is not learnable and suffers from a +trade-off between robustness against time distortion and discriminative power. +In this paper, we propose a neural network model for task-adaptive time +warping. Specifically, we use the attention model, called the bipartite +attention model, to develop an explicit time warping mechanism with greater +distortion invariance. Unlike other learnable models using DTW for warping, our +model predicts all local correspondences between two time series and is trained +based on metric learning, which enables it to learn the optimal data-dependent +warping for the target task. We also propose to induce pre-training of our +model by DTW to improve the discriminative power. Extensive experiments +demonstrate the superior effectiveness of our model over DTW and its +state-of-the-art performance in online signature verification. + +
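+ For reference, the classic (non-learnable) DTW that the paper contrasts with
+ is a short dynamic program over all monotonic alignments of two sequences:
+
+   import numpy as np
+
+   def dtw_distance(a, b):
+       n, m = len(a), len(b)
+       D = np.full((n + 1, m + 1), np.inf)
+       D[0, 0] = 0.0
+       for i in range(1, n + 1):
+           for j in range(1, m + 1):
+               cost = abs(a[i - 1] - b[j - 1])
+               D[i, j] = cost + min(D[i - 1, j], D[i, j - 1], D[i - 1, j - 1])
+       return D[n, m]
+
+   print(dtw_distance([1, 2, 3, 4], [1, 1, 2, 3, 4, 4]))   # 0.0: same shape, warped
+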
+
+ comment: Accepted at Pattern Recognition +
+
+
+
+
+ + ☆ MPI-Flow: Learning Realistic Optical Flow with Multiplane Images ICCV2023 + + +
+ The accuracy of learning-based optical flow estimation models heavily relies +on the realism of the training datasets. Current approaches for generating such +datasets either employ synthetic data or generate images with limited realism. +However, the domain gap of these data with real-world scenes constrains the +generalization of the trained model to real-world applications. To address this +issue, we investigate generating realistic optical flow datasets from +real-world images. Firstly, to generate highly realistic new images, we +construct a layered depth representation, known as multiplane images (MPI), +from single-view images. This allows us to generate novel view images that are +highly realistic. To generate optical flow maps that correspond accurately to +the new image, we calculate the optical flows of each plane using the camera +matrix and plane depths. We then project these layered optical flows into the +output optical flow map with volume rendering. Secondly, to ensure the realism +of motion, we present an independent object motion module that can separate the +camera and dynamic object motion in MPI. This module addresses the deficiency +in MPI-based single-view methods, where optical flow is generated only by +camera motion and does not account for any object movement. We additionally +devise a depth-aware inpainting module to merge new images with dynamic objects +and address unnatural motion occlusions. We show the superior performance of +our method through extensive experiments on real-world datasets. Moreover, our +approach achieves state-of-the-art performance in both unsupervised and +supervised training of learning-based models. The code will be made publicly +available at: \url{https://github.com/Sharpiless/MPI-Flow}. + +
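+ The per-plane flow computation can be sketched as follows (a simplified toy
+ version: each fronto-parallel plane at depth d induces a homography under a
+ relative camera motion (R, t), and the pixel displacement of that homography
+ is the plane's flow; K and the motion below are made-up values):
+
+   import numpy as np
+
+   def plane_flow(K, R, t, depth, h, w):
+       n = np.array([0.0, 0.0, 1.0])                    # fronto-parallel plane
+       H = K @ (R + np.outer(t, n) / depth) @ np.linalg.inv(K)
+       ys, xs = np.mgrid[0:h, 0:w]
+       pts = np.stack([xs, ys, np.ones_like(xs)], -1).reshape(-1, 3).T
+       warped = H @ pts
+       warped = warped[:2] / warped[2]                  # perspective divide
+       return (warped - pts[:2]).T.reshape(h, w, 2)     # (h, w, 2) flow field
+
+   K = np.array([[500., 0., 160.], [0., 500., 120.], [0., 0., 1.]])
+   flow = plane_flow(K, np.eye(3), np.array([0.05, 0.0, 0.0]), 2.0, 240, 320)
+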
+
+ comment: Accepted to ICCV2023 +
+
+
+
+
+ + ☆ VLSlice: Interactive Vision-and-Language Slice Discovery ICCV 2023 + + +
+ Recent work in vision-and-language demonstrates that large-scale pretraining +can learn generalizable models that are efficiently transferable to downstream +tasks. While this may improve dataset-scale aggregate metrics, analyzing +performance around hand-crafted subgroups targeting specific bias dimensions +reveals systemic undesirable behaviors. However, this subgroup analysis is +frequently stalled by annotation efforts, which require extensive time and +resources to collect the necessary data. Prior art attempts to automatically +discover subgroups to circumvent these constraints but typically leverages +model behavior on existing task-specific annotations and rapidly degrades on +more complex inputs beyond "tabular" data, none of which study +vision-and-language models. This paper presents VLSlice, an interactive system +enabling user-guided discovery of coherent representation-level subgroups with +consistent visiolinguistic behavior, denoted as vision-and-language slices, +from unlabeled image sets. We show that VLSlice enables users to quickly +generate diverse high-coherency slices in a user study (n=22) and release the +tool publicly. + +
+
+ comment: Conference paper at ICCV 2023. 17 pages, 11 figures. + https://ericslyman.com/vlslice/ +
+
+
+
+
+ + ☆ Transparent Object Tracking with Enhanced Fusion Module IROS 2023 + + +
+ Accurate tracking of transparent objects, such as glasses, plays a critical +role in many robotic tasks such as robot-assisted living. Due to the adaptive +and often reflective texture of such objects, traditional tracking algorithms +that rely on general-purpose learned features suffer from reduced performance. +Recent research has proposed to instill transparency awareness into existing +general object trackers by fusing purpose-built features. However, with the +existing fusion techniques, the addition of new features causes a change in the +latent space making it impossible to incorporate transparency awareness on +trackers with fixed latent spaces. For example, many of the current days +transformer-based trackers are fully pre-trained and are sensitive to any +latent space perturbations. In this paper, we present a new feature fusion +technique that integrates transparency information into a fixed feature space, +enabling its use in a broader range of trackers. Our proposed fusion module, +composed of a transformer encoder and an MLP module, leverages key query-based +transformations to embed the transparency information into the tracking +pipeline. We also present a new two-step training strategy for our fusion +module to effectively merge transparency features. We propose a new tracker +architecture that uses our fusion techniques to achieve superior results for +transparent object tracking. Our proposed method achieves competitive results +with state-of-the-art trackers on TOTB, which is the largest transparent object +tracking benchmark recently released. Our results and the implementation of +code will be made publicly available at https://github.com/kalyan0510/TOTEM. + +
+
+ comment: IEEE IROS 2023 +
+
+
+
+
+ + ☆ STUPD: A Synthetic Dataset for Spatial and Temporal Relation Reasoning + + +
+ Understanding relations between objects is crucial for understanding the +semantics of a visual scene. It is also an essential step in order to bridge +visual and language models. However, current state-of-the-art computer vision +models still lack the ability to perform spatial reasoning well. Existing +datasets mostly cover a relatively small number of spatial relations, all of +which are static relations that do not intrinsically involve motion. In this +paper, we propose the Spatial and Temporal Understanding of Prepositions +Dataset (STUPD) -- a large-scale video dataset for understanding static and +dynamic spatial relationships derived from prepositions of the English +language. The dataset contains 150K visual depictions (videos and images), +consisting of 30 distinct spatial prepositional senses, in the form of object +interaction simulations generated synthetically using Unity3D. In addition to +spatial relations, we also propose 50K visual depictions across 10 temporal +relations, consisting of videos depicting event/time-point interactions. To our +knowledge, no dataset exists that represents temporal relations through visual +settings. In this dataset, we also provide 3D information about object +interactions such as frame-wise coordinates, and descriptions of the objects +used. The goal of this synthetic dataset is to help models perform better in +visual relationship detection in real-world settings. We demonstrate an +increase in the performance of various models over 2 real-world datasets +(ImageNet-VidVRD and Spatial Senses) when pretrained on the STUPD dataset, in +comparison to other pretraining datasets. + +
+
+ comment: Submitted to Neurips Dataset track. 24 pages including citations and + appendix +
+
+
+
+
+ + ☆ SHARM: Segmented Head Anatomical Reference Models + + +
+ Reliable segmentation of anatomical tissues of the human head is a major step +in several clinical applications such as brain mapping, surgery planning and +associated computational simulation studies. Segmentation is based on +identifying different anatomical structures by labeling different tissues in +medical imaging modalities. The segmentation of brain structures is commonly +feasible, with several remarkable contributions mainly from a medical +perspective; however, non-brain tissues have received less interest due to +their anatomical complexity and the difficulty of observing them using standard +medical imaging protocols. The lack of whole-head segmentation methods and the +unavailability of large segmented human head datasets limit variability +studies, especially in the computational evaluation of electrical brain +stimulation (neuromodulation), human protection from electromagnetic fields, +and electroencephalography, where non-brain tissues are of great importance. + To fill this gap, this study provides open-access Segmented Head Anatomical +Reference Models (SHARM) consisting of 196 subjects. These models are segmented +into 15 different tissues: skin, fat, muscle, skull cancellous bone, skull +cortical bone, brain white matter, brain gray matter, cerebellum white matter, +cerebellum gray matter, cerebrospinal fluid, dura, vitreous humor, lens, mucous +tissue and blood vessels. The segmented head models are generated from the +open-access IXI MRI dataset using a convolutional neural network structure +named ForkNet+. Results indicate a high consistency between the statistical +characteristics of different tissue distributions across age and real +measurements. SHARM is expected to be a useful benchmark not only for +electromagnetic dosimetry studies but also for different human head +segmentation applications. + +
+
+
+
+
+ + ☆ ShaDocFormer: A Shadow-attentive Threshold Detector with Cascaded Fusion + Refiner for Document Shadow Removal + + +
+ Document shadow is a common issue that arises when capturing documents using +mobile devices, and it significantly impacts readability. Current methods +encounter various challenges, including inaccurate detection of shadow masks +and estimation of illumination. In this paper, we propose ShaDocFormer, a +Transformer-based architecture that integrates traditional methodologies and +deep learning techniques to tackle the problem of document shadow removal. The +ShaDocFormer architecture comprises two components: the Shadow-attentive +Threshold Detector (STD) and the Cascaded Fusion Refiner (CFR). The STD module +employs a traditional thresholding technique and leverages the attention +mechanism of the Transformer to gather global information, thereby enabling +precise detection of shadow masks. The cascaded and aggregative structure of +the CFR module facilitates a coarse-to-fine restoration process for the entire +image. As a result, ShaDocFormer excels in accurately detecting and capturing +variations in both shadow and illumination, thereby enabling effective removal +of shadows. Extensive experiments demonstrate that ShaDocFormer outperforms +current state-of-the-art methods in both qualitative and quantitative +measurements. + +
+
+
+
+
+ + ☆ Generalizable Neural Fields as Partially Observed Neural Processes ICCV 2023 + + +
+ Neural fields, which represent signals as a function parameterized by a +neural network, are a promising alternative to traditional discrete vector or +grid-based representations. Compared to discrete representations, neural +representations both scale well with increasing resolution, are continuous, and +can be many-times differentiable. However, given a dataset of signals that we +would like to represent, having to optimize a separate neural field for each +signal is inefficient, and cannot capitalize on shared information or +structures among signals. Existing generalization methods view this as a +meta-learning problem and employ gradient-based meta-learning to learn an +initialization which is then fine-tuned with test-time optimization, or learn +hypernetworks to produce the weights of a neural field. We instead propose a +new paradigm that views the large-scale training of neural representations as a +part of a partially-observed neural process framework, and leverage neural +process algorithms to solve this task. We demonstrate that this approach +outperforms both state-of-the-art gradient-based meta-learning approaches and +hypernetwork approaches. + +
+
+ comment: To appear ICCV 2023 +
+
+
+
+
+ + ☆ Event-Driven Imaging in Turbid Media: A Confluence of Optoelectronics + and Neuromorphic Computation + + +
+ In this paper, a new optical-computational method is introduced to unveil +images of targets whose visibility is severely obscured by light scattering in +dense, turbid media. The targets of interest are taken to be dynamic in that +their optical properties are time-varying, whether stationary in space or +moving. The scheme, to our knowledge the first of its kind, is human-vision +inspired, whereby diffuse photons collected from the turbid medium are first +transformed into spike trains by a dynamic vision sensor as in the retina, and +image reconstruction is then performed by a neuromorphic computing approach +mimicking the brain. We combine benchtop experimental data in both reflection +(backscattering) and transmission geometries with support from physics-based +simulations to develop a neuromorphic computational model and then apply this +for image reconstruction of different MNIST characters and image sets by a +dedicated deep spiking neural network algorithm. Image reconstruction is +achieved under conditions of turbidity where an original image is +unintelligible to the human eye or a digital video camera, yet clearly and +quantifiably identifiable when using the new neuromorphic computational +approach. + +
+
+
+
+
+ + ☆ Reliability-based cleaning of noisy training labels with inductive + conformal prediction in multi-modal biomedical data mining + + +
+ Accurately labeling biomedical data presents a challenge. Traditional +semi-supervised learning methods often under-utilize available unlabeled data. +To address this, we propose a novel reliability-based training data cleaning +method employing inductive conformal prediction (ICP). This method capitalizes +on a small set of accurately labeled training data and leverages ICP-calculated +reliability metrics to rectify mislabeled data and outliers within vast +quantities of noisy training data. The efficacy of the method is validated +across three classification tasks within distinct modalities: filtering +drug-induced-liver-injury (DILI) literature with title and abstract, predicting +ICU admission of COVID-19 patients through CT radiomics and electronic health +records, and subtyping breast cancer using RNA-sequencing data. Varying levels +of noise to the training labels were introduced through label permutation. +Results show significant enhancements in classification performance: accuracy +enhancement in 86 out of 96 DILI experiments (up to 11.4%), AUROC and AUPRC +enhancements in all 48 COVID-19 experiments (up to 23.8% and 69.8%), and +accuracy and macro-average F1 score improvements in 47 out of 48 RNA-sequencing +experiments (up to 74.6% and 89.0%). Our method offers the potential to +substantially boost classification performance in multi-modal biomedical +machine learning tasks. Importantly, it accomplishes this without necessitating +an excessive volume of meticulously curated training data. + +
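+ The general recipe can be illustrated with a minimal sketch (not the paper's
+ implementation; the classifier and the 0.05 cut-off are placeholders): use a
+ trusted calibration set to turn classifier scores into conformal p-values,
+ then flag noisy labels whose p-value is very small.
+
+   import numpy as np
+
+   def label_pvalues(clf, X_cal, y_cal, X_noisy, y_noisy):
+       # nonconformity = 1 - predicted probability of the assigned label
+       cal_nc = 1.0 - clf.predict_proba(X_cal)[np.arange(len(y_cal)), y_cal]
+       new_nc = 1.0 - clf.predict_proba(X_noisy)[np.arange(len(y_noisy)), y_noisy]
+       return np.array([(np.sum(cal_nc >= s) + 1) / (len(cal_nc) + 1) for s in new_nc])
+
+   # clf = any probabilistic classifier fit on a small, accurately labeled set
+   # p = label_pvalues(clf, X_cal, y_cal, X_noisy, y_noisy)
+   # keep = p > 0.05        # tiny p-value -> label is likely wrong: drop or relabel
+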
+
+
+
+
+ + ☆ Automated Assessment of Critical View of Safety in Laparoscopic + Cholecystectomy + + +
+ Cholecystectomy (gallbladder removal) is one of the most common procedures in +the US, with more than 1.2M procedures annually. Compared with classical open +cholecystectomy, laparoscopic cholecystectomy (LC) is associated with +significantly shorter recovery period, and hence is the preferred method. +However, LC is also associated with an increase in bile duct injuries (BDIs), +resulting in significant morbidity and mortality. The primary cause of BDIs +from LCs is misidentification of the cystic duct with the bile duct. Critical +view of safety (CVS) is the most effective of safety protocols, which is said +to be achieved during the surgery if certain criteria are met. However, due to +suboptimal understanding and implementation of CVS, the BDI rates have remained +stable over the last three decades. In this paper, we develop deep-learning +techniques to automate the assessment of CVS in LCs. An innovative aspect of +our research is on developing specialized learning techniques by incorporating +domain knowledge to compensate for the limited training data available in +practice. In particular, our CVS assessment process involves a fusion of two +segmentation maps followed by an estimation of a certain region of interest +based on anatomical structures close to the gallbladder, and then finally +determination of each of the three CVS criteria via rule-based assessment of +structural information. We achieved a gain of over 11.8% in mIoU on relevant +classes with our two-stream semantic segmentation approach when compared to a +single-model baseline, and 1.84% in mIoU with our proposed Sobel loss function +when compared to a Transformer-based baseline model. For CVS criteria, we +achieved up to 16% improvement and, for the overall CVS assessment, we achieved +5% improvement in balanced accuracy compared to DeepCVS under the same +experiment settings. + +
+
+
+
+
+ + ☆ $\texttt{NePhi}$: Neural Deformation Fields for Approximately + Diffeomorphic Medical Image Registration + + +
+ This work proposes $\texttt{NePhi}$, a neural deformation model which results +in approximately diffeomorphic transformations. In contrast to the predominant +voxel-based approaches, $\texttt{NePhi}$ represents deformations functionally +which allows for memory-efficient training and inference. This is of particular +importance for large volumetric registrations. Further, while medical image +registration approaches representing transformation maps via multi-layer +perceptrons have been proposed, $\texttt{NePhi}$ facilitates both pairwise +optimization-based registration $\textit{as well as}$ learning-based +registration via predicted or optimized global and local latent codes. Lastly, +as deformation regularity is a highly desirable property for most medical image +registration tasks, $\texttt{NePhi}$ makes use of gradient inverse consistency +regularization which empirically results in approximately diffeomorphic +transformations. We show the performance of $\texttt{NePhi}$ on two 2D +synthetic datasets as well as on real 3D lung registration. Our results show +that $\texttt{NePhi}$ can achieve similar accuracies as voxel-based +representations in a single-resolution registration setting while using less +memory and allowing for faster instance-optimization. + +
+
+
+
+
+ + ☆ Multi-Modal Hybrid Learning and Sequential Training for RGB-T Saliency + Detection + + +
+ RGB-T saliency detection has emerged as an important computer vision task, +identifying conspicuous objects in challenging scenes such as dark +environments. However, existing methods neglect the characteristics of +cross-modal features and rely solely on network structures to fuse RGB and +thermal features. To address this, we first propose a Multi-Modal Hybrid loss +(MMHL) that comprises supervised and self-supervised loss functions. The +supervised loss component of MMHL distinctly utilizes semantic features from +different modalities, while the self-supervised loss component reduces the +distance between RGB and thermal features. We further consider both spatial and +channel information during feature fusion and propose the Hybrid Fusion Module +to effectively fuse RGB and thermal features. Lastly, instead of jointly +training the network with cross-modal features, we implement a sequential +training strategy which performs training only on RGB images in the first stage +and then learns cross-modal features in the second stage. This training +strategy improves saliency detection performance without computational +overhead. Results from performance evaluation and ablation studies demonstrate +the superior performance achieved by the proposed method compared with the +existing state-of-the-art methods. + +
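+ The flavour of such a hybrid objective can be sketched loosely (the exact
+ MMHL terms are not reproduced here; shapes and the weighting factor are
+ assumptions): a supervised saliency term per modality plus a self-supervised
+ term that pulls RGB and thermal features together.
+
+   import torch
+   import torch.nn.functional as F
+
+   def hybrid_loss(pred_rgb, pred_t, gt_mask, feat_rgb, feat_t, lam=0.1):
+       sup = (F.binary_cross_entropy_with_logits(pred_rgb, gt_mask)
+              + F.binary_cross_entropy_with_logits(pred_t, gt_mask))
+       self_sup = 1.0 - F.cosine_similarity(
+           feat_rgb.flatten(1), feat_t.flatten(1), dim=1).mean()
+       return sup + lam * self_sup
+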
+
+ comment: 8 Pages main text, 3 pages supplementary information, 12 figures +
+
+
+
+
+ + ☆ GAN-based Algorithm for Efficient Image Inpainting + + +
+ The global pandemic caused by the spread of COVID-19 has posed challenges in +a new dimension for facial recognition, as people have started to wear masks. +Under such conditions, the authors consider utilizing machine learning based +image inpainting to tackle the problem, by completing the part of the face that +is originally covered by a mask. In particular, the autoencoder has great +potential for retaining important, general features of the image, while the +generative adversarial network (GAN) contributes strong generative power. The +authors implement a combination of the two models, the context encoder, explain +how it combines the strengths of both models, and train it with 50,000 images +of influencers' faces, yielding a solid result that still leaves room for +improvement. Furthermore, the authors discuss some shortcomings of the model +and their possible improvements, as well as areas of study for future +investigation from an application perspective and directions to further enhance +and refine the model. + +
+
+ comment: 6 pages, 3 figures +
+
+
+
+
+ + ☆ Unbiased Face Synthesis With Diffusion Models: Are We There Yet? + + +
+ Text-to-image diffusion models have achieved widespread popularity due to +their unprecedented image generation capability. In particular, their ability +to synthesize and modify human faces has spurred research into using generated +face images in both training data augmentation and model performance +assessments. In this paper, we study the efficacy and shortcomings of +generative models in the context of face generation. Utilizing a combination of +qualitative and quantitative measures, including embedding-based metrics and +user studies, we present a framework to audit the characteristics of generated +faces conditioned on a set of social attributes. We applied our framework on +faces generated through state-of-the-art text-to-image diffusion models. We +identify several limitations of face image generation that include faithfulness +to the text prompt, demographic disparities, and distributional shifts. +Furthermore, we present an analytical model that provides insights into how +training data selection contributes to the performance of generative models. + +
+
+
+
+
+ + ☆ So you think you can track? + + +
+ This work introduces a multi-camera tracking dataset consisting of 234 hours +of video data recorded concurrently from 234 overlapping HD cameras covering a +4.2 mile stretch of 8-10 lane interstate highway near Nashville, TN. The video +is recorded during a period of high traffic density with 500+ objects typically +visible within the scene and typical object longevities of 3-15 minutes. GPS +trajectories from 270 vehicle passes through the scene are manually corrected +in the video data to provide a set of ground-truth trajectories for +recall-oriented tracking metrics, and object detections are provided for each +camera in the scene (159 million total before cross-camera fusion). Initial +benchmarking of tracking-by-detection algorithms is performed against the GPS +trajectories, and a best HOTA of only 9.5% is obtained (best recall 75.9% at +IOU 0.1, 47.9 average IDs per ground truth object), indicating the benchmarked +trackers do not perform sufficiently well at the long temporal and spatial +durations required for traffic scene understanding. + +
+
+
+
+
+ + ☆ Automated segmentation of rheumatoid arthritis immunohistochemistry + stained synovial tissue + + +
+ Rheumatoid Arthritis (RA) is a chronic, autoimmune disease which primarily +affects the joint's synovial tissue. It is a highly heterogeneous disease, with +wide cellular and molecular variability observed in synovial tissues. Over the +last two decades, the methods available for their study have advanced +considerably. In particular, Immunohistochemistry stains are well suited to +highlighting the functional organisation of samples. Yet, analysis of +IHC-stained synovial tissue samples is still overwhelmingly done manually and +semi-quantitatively by expert pathologists. This is because in addition to the +fragmented nature of IHC stained synovial tissue, there exist wide variations +in intensity and colour, strong clinical centre batch effect, as well as the +presence of many undesirable artefacts present in gigapixel Whole Slide Images +(WSIs), such as water droplets, pen annotation, folded tissue, blurriness, etc. +There is therefore a strong need for a robust, repeatable automated tissue +segmentation algorithm which can cope with this variability and provide support +to imaging pipelines. We train a UNET on a hand-curated, heterogeneous +real-world multi-centre clinical dataset R4RA, which contains multiple types of +IHC staining. The model obtains a DICE score of 0.865 and successfully segments +different types of IHC staining, as well as dealing with variance in colours, +intensity and common WSIs artefacts from the different clinical centres. It can +be used as the first step in an automated image analysis pipeline for synovial +tissue samples stained with IHC, increasing speed, reproducibility and +robustness. + +
+
+
+
+
+ + ☆ Mitigate Replication and Copying in Diffusion Models with Generalized + Caption and Dual Fusion Enhancement + + +
+ While diffusion models demonstrate a remarkable capability for generating +high-quality images, their tendency to `replicate' training data raises privacy +concerns. Although recent research suggests that this replication may stem from +the insufficient generalization of training data captions and duplication of +training images, effective mitigation strategies remain elusive. To address +this gap, our paper first introduces a generality score that measures caption +generality and employs a large language model (LLM) to generalize training +captions. Subsequently, we leverage the generalized captions and propose a +novel dual fusion enhancement approach to mitigate the replication of diffusion +models. Our empirical results demonstrate that our proposed methods can +significantly reduce replication by 43.5% compared to the original diffusion +model while maintaining the diversity and quality of generations. + +
+
+
+
+
+ + ☆ LInKs "Lifting Independent Keypoints" -- Partial Pose Lifting for + Occlusion Handling with Improved Accuracy in 2D-3D Human Pose Estimation + + +
+ We present LInKs, a novel unsupervised learning method to recover 3D human +poses from 2D kinematic skeletons obtained from a single image, even when +occlusions are present. Our approach follows a unique two-step process, which +involves first lifting the occluded 2D pose to the 3D domain, followed by +filling in the occluded parts using the partially reconstructed 3D coordinates. +This lift-then-fill approach leads to significantly more accurate results +compared to models that complete the pose in 2D space alone. Additionally, we +improve the stability and likelihood estimation of normalising flows through a +custom sampling function replacing PCA dimensionality reduction previously used +in prior work. Furthermore, we are the first to investigate if different parts +of the 2D kinematic skeleton can be lifted independently which we find by +itself reduces the error of current lifting approaches. We attribute this to +the reduction of long-range keypoint correlations. In our detailed evaluation, +we quantify the error under various realistic occlusion scenarios, showcasing +the versatility and applicability of our model. Our results consistently +demonstrate the superiority of handling all types of occlusions in 3D space +when compared to others that complete the pose in 2D space. Our approach also +exhibits consistent accuracy in scenarios without occlusion, as evidenced by a +7.9% reduction in reconstruction error compared to prior works on the Human3.6M +dataset. Furthermore, our method excels in accurately retrieving complete 3D +poses even in the presence of occlusions, making it highly applicable in +situations where complete 2D pose information is unavailable. + +
+
+
+
+
+ + ♻ ☆ Edge-MoE: Memory-Efficient Multi-Task Vision Transformer Architecture + with Task-level Sparsity via Mixture-of-Experts + + +
+ Computer vision researchers are embracing two promising paradigms: Vision
+Transformers (ViTs) and Multi-task Learning (MTL), which both show great
+performance but are computation-intensive, given the quadratic complexity of
+self-attention in ViT and the need to activate an entire large MTL model for
+one task. M$^3$ViT is the latest multi-task ViT model that introduces
+mixture-of-experts (MoE), where only a small portion of subnetworks ("experts")
+are sparsely and dynamically activated based on the current task. M$^3$ViT
+achieves better accuracy and over 80% computation reduction but leaves
+challenges for efficient deployment on FPGA.
+ Our work, dubbed Edge-MoE, solves these challenges by introducing the first
+end-to-end FPGA accelerator for multi-task ViT with a collection of
+architectural innovations, including (1) a novel reordering mechanism for
+self-attention, which requires only constant bandwidth regardless of the target
+parallelism; (2) a fast single-pass softmax approximation; (3) an accurate and
+low-cost GELU approximation; (4) a unified and flexible computing unit that is
+shared by almost all computational layers to maximally reduce resource usage;
+and (5) uniquely for M$^3$ViT, a novel patch reordering method to eliminate
+memory access overhead. Edge-MoE achieves 2.24x and 4.90x better energy
+efficiency compared with GPU and CPU, respectively. A real-time video
+demonstration is available online, along with our open-source code written
+using High-Level Synthesis.
+
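The abstract does not spell out which low-cost GELU approximation is used on the FPGA; as a reference point, the widely used tanh-based approximation of GELU looks as follows (a NumPy/SciPy sketch of the standard formula, not necessarily the variant implemented in Edge-MoE):

```python
import numpy as np
from scipy.special import erf

def gelu_exact(x):
    # GELU defined via the Gaussian CDF (erf form)
    return 0.5 * x * (1.0 + erf(x / np.sqrt(2.0)))

def gelu_tanh_approx(x):
    # common tanh-based approximation, cheap enough for hardware-friendly pipelines
    return 0.5 * x * (1.0 + np.tanh(np.sqrt(2.0 / np.pi) * (x + 0.044715 * x ** 3)))

x = np.linspace(-4.0, 4.0, 1001)
print(np.max(np.abs(gelu_exact(x) - gelu_tanh_approx(x))))  # max absolute error stays small
```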
+
+ comment: 11 pages, 12 figures. Accepted at ICCAD 2023 +
+
+
+
+
+ + ♻ ☆ Boost Video Frame Interpolation via Motion Adaptation BMVC 2023 + + +
+ Video frame interpolation (VFI) is a challenging task that aims to generate +intermediate frames between two consecutive frames in a video. Existing +learning-based VFI methods have achieved great success, but they still suffer +from limited generalization ability due to the limited motion distribution of +training datasets. In this paper, we propose a novel optimization-based VFI +method that can adapt to unseen motions at test time. Our method is based on a +cycle-consistency adaptation strategy that leverages the motion characteristics +among video frames. We also introduce a lightweight adapter that can be +inserted into the motion estimation module of existing pre-trained VFI models +to improve the efficiency of adaptation. Extensive experiments on various +benchmarks demonstrate that our method can boost the performance of two-frame +VFI models, outperforming the existing state-of-the-art methods, even those +that use extra input. + +
+
+ comment: Accepted by BMVC 2023 (Oral Presentation) +
+
+
+
+
+ + ♻ ☆ AnomalyGPT: Detecting Industrial Anomalies using Large Vision-Language + Models + + +
+ Large Vision-Language Models (LVLMs) such as MiniGPT-4 and LLaVA have
+demonstrated the capability of understanding images and achieved remarkable
+performance in various visual tasks. Despite their strong abilities in
+recognizing common objects due to extensive training datasets, they lack
+specific domain knowledge and have a weaker understanding of localized details
+within objects, which hinders their effectiveness in the Industrial Anomaly
+Detection (IAD) task. On the other hand, most existing IAD methods only provide
+anomaly scores and necessitate the manual setting of thresholds to distinguish
+between normal and abnormal samples, which restricts their practical
+implementation. In this paper, we explore the utilization of LVLMs to address
+the IAD problem and propose AnomalyGPT, a novel IAD approach based on LVLMs. We
+generate training data by simulating anomalous images and producing
+corresponding textual descriptions for each image. We also employ an image
+decoder to provide fine-grained semantics and design a prompt learner to
+fine-tune the LVLM using prompt embeddings. Our AnomalyGPT eliminates the need
+for manual threshold adjustments and thus directly assesses the presence and
+locations of anomalies. Additionally, AnomalyGPT supports multi-turn dialogues
+and exhibits impressive few-shot in-context learning capabilities. With only
+one normal shot, AnomalyGPT achieves state-of-the-art performance with an
+accuracy of 86.1%, an image-level AUC of 94.1%, and a pixel-level AUC of 95.3%
+on the MVTec-AD dataset. Code is available at
+https://github.com/CASIA-IVA-Lab/AnomalyGPT.
+
+
+ comment: Project page: https://anomalygpt.github.io +
+
+
+
+
+ + ♻ ☆ TMSA: Towards Arbitrary Text-driven Image Manipulation via Space + Alignment + + +
+ Recent GAN inversion methods have been able to successfully invert real image
+inputs to the corresponding editable latent codes in StyleGAN. By combining
+them with the language-vision model CLIP, several text-driven image
+manipulation methods have been proposed. However, these methods require extra
+costs to perform optimization for a certain image or a new attribute editing
+mode. To achieve a more efficient editing method, we propose a new Text-driven
+image Manipulation framework via Space Alignment (TMSA). The Space Alignment
+module aims to align the same semantic regions in the CLIP and StyleGAN spaces.
+Then, the text input can be projected directly into the StyleGAN space and used
+to find the semantic shift according to the text description. The framework can
+support arbitrary image editing modes without additional cost. Our work
+provides the user with an interface to control the attributes of a given image
+according to text input and obtain the result in real time. Extensive
+experiments demonstrate our superior performance over prior works.
+
+
+ comment: 8 pages, 12 figures +
+
+
+
+
+ + ♻ ☆ DWRSeg: Rethinking Efficient Acquisition of Multi-scale Contextual + Information for Real-time Semantic Segmentation + + +
+ Many current works directly adopt multi-rate depth-wise dilated convolutions
+to capture multi-scale contextual information simultaneously from one input
+feature map, thus improving the feature extraction efficiency for real-time
+semantic segmentation. However, this design may make multi-scale contextual
+information difficult to access because of the unreasonable structure and
+hyperparameters. To lower the difficulty of drawing multi-scale contextual
+information, we propose a highly efficient multi-scale feature extraction
+method, which decomposes the original single-step method into two steps: Region
+Residualization and Semantic Residualization. In this method, the multi-rate
+depth-wise dilated convolutions take a simpler role in feature extraction:
+performing simple semantic-based morphological filtering with one desired
+receptive field in the second step, based on each concise region-form feature
+map provided by the first step, to improve their efficiency. Moreover, the
+dilation rates and the capacity of dilated convolutions for each network stage
+are elaborated to fully utilize all the achievable region-form feature maps.
+Accordingly, we design a novel Dilation-wise Residual (DWR) module and a Simple
+Inverted Residual (SIR) module for the high- and low-level networks,
+respectively, and form a powerful DWR Segmentation (DWRSeg) network. Extensive
+experiments on the Cityscapes and CamVid datasets demonstrate the effectiveness
+of our method by achieving a state-of-the-art trade-off between accuracy and
+inference speed, in addition to being lighter weight. Without pretraining or
+resorting to any training trick, we achieve an mIoU of 72.7% on the Cityscapes
+test set at a speed of 319.5 FPS on one NVIDIA GeForce GTX 1080 Ti card, which
+exceeds the latest methods, running at 69.5 FPS, by 0.8% mIoU. The code and
+trained models are publicly available.
+
+
+
+
+
+ + ♻ ☆ Event and Entity Extraction from Generated Video Captions + + +
+ Annotation of multimedia data by humans is time-consuming and costly, while +reliable automatic generation of semantic metadata is a major challenge. We +propose a framework to extract semantic metadata from automatically generated +video captions. As metadata, we consider entities, the entities' properties, +relations between entities, and the video category. We employ two +state-of-the-art dense video captioning models with masked transformer (MT) and +parallel decoding (PVDC) to generate captions for videos of the ActivityNet +Captions dataset. Our experiments show that it is possible to extract entities, +their properties, relations between entities, and the video category from the +generated captions. We observe that the quality of the extracted information is +mainly influenced by the quality of the event localization in the video as well +as the performance of the event caption generation. + +
+
+ comment: Paper accepted at CD-MAKE 2023 +
+
+
+
+
+ + ♻ ☆ Deep Visual-Genetic Biometrics for Taxonomic Classification of Rare + Species + + +
+ Visual as well as genetic biometrics are routinely employed to identify +species and individuals in biological applications. However, no attempts have +been made in this domain to computationally enhance visual classification of +rare classes with little image data via genetics. In this paper, we thus +propose aligned visual-genetic inference spaces with the aim to implicitly +encode cross-domain associations for improved performance. We demonstrate for +the first time that such alignment can be achieved via deep embedding models +and that the approach is directly applicable to boosting long-tailed +recognition (LTR) particularly for rare species. We experimentally demonstrate +the efficacy of the concept via application to microscopic imagery of 30k+ +planktic foraminifer shells across 32 species when used together with +independent genetic data samples. Most importantly for practitioners, we show +that visual-genetic alignment can significantly benefit visual-only recognition +of the rarest species. Technically, we pre-train a visual ResNet50 deep +learning model using triplet loss formulations to create an initial embedding +space. We re-structure this space based on genetic anchors embedded via a +Sequence Graph Transform (SGT) and linked to visual data by cross-domain cosine +alignment. We show that an LTR approach improves the state-of-the-art across +all benchmarks and that adding our visual-genetic alignment improves per-class +and particularly rare tail class benchmarks significantly further. We conclude +that visual-genetic alignment can be a highly effective tool for complementing +visual biological data containing rare classes. The concept proposed may serve +as an important future tool for integrating genetics and imageomics towards a +more complete scientific representation of taxonomic spaces and life itself. +Code, weights, and data splits are published for full reproducibility. + +
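The visual embedding space described above is pre-trained with triplet loss formulations; a generic margin-based triplet loss in PyTorch (a sketch of the standard formulation, not the authors' exact training code) is:

```python
import torch
import torch.nn.functional as F

def triplet_loss(anchor, positive, negative, margin: float = 0.2):
    """Margin-based triplet loss on L2-normalised embeddings."""
    anchor, positive, negative = (F.normalize(t, dim=-1) for t in (anchor, positive, negative))
    d_pos = (anchor - positive).pow(2).sum(dim=-1)   # squared distance to a same-class sample
    d_neg = (anchor - negative).pow(2).sum(dim=-1)   # squared distance to an other-class sample
    return F.relu(d_pos - d_neg + margin).mean()

emb = torch.randn(8, 128)
loss = triplet_loss(emb, emb + 0.01 * torch.randn_like(emb), torch.randn(8, 128))
print(loss.item())
```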
+
+
+
+
+ + ♻ ☆ RFDforFin: Robust Deep Forgery Detection for GAN-generated Fingerprint + Images + + +
+ With the rapid development of image generation technologies, malicious abuse
+of GAN-generated fingerprint images poses a significant threat to public safety
+in certain circumstances. Although existing universal deep forgery detection
+approaches can be applied to detect fake fingerprint images, they are easily
+attacked and have poor robustness. Meanwhile, there is no specifically designed
+deep forgery detection method for fingerprint images. In this paper, we propose
+the first deep forgery detection approach for fingerprint images, which, to the
+best of our knowledge, combines unique ridge features of fingerprints and
+generation artifacts of GAN-generated images. Specifically, we first construct
+a ridge stream, which exploits the grayscale variations along the ridges to
+extract unique fingerprint-specific features. Then, we construct a generation
+artifact stream, in which the FFT-based spectrums of the input fingerprint
+images are exploited to extract more robust generation artifact features.
+Finally, the unique ridge features and generation artifact features are fused
+for binary classification (i.e., real or fake). Comprehensive experiments
+demonstrate that our proposed approach is effective and robust with low
+complexity.
+
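The generation-artifact stream relies on FFT-based spectrums of the input fingerprint images; a minimal NumPy sketch of such a log-magnitude spectrum feature (illustrative only, with random data standing in for a fingerprint image) is:

```python
import numpy as np

def log_magnitude_spectrum(gray_image: np.ndarray) -> np.ndarray:
    """2-D FFT log-magnitude spectrum with the DC component shifted to the centre."""
    spectrum = np.fft.fftshift(np.fft.fft2(gray_image.astype(np.float64)))
    return np.log1p(np.abs(spectrum))

img = np.random.rand(256, 256)          # stand-in for a grayscale fingerprint image
features = log_magnitude_spectrum(img)  # would feed the artifact stream in such a pipeline
print(features.shape, features.max())
```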
+
+ comment: 10 pages, 8 figures +
+
+
+
+
+ + ♻ ☆ Edge-aware Hard Clustering Graph Pooling for Brain Imaging Data + + +
+ Graph Convolutional Networks (GCNs) can capture non-Euclidean spatial +dependence between different brain regions, and the graph pooling operator in +GCNs is key to enhancing the representation learning capability and acquiring +abnormal brain maps. However, the majority of existing research designs graph +pooling operators only from the perspective of nodes while disregarding the +original edge features, in a way that not only confines graph pooling +application scenarios, but also diminishes its ability to capture critical +substructures. In this study, a clustering graph pooling method that first +supports multidimensional edge features, called Edge-aware hard clustering +graph pooling (EHCPool), is developed. EHCPool proposes the first +'Edge-to-node' score evaluation criterion based on edge features to assess node +feature significance. To more effectively capture the critical subgraphs, a +novel Iteration n-top strategy is further designed to adaptively learn sparse +hard clustering assignments for graphs. Subsequently, an innovative N-E +Aggregation strategy is presented to aggregate node and edge feature +information in each independent subgraph. The proposed model was evaluated on +multi-site brain imaging public datasets and yielded state-of-the-art +performance. We believe this method is the first deep learning tool with the +potential to probe different types of abnormal functional brain networks from +data-driven perspective. Core code is at: https://github.com/swfen/EHCPool. + +
+
+
+
+
+ + ♻ ☆ IC3D: Image-Conditioned 3D Diffusion for Shape Generation + + +
+ In recent years, Denoising Diffusion Probabilistic Models (DDPMs) have +demonstrated exceptional performance in various 2D generative tasks. Following +this success, DDPMs have been extended to 3D shape generation, surpassing +previous methodologies in this domain. While many of these models are +unconditional, some have explored the potential of using guidance from +different modalities. In particular, image guidance for 3D generation has been +explored through the utilization of CLIP embeddings. However, these embeddings +are designed to align images and text, and do not necessarily capture the +specific details needed for shape generation. To address this limitation and +enhance image-guided 3D DDPMs with augmented 3D understanding, we introduce +CISP (Contrastive Image-Shape Pre-training), obtaining a well-structured +image-shape joint embedding space. Building upon CISP, we then introduce IC3D, +a DDPM that harnesses CISP's guidance for 3D shape generation from single-view +images. This generative diffusion model outperforms existing benchmarks in both +quality and diversity of generated 3D shapes. Moreover, despite IC3D's +generative nature, its generated shapes are preferred by human evaluators over +a competitive single-view 3D reconstruction model. These properties contribute +to a coherent embedding space, enabling latent interpolation and conditioned +generation also from out-of-distribution images. We find IC3D able to generate +coherent and diverse completions also when presented with occluded views, +rendering it applicable in controlled real-world scenarios. + +
+
+ comment: 9 pages, 10 figures; appendix 6 pages, 4 figures +
+
+
+
+
+ + ♻ ☆ Action Sensitivity Learning for Temporal Action Localization ICCV 2023 + + +
+ Temporal action localization (TAL), which involves recognizing and locating +action instances, is a challenging task in video understanding. Most existing +approaches directly predict action classes and regress offsets to boundaries, +while overlooking the discrepant importance of each frame. In this paper, we +propose an Action Sensitivity Learning framework (ASL) to tackle this task, +which aims to assess the value of each frame and then leverage the generated +action sensitivity to recalibrate the training procedure. We first introduce a +lightweight Action Sensitivity Evaluator to learn the action sensitivity at the +class level and instance level, respectively. The outputs of the two branches +are combined to reweight the gradient of the two sub-tasks. Moreover, based on +the action sensitivity of each frame, we design an Action Sensitive Contrastive +Loss to enhance features, where the action-aware frames are sampled as positive +pairs to push away the action-irrelevant frames. The extensive studies on +various action localization benchmarks (i.e., MultiThumos, Charades, +Ego4D-Moment Queries v1.0, Epic-Kitchens 100, Thumos14 and ActivityNet1.3) show +that ASL surpasses the state-of-the-art in terms of average-mAP under multiple +types of scenarios, e.g., single-labeled, densely-labeled and egocentric. + +
+
+ comment: Accepted to ICCV 2023 +
+
+
+
+
+ + ♻ ☆ Degree-Controllable Lightweight Fast Style Transfer with Detail + Attention-enhanced + + +
+ Style transfer methods usually use pre-trained VGG or more complex models as
+encoders to achieve better effects. This leads to extremely slow processing of
+high-resolution images. To solve this problem, we propose a degree-controllable
+detail attention-enhanced lightweight fast style transfer (DcDaeLFST), which
+adopts a small, shallow, and compact architecture for efficient forward
+inference. Additionally, we exploit a global semantic-invariance loss to
+preserve the semantic and structural information of content images, and a local
+detail attention-enhanced module, together with a style discriminator, to
+preserve their detail information. Despite limited parameters, it can achieve
+overall better style matching performance. Most importantly, it is the first
+method that can control the degree of detail retention and style transfer based
+on subjective evaluation. In comparative experiments, our model is 17-250 times
+smaller and 0.26-6.5 times faster than other state-of-the-art models, with the
+fastest processing speed of 0.38s on 4K high-resolution images.
+
+
+
+
+
+ + ♻ ☆ Can you text what is happening? Integrating pre-trained language + encoders into trajectory prediction models for autonomous driving + + +
+ In autonomous driving tasks, scene understanding is the first step towards +predicting the future behavior of the surrounding traffic participants. Yet, +how to represent a given scene and extract its features are still open research +questions. In this study, we propose a novel text-based representation of +traffic scenes and process it with a pre-trained language encoder. + First, we show that text-based representations, combined with classical +rasterized image representations, lead to descriptive scene embeddings. Second, +we benchmark our predictions on the nuScenes dataset and show significant +improvements compared to baselines. Third, we show in an ablation study that a +joint encoder of text and rasterized images outperforms the individual encoders +confirming that both representations have their complementary strengths. + +
+
+
+
+
+ + ♻ ☆ Neural Gradient Regularizer + + +
+ Owing to its significant success, the prior imposed on gradient maps has
+consistently been a subject of great interest in the field of image processing.
+Total variation (TV), one of the most representative regularizers, is known for
+its ability to capture the intrinsic sparsity prior underlying gradient maps.
+Nonetheless, TV and its variants often underestimate the gradient maps, leading
+to the weakening of edges and details whose gradients should not be zero in the
+original image (i.e., image structures that are not describable by sparse
+priors on gradient maps). Recently, total deep variation (TDV) has been
+introduced, assuming the sparsity of feature maps, which provides a flexible
+regularization learned from large-scale datasets for a specific task. However,
+TDV requires retraining the network with image/task variations, limiting its
+versatility. To alleviate this issue, in this paper, we propose a neural
+gradient regularizer (NGR) that expresses the gradient map as the output of a
+neural network. Unlike existing methods, NGR does not rely on any subjective
+sparsity or other prior assumptions on image gradient maps, thereby avoiding
+the underestimation of gradient maps. NGR is applicable to various image types
+and different image processing tasks, functioning in a zero-shot learning
+fashion, making it a versatile and plug-and-play regularizer. Extensive
+experimental results demonstrate the superior performance of NGR over
+state-of-the-art counterparts for a range of different tasks, further
+validating its effectiveness and versatility.
+
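For readers unfamiliar with the total variation prior the abstract contrasts against, a minimal anisotropic TV computation in NumPy (illustrative only) is:

```python
import numpy as np

def total_variation(img: np.ndarray) -> float:
    """Anisotropic total variation: L1 norm of horizontal and vertical finite differences."""
    dv = np.abs(np.diff(img, axis=0)).sum()
    dh = np.abs(np.diff(img, axis=1)).sum()
    return float(dv + dh)

smooth = np.tile(np.linspace(0.0, 1.0, 32), (32, 1))
noisy = smooth + 0.1 * np.random.randn(32, 32)
print(total_variation(smooth), total_variation(noisy))  # noise sharply inflates the TV value
```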
+
+
+
+
+ + ♻ ☆ Improved Prognostic Prediction of Pancreatic Cancer Using Multi-Phase CT + by Integrating Neural Distance and Texture-Aware Transformer MICCAI 2023 + + +
+ Pancreatic ductal adenocarcinoma (PDAC) is a highly lethal cancer in which +the tumor-vascular involvement greatly affects the resectability and, thus, +overall survival of patients. However, current prognostic prediction methods +fail to explicitly and accurately investigate relationships between the tumor +and nearby important vessels. This paper proposes a novel learnable neural +distance that describes the precise relationship between the tumor and vessels +in CT images of different patients, adopting it as a major feature for +prognosis prediction. Besides, different from existing models that used CNNs or +LSTMs to exploit tumor enhancement patterns on dynamic contrast-enhanced CT +imaging, we improved the extraction of dynamic tumor-related texture features +in multi-phase contrast-enhanced CT by fusing local and global features using +CNN and transformer modules, further enhancing the features extracted across +multi-phase CT images. We extensively evaluated and compared the proposed +method with existing methods in the multi-center (n=4) dataset with 1,070 +patients with PDAC, and statistical analysis confirmed its clinical +effectiveness in the external test set consisting of three centers. The +developed risk marker was the strongest predictor of overall survival among +preoperative factors and it has the potential to be combined with established +clinical factors to select patients at higher risk who might benefit from +neoadjuvant therapy. + +
+
+ comment: MICCAI 2023 +
+
+
+
+
+ + ♻ ☆ Knockoffs-SPR: Clean Sample Selection in Learning with Noisy Labels + + +
+ A noisy training set usually leads to the degradation of the generalization +and robustness of neural networks. In this paper, we propose a novel +theoretically guaranteed clean sample selection framework for learning with +noisy labels. Specifically, we first present a Scalable Penalized Regression +(SPR) method, to model the linear relation between network features and one-hot +labels. In SPR, the clean data are identified by the zero mean-shift parameters +solved in the regression model. We theoretically show that SPR can recover +clean data under some conditions. Under general scenarios, the conditions may +be no longer satisfied; and some noisy data are falsely selected as clean data. +To solve this problem, we propose a data-adaptive method for Scalable Penalized +Regression with Knockoff filters (Knockoffs-SPR), which is provable to control +the False-Selection-Rate (FSR) in the selected clean data. To improve the +efficiency, we further present a split algorithm that divides the whole +training set into small pieces that can be solved in parallel to make the +framework scalable to large datasets. While Knockoffs-SPR can be regarded as a +sample selection module for a standard supervised training pipeline, we further +combine it with a semi-supervised algorithm to exploit the support of noisy +data as unlabeled data. Experimental results on several benchmark datasets and +real-world noisy datasets show the effectiveness of our framework and validate +the theoretical results of Knockoffs-SPR. Our code and pre-trained models are +available at https://github.com/Yikai-Wang/Knockoffs-SPR. + +
+
+ comment: update: refined theory and analysis, release code +
+
+
+
+
+ + ♻ ☆ Batch Implicit Neural Representation for MRI Parallel Reconstruction + + +
+ Magnetic resonance imaging (MRI) has long suffered from long acquisition
+times. MRI reconstruction is one solution to reduce scan time by skipping
+certain phase-encoding lines and then restoring high-quality images from the
+undersampled measurements. Recently, implicit neural representation (INR) has
+emerged as a new deep learning method that represents an object as a continuous
+function of spatial coordinates, and this function is normally parameterized by
+a multilayer perceptron (MLP). In this paper, we propose a novel MRI
+reconstruction method based on INR, which represents the fully-sampled images
+as a function of pixel coordinates and prior feature vectors of undersampled
+images, overcoming the generalization problem of INR. Specifically, we
+introduce a scale-embedded encoder to produce scale-independent pixel-specific
+features from MR images with different undersampling scales and then
+concatenate them with coordinate vectors to recover fully-sampled MR images via
+an MLP, thus achieving arbitrary-scale reconstruction. The performance of the
+proposed method was assessed through experiments on publicly available MRI
+datasets and compared with other reconstruction methods. Our quantitative
+evaluation demonstrates the superiority of the proposed method over alternative
+reconstruction methods.
+
+
+
+
+
+ + ♻ ☆ C-RITNet: Set Infrared and Visible Image Fusion Free from Complementary + Information Mining + + +
+ Infrared and visible image fusion (IVIF) aims to extract and integrate the +complementary information in two different modalities to generate high-quality +fused images with salient targets and abundant texture details. However, +current image fusion methods go to great lengths to excavate complementary +features, which is generally achieved through two efforts. On the one hand, the +feature extraction network is expected to have excellent performance in +extracting complementary information. On the other hand, complex fusion +strategies are often designed to aggregate the complementary information. In +other words, enabling the network to perceive and extract complementary +information is extremely challenging. Complicated fusion strategies, while +effective, still run the risk of losing weak edge details. To this end, this +paper rethinks the IVIF outside the box, proposing a complementary-redundant +information transfer network (C-RITNet). It reasonably transfers complementary +information into redundant one, which integrates both the shared and +complementary features from two modalities. Hence, the proposed method is able +to alleviate the challenges posed by the complementary information extraction +and reduce the reliance on sophisticated fusion strategies. Specifically, to +skillfully sidestep aggregating complementary information in IVIF, we first +design the mutual information transfer (MIT) module to mutually represent +features from two modalities, roughly transferring complementary information +into redundant one. Then, a redundant information acquisition supervised by +source image (RIASSI) module is devised to further ensure the +complementary-redundant information transfer after MIT. Meanwhile, we also +propose a structure information preservation (SIP) module to guarantee that the +edge structure information of the source images can be transferred to the +fusion results. + +
+
+
+
+
+ + ♻ ☆ Test Time Adaptation for Blind Image Quality Assessment ICCV 2023 + + +
+ While the design of blind image quality assessment (IQA) algorithms has +improved significantly, the distribution shift between the training and testing +scenarios often leads to a poor performance of these methods at inference time. +This motivates the study of test time adaptation (TTA) techniques to improve +their performance at inference time. Existing auxiliary tasks and loss +functions used for TTA may not be relevant for quality-aware adaptation of the +pre-trained model. In this work, we introduce two novel quality-relevant +auxiliary tasks at the batch and sample levels to enable TTA for blind IQA. In +particular, we introduce a group contrastive loss at the batch level and a +relative rank loss at the sample level to make the model quality aware and +adapt to the target data. Our experiments reveal that even using a small batch +of images from the test distribution helps achieve significant improvement in +performance by updating the batch normalization statistics of the source model. + +
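The last observation, that simply re-estimating batch-normalization statistics on a small batch of test images already helps, can be sketched as follows in PyTorch (a generic illustration of that mechanism only; the paper's full method additionally uses group-contrastive and relative-rank losses):

```python
import torch
import torch.nn as nn

def adapt_batchnorm_stats(model: nn.Module, test_batch: torch.Tensor, momentum: float = 0.1):
    """Update BatchNorm running statistics from a batch drawn from the test distribution."""
    for m in model.modules():
        if isinstance(m, nn.BatchNorm2d):
            m.train()                 # running stats are only updated in train mode
            m.momentum = momentum
    with torch.no_grad():             # forward pass only; no weights are changed
        model(test_batch)
    model.eval()
    return model

model = nn.Sequential(nn.Conv2d(3, 8, 3, padding=1), nn.BatchNorm2d(8), nn.ReLU())
adapt_batchnorm_stats(model, torch.randn(16, 3, 64, 64))
```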
+
+ comment: Accepted to ICCV 2023 +
+
+
+
+
+ + ♻ ☆ DETA: Denoised Task Adaptation for Few-Shot Learning + + +
+ Test-time task adaptation in few-shot learning aims to adapt a pre-trained
+task-agnostic model to capture task-specific knowledge of the test task,
+relying only on a few labeled support samples. Previous approaches generally
+focus on developing advanced algorithms to achieve the goal, while neglecting
+the inherent problems of the given support samples. In fact, with only a
+handful of samples available, the adverse effect of either the image noise
+(a.k.a. X-noise) or the label noise (a.k.a. Y-noise) from support samples can
+be severely amplified. To address this challenge, in this work we propose
+DEnoised Task Adaptation (DETA), a first, unified image- and label-denoising
+framework orthogonal to existing task adaptation approaches. Without extra
+supervision, DETA filters out task-irrelevant, noisy representations by taking
+advantage of both global visual information and local region details of support
+samples. On the challenging Meta-Dataset, DETA consistently improves the
+performance of a broad spectrum of baseline methods applied on various
+pre-trained models. Notably, by tackling the overlooked image noise in
+Meta-Dataset, DETA establishes new state-of-the-art results. Code is released
+at https://github.com/JimZAI/DETA.
+
+
+ comment: 10 pages, 5 figures +
+
+
+
+
+ + ♻ ☆ Depth-wise Decomposition for Accelerating Separable Convolutions in + Efficient Convolutional Neural Networks + + +
+ Very deep convolutional neural networks (CNNs) have been firmly established +as the primary methods for many computer vision tasks. However, most +state-of-the-art CNNs are large, which results in high inference latency. +Recently, depth-wise separable convolution has been proposed for image +recognition tasks on computationally limited platforms such as robotics and +self-driving cars. Though it is much faster than its counterpart, regular +convolution, accuracy is sacrificed. In this paper, we propose a novel +decomposition approach based on SVD, namely depth-wise decomposition, for +expanding regular convolutions into depthwise separable convolutions while +maintaining high accuracy. We show our approach can be further generalized to +the multi-channel and multi-layer cases, based on Generalized Singular Value +Decomposition (GSVD) [59]. We conduct thorough experiments with the latest +ShuffleNet V2 model [47] on both random synthesized dataset and a large-scale +image recognition dataset: ImageNet [10]. Our approach outperforms channel +decomposition [73] on all datasets. More importantly, our approach improves the +Top-1 accuracy of ShuffleNet V2 by ~2%. + +
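One plausible reading of an SVD-based depth-wise decomposition is a per-input-channel rank-1 factorisation of a regular convolution kernel into a depthwise kernel followed by 1x1 pointwise weights; the NumPy sketch below illustrates that idea only and is not the paper's (GSVD-based, multi-layer) algorithm:

```python
import numpy as np

def depthwise_decompose(W: np.ndarray):
    """Factor a regular conv kernel W (C_out, C_in, K, K) into a depthwise kernel
    D (C_in, K, K) and pointwise weights P (C_out, C_in) via per-channel rank-1 SVD."""
    c_out, c_in, k, _ = W.shape
    D = np.zeros((c_in, k, k))
    P = np.zeros((c_out, c_in))
    for c in range(c_in):
        M = W[:, c].reshape(c_out, k * k)
        U, S, Vt = np.linalg.svd(M, full_matrices=False)
        D[c] = Vt[0].reshape(k, k)        # spatial (depthwise) filter for input channel c
        P[:, c] = S[0] * U[:, 0]          # 1x1 (pointwise) mixing weights for channel c
    return D, P

W = np.random.randn(16, 8, 3, 3)
D, P = depthwise_decompose(W)
approx = np.einsum('oc,ckl->ockl', P, D)
print(np.linalg.norm(W - approx) / np.linalg.norm(W))  # relative rank-1 approximation error
```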
+
+
+
+
+ + ♻ ☆ Improving Visual Quality and Transferability of Adversarial Attacks on + Face Recognition Simultaneously with Adversarial Restoration + + +
+ Adversarial face examples possess two critical properties: Visual Quality and +Transferability. However, existing approaches rarely address these properties +simultaneously, leading to subpar results. To address this issue, we propose a +novel adversarial attack technique known as Adversarial Restoration +(AdvRestore), which enhances both visual quality and transferability of +adversarial face examples by leveraging a face restoration prior. In our +approach, we initially train a Restoration Latent Diffusion Model (RLDM) +designed for face restoration. Subsequently, we employ the inference process of +RLDM to generate adversarial face examples. The adversarial perturbations are +applied to the intermediate features of RLDM. Additionally, by treating RLDM +face restoration as a sibling task, the transferability of the generated +adversarial face examples is further improved. Our experimental results +validate the effectiveness of the proposed attack method. + +
+
+ comment: \copyright 2023 IEEE. Personal use of this material is permitted. + Permission from IEEE must be obtained for all other uses, in any current or + future media, including reprinting/republishing this material for advertising + or promotional purposes, creating new collective works, for resale or + redistribution to servers or lists, or reuse of any copyrighted component of + this work in other works +
+
+
+
+
+ + ♻ ☆ Distilling Cognitive Backdoor Patterns within an Image ICLR2023 + + +
+ This paper proposes a simple method to distill and detect backdoor patterns +within an image: \emph{Cognitive Distillation} (CD). The idea is to extract the +"minimal essence" from an input image responsible for the model's prediction. +CD optimizes an input mask to extract a small pattern from the input image that +can lead to the same model output (i.e., logits or deep features). The +extracted pattern can help understand the cognitive mechanism of a model on +clean vs. backdoor images and is thus called a \emph{Cognitive Pattern} (CP). +Using CD and the distilled CPs, we uncover an interesting phenomenon of +backdoor attacks: despite the various forms and sizes of trigger patterns used +by different attacks, the CPs of backdoor samples are all surprisingly and +suspiciously small. One thus can leverage the learned mask to detect and remove +backdoor examples from poisoned training datasets. We conduct extensive +experiments to show that CD can robustly detect a wide range of advanced +backdoor attacks. We also show that CD can potentially be applied to help +detect potential biases from face datasets. Code is available at +\url{https://github.com/HanxunH/CognitiveDistillation}. + +
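The core idea, optimising an input mask so that the masked image still produces the model's original output while the mask stays small, can be sketched in PyTorch as below (an illustrative re-implementation of the idea under assumed hyperparameters, not the released CognitiveDistillation code):

```python
import torch
import torch.nn.functional as F

def distill_cognitive_pattern(model, image, steps=100, lr=0.1, l1_weight=0.01):
    """Optimise a per-pixel mask so model(image * mask) matches model(image), with a sparse mask."""
    model.eval()
    with torch.no_grad():
        target_logits = model(image)
    mask = torch.full_like(image[:, :1], 0.5, requires_grad=True)   # single mask channel
    optimizer = torch.optim.Adam([mask], lr=lr)
    for _ in range(steps):
        m = mask.clamp(0.0, 1.0)
        loss = F.mse_loss(model(image * m), target_logits) + l1_weight * m.abs().mean()
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
    return mask.detach().clamp(0.0, 1.0)   # a tiny surviving region hints at a backdoor trigger

toy_model = torch.nn.Sequential(torch.nn.Flatten(), torch.nn.Linear(3 * 32 * 32, 10))
pattern_mask = distill_cognitive_pattern(toy_model, torch.rand(1, 3, 32, 32), steps=20)
print(pattern_mask.shape)
```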
+
+ comment: ICLR2023 +
+
+
+
+
+ + ♻ ☆ SAMPLING: Scene-adaptive Hierarchical Multiplane Images Representation + for Novel View Synthesis from a Single Image + + +
+ Recent novel view synthesis methods obtain promising results for relatively
+small scenes, e.g., indoor environments and scenes with a few objects, but tend
+to fail for unbounded outdoor scenes with a single image as input. In this
+paper, we introduce SAMPLING, a Scene-adaptive Hierarchical Multiplane Images
+Representation for Novel View Synthesis from a Single Image based on improved
+multiplane images (MPI). Observing that depth distribution varies significantly
+for unbounded outdoor scenes, we employ an adaptive-bins strategy for MPI to
+arrange planes in accordance with each scene image. To represent intricate
+geometry and multi-scale details, we further introduce a hierarchical
+refinement branch, which results in high-quality synthesized novel views. Our
+method demonstrates considerable performance gains in synthesizing large-scale
+unbounded outdoor scenes using a single image on the KITTI dataset and
+generalizes well to the unseen Tanks and Temples dataset. The code and models
+will soon be made available.
+
+
+
+
+
+ + ♻ ☆ Nearest Neighbor Sampling of Point Sets using Rays + + +
+ We propose a new framework for the sampling, compression, and analysis of +distributions of point sets and other geometric objects embedded in Euclidean +spaces. Our approach involves constructing a tensor called the RaySense sketch, +which captures nearest neighbors from the underlying geometry of points along a +set of rays. We explore various operations that can be performed on the +RaySense sketch, leading to different properties and potential applications. +Statistical information about the data set can be extracted from the sketch, +independent of the ray set. Line integrals on point sets can be efficiently +computed using the sketch. We also present several examples illustrating +applications of the proposed strategy in practical scenarios. + +
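A bare-bones version of the sketching idea, sampling points along a ray and recording the nearest neighbour in the point set at each sample, can be written with SciPy's k-d tree (an illustration of the concept, not the authors' implementation):

```python
import numpy as np
from scipy.spatial import cKDTree

def raysense_sketch(points, origin, direction, n_samples=32, length=2.0):
    """Nearest point-set neighbour (and its distance) at evenly spaced samples along one ray."""
    direction = np.asarray(direction, dtype=float)
    direction /= np.linalg.norm(direction)
    t = np.linspace(0.0, length, n_samples)
    ray_pts = np.asarray(origin, dtype=float) + t[:, None] * direction
    tree = cKDTree(points)
    dists, idx = tree.query(ray_pts, k=1)
    return points[idx], dists

cloud = np.random.rand(1000, 3)
sketch, dists = raysense_sketch(cloud, origin=[0.0, 0.0, 0.0], direction=[1.0, 1.0, 1.0])
print(sketch.shape, dists.mean())
```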
+
+ comment: 48 pages, 14 figures, accepted to Communication on Applied + Mathematics and Computation (CAMC), Focused Issue in Honor of Prof. Stanley + Osher on the Occasion of His 80th Birthday. Fixed typos and improved + notations +
+
+
+
+
+ + ♻ ☆ Character Time-series Matching For Robust License Plate Recognition + + +
+ Automatic License Plate Recognition (ALPR) is becoming a popular study area
+and is applied in many fields such as transportation and smart cities. However,
+there are still several limitations when applying many current methods to
+practical problems, due to the variation in real-world situations such as light
+changes, unclear License Plate (LP) characters, and image quality. Most recent
+ALPR algorithms process a single frame, which reduces accuracy when image
+quality is poor. This paper presents methods to improve license plate
+recognition accuracy by tracking the license plate across multiple frames.
+First, the Adaptive License Plate Rotation algorithm is applied to correctly
+align the detected license plate. Second, we propose a method called Character
+Time-series Matching to recognize license plate characters from many
+consecutive frames. The proposed method achieves high performance on the
+UFPR-ALPR dataset, reaching \boldmath$96.7\%$ accuracy in real time on an RTX
+A5000 GPU card. We also deploy the algorithm for the Vietnamese ALPR system.
+The accuracies for license plate detection and character recognition are 0.881
+and 0.979 $mAP^{test}$@.5, respectively. The source code is available at
+https://github.com/chequanghuy/Character-Time-series-Matching.git
+
+
+
+
+
+ + ♻ ☆ NOPE-SAC: Neural One-Plane RANSAC for Sparse-View Planar 3D + Reconstruction + + +
+ This paper studies challenging two-view 3D reconstruction in a rigorous
+sparse-view configuration, which suffers from insufficient correspondences in
+the input image pairs for camera pose estimation. We present a novel Neural
+One-PlanE RANSAC framework (termed NOPE-SAC in short) that exerts excellent
+capability to learn one-plane pose hypotheses from 3D plane correspondences.
+Building on top of a siamese plane detection network, our NOPE-SAC first
+generates putative plane correspondences with a coarse initial pose. It then
+feeds the learned 3D plane parameters of the correspondences into shared MLPs
+to estimate the one-plane camera pose hypotheses, which are subsequently
+reweighted in a RANSAC manner to obtain the final camera pose. Because the
+neural one-plane pose minimizes the number of plane correspondences needed for
+adaptive pose hypothesis generation, it enables stable pose voting and reliable
+pose refinement from only a few plane correspondences for sparse-view inputs.
+In the experiments, we demonstrate that our NOPE-SAC significantly improves
+camera pose estimation for two-view inputs with severe viewpoint changes,
+setting several new state-of-the-art performances on two challenging
+benchmarks, i.e., MatterPort3D and ScanNet, for sparse-view 3D reconstruction.
+The source code is released at https://github.com/IceTTTb/NopeSAC for
+reproducible research.
+
+
+ comment: Accepted to IEEE TPAMI; Code is available at + https://github.com/IceTTTb/NopeSAC +
+
+
+
+
+ + ♻ ☆ LRANet: Towards Accurate and Efficient Scene Text Detection with + Low-Rank Approximation Network + + +
+ Recently, regression-based methods, which predict parameterized text shapes +for text localization, have gained popularity in scene text detection. However, +the existing parameterized text shape methods still have limitations in +modeling arbitrary-shaped texts due to ignoring the utilization of +text-specific shape information. Moreover, the time consumption of the entire +pipeline has been largely overlooked, leading to a suboptimal overall inference +speed. To address these issues, we first propose a novel parameterized text +shape method based on low-rank approximation. Unlike other shape representation +methods that employ data-irrelevant parameterization, our approach utilizes +singular value decomposition and reconstructs the text shape using a few +eigenvectors learned from labeled text contours. By exploring the shape +correlation among different text contours, our method achieves consistency, +compactness, simplicity, and robustness in shape representation. Next, we +propose a dual assignment scheme for speed acceleration. It adopts a sparse +assignment branch to accelerate the inference speed, and meanwhile, provides +ample supervised signals for training through a dense assignment branch. +Building upon these designs, we implement an accurate and efficient +arbitrary-shaped text detector named LRANet. Extensive experiments are +conducted on several challenging benchmarks, demonstrating the superior +accuracy and efficiency of LRANet compared to state-of-the-art methods. Code +will be released soon. + +
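The low-rank shape representation described above, reconstructing a text contour from a few coefficients over eigenvectors learned from labelled contours, is essentially a PCA/SVD shape basis; a toy NumPy sketch of that building block (random vectors stand in for real contour annotations, and real contours compress far better than noise) is:

```python
import numpy as np

rng = np.random.default_rng(0)
contours = rng.normal(size=(500, 28))      # stand-in: 500 contours, 14 (x, y) points each

mean = contours.mean(axis=0)
_, _, Vt = np.linalg.svd(contours - mean, full_matrices=False)
basis = Vt[:4]                             # keep only a few eigenvectors of the contour set

def encode(contour):
    return (contour - mean) @ basis.T      # a handful of coefficients per text instance

def decode(coeffs):
    return mean + coeffs @ basis           # reconstruct the full contour from coefficients

reconstruction = decode(encode(contours[0]))
print(reconstruction.shape, np.linalg.norm(reconstruction - contours[0]))
```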
+
            comment: There were some errors in the experimental results of the first
+      version, such as inaccurate measurement of FPS and low F-measure
+
+
+
+
+ + ♻ ☆ A large scale multi-view RGBD visual affordance learning dataset + + +
+ The physical and textural attributes of objects have been widely studied for
+recognition, detection and segmentation tasks in computer vision. A number of
+datasets, such as the large-scale ImageNet, have been proposed for feature
+learning using data-hungry deep neural networks and for hand-crafted feature
+extraction. To intelligently interact with objects, robots and intelligent
+machines need the ability to infer beyond the traditional physical/textural
+attributes, and to understand/learn visual cues, called visual affordances, for
+affordance recognition, detection and segmentation. To date there is no
+publicly available large dataset for visual affordance understanding and
+learning. In this paper, we introduce a large-scale multi-view RGBD visual
+affordance learning dataset, a benchmark of 47210 RGBD images from 37 object
+categories, annotated with 15 visual affordance categories. To the best of our
+knowledge, this is the first-ever and the largest multi-view RGBD visual
+affordance learning dataset. We benchmark the proposed dataset for affordance
+segmentation and recognition tasks using popular Vision Transformer and
+Convolutional Neural Networks. Several state-of-the-art deep learning networks
+are each evaluated for the affordance recognition and segmentation tasks. Our
+experimental results showcase the challenging nature of the dataset and present
+definite prospects for new and robust affordance learning algorithms. The
+dataset is publicly available at https://sites.google.com/view/afaqshah/dataset.
+
+
+
+
+
+ + ♻ ☆ Your Diffusion Model is Secretly a Zero-Shot Classifier ICCV 2023 + + +
+ The recent wave of large-scale text-to-image diffusion models has +dramatically increased our text-based image generation abilities. These models +can generate realistic images for a staggering variety of prompts and exhibit +impressive compositional generalization abilities. Almost all use cases thus +far have solely focused on sampling; however, diffusion models can also provide +conditional density estimates, which are useful for tasks beyond image +generation. In this paper, we show that the density estimates from large-scale +text-to-image diffusion models like Stable Diffusion can be leveraged to +perform zero-shot classification without any additional training. Our +generative approach to classification, which we call Diffusion Classifier, +attains strong results on a variety of benchmarks and outperforms alternative +methods of extracting knowledge from diffusion models. Although a gap remains +between generative and discriminative approaches on zero-shot recognition +tasks, our diffusion-based approach has significantly stronger multimodal +compositional reasoning ability than competing discriminative approaches. +Finally, we use Diffusion Classifier to extract standard classifiers from +class-conditional diffusion models trained on ImageNet. Our models achieve +strong classification performance using only weak augmentations and exhibit +qualitatively better "effective robustness" to distribution shift. Overall, our +results are a step toward using generative over discriminative models for +downstream tasks. Results and visualizations at +https://diffusion-classifier.github.io/ + +
+
+ comment: In ICCV 2023. Website at https://diffusion-classifier.github.io/ +
+
+
+
+
+ + ♻ ☆ Selection of contributing factors for predicting landslide + susceptibility using machine learning and deep learning models + + +
+ Landslides are a common natural disaster that can cause casualties, property
+safety threats and economic losses. Therefore, it is important to understand or
+predict the probability of landslide occurrence at potentially risky sites. A
+commonly used means is to carry out a landslide susceptibility assessment based
+on a landslide inventory and a set of landslide contributing factors. This can
+be readily achieved using machine learning (ML) models such as logistic
+regression (LR), support vector machine (SVM), random forest (RF), extreme
+gradient boosting (Xgboost), or deep learning (DL) models such as convolutional
+neural network (CNN) and long short-term memory (LSTM). As the input data for
+these models, landslide contributing factors have varying influences on
+landslide occurrence. Therefore, it is logically feasible to select more
+important contributing factors and eliminate less relevant ones, with the aim
+of increasing the prediction accuracy of these models. However, selecting more
+important factors is still a challenging task and there is no generally
+accepted method. Furthermore, the effects of factor selection using various
+methods on the prediction accuracy of ML and DL models are unclear. In this
+study, the impact of the selection of contributing factors on the accuracy of
+landslide susceptibility predictions using ML and DL models was investigated.
+Several methods for selecting contributing factors were considered for all the
+aforementioned ML and DL models, including Information Gain Ratio (IGR),
+Recursive Feature Elimination (RFE), Particle Swarm Optimization (PSO), Least
+Absolute Shrinkage and Selection Operator (LASSO) and Harris Hawk Optimization
+(HHO). In addition, autoencoder-based factor selection methods for DL models
+were also investigated. To assess their performances, an exhaustive approach
+was adopted,...
+
+
+ comment: Stochastic Environmental Research and Risk Assessment +
+
+
+
+
+ + ♻ ☆ EmbodiedGPT: Vision-Language Pre-Training via Embodied Chain of Thought + + +
+ Embodied AI is a crucial frontier in robotics, capable of planning and +executing action sequences for robots to accomplish long-horizon tasks in +physical environments. In this work, we introduce EmbodiedGPT, an end-to-end +multi-modal foundation model for embodied AI, empowering embodied agents with +multi-modal understanding and execution capabilities. To achieve this, we have +made the following efforts: (i) We craft a large-scale embodied planning +dataset, termed EgoCOT. The dataset consists of carefully selected videos from +the Ego4D dataset, along with corresponding high-quality language instructions. +Specifically, we generate a sequence of sub-goals with the "Chain of Thoughts" +mode for effective embodied planning. (ii) We introduce an efficient training +approach to EmbodiedGPT for high-quality plan generation, by adapting a 7B +large language model (LLM) to the EgoCOT dataset via prefix tuning. (iii) We +introduce a paradigm for extracting task-related features from LLM-generated +planning queries to form a closed loop between high-level planning and +low-level control. Extensive experiments show the effectiveness of EmbodiedGPT +on embodied tasks, including embodied planning, embodied control, visual +captioning, and visual question answering. Notably, EmbodiedGPT significantly +enhances the success rate of the embodied control task by extracting more +effective features. It has achieved a remarkable 1.6 times increase in success +rate on the Franka Kitchen benchmark and a 1.3 times increase on the Meta-World +benchmark, compared to the BLIP-2 baseline fine-tuned with the Ego4D dataset. + +
+
+
+
+
+ + ♻ ☆ Physically Grounded Vision-Language Models for Robotic Manipulation + + +
+ Recent advances in vision-language models (VLMs) have led to improved +performance on tasks such as visual question answering and image captioning. +Consequently, these models are now well-positioned to reason about the physical +world, particularly within domains such as robotic manipulation. However, +current VLMs are limited in their understanding of the physical concepts (e.g., +material, fragility) of common objects, which restricts their usefulness for +robotic manipulation tasks that involve interaction and physical reasoning +about such objects. To address this limitation, we propose PhysObjects, an +object-centric dataset of 39.6K crowd-sourced and 417K automated physical +concept annotations of common household objects. We demonstrate that +fine-tuning a VLM on PhysObjects improves its understanding of physical object +concepts, including generalization to held-out concepts, by capturing human +priors of these concepts from visual appearance. We incorporate this +physically-grounded VLM in an interactive framework with a large language +model-based robotic planner, and show improved planning performance on tasks +that require reasoning about physical object concepts, compared to baselines +that do not leverage physically-grounded VLMs. We additionally illustrate the +benefits of our physically-grounded VLM on a real robot, where it improves task +success rates. We release our dataset and provide further details and +visualizations of our results at https://iliad.stanford.edu/pg-vlm/. + +
+
+ comment: Updated generalization results on held-out concepts +
+
+
+
+
+ + ♻ ☆ Efficient Spatially Sparse Inference for Conditional GANs and Diffusion + Models NeurIPS 2022 + + +
+ During image editing, existing deep generative models tend to re-synthesize
+the entire output from scratch, including the unedited regions. This leads to a
+significant waste of computation, especially for minor editing operations. In
+this work, we present Spatially Sparse Inference (SSI), a general-purpose
+technique that selectively performs computation for edited regions and
+accelerates various generative models, including both conditional GANs and
+diffusion models. Our key observation is that users tend to edit the input
+image gradually. This motivates us to cache and reuse the feature maps of the
+original image. Given an edited image, we sparsely apply the convolutional
+filters to the edited regions while reusing the cached features for the
+unedited areas. Based on our algorithm, we further propose the Sparse
+Incremental Generative Engine (SIGE) to convert the computation reduction into
+latency reduction on off-the-shelf hardware. With about $1\%$-area edits, SIGE
+accelerates DDPM by $3.0\times$ on NVIDIA RTX 3090 and $4.6\times$ on Apple M1
+Pro GPU, Stable Diffusion by $7.2\times$ on 3090, and GauGAN by $5.6\times$ on
+3090 and $5.2\times$ on M1 Pro GPU. Compared to our conference version, we
+extend SIGE to accommodate attention layers and apply it to Stable Diffusion.
+Additionally, we offer support for Apple M1 Pro GPU and include more results
+with large and sequential edits.
+
+
+ comment: NeurIPS 2022 T-PAMI 2023 Website: https://www.cs.cmu.edu/~sige/ Code: + https://github.com/lmxyy/sige +
+
+
+
+
+ + ♻ ☆ P1AC: Revisiting Absolute Pose From a Single Affine Correspondence ICCV 2023 + + +
+ Affine correspondences have traditionally been used to improve feature +matching over wide baselines. While recent work has successfully used affine +correspondences to solve various relative camera pose estimation problems, less +attention has been given to their use in absolute pose estimation. We introduce +the first general solution to the problem of estimating the pose of a +calibrated camera given a single observation of an oriented point and an affine +correspondence. The advantage of our approach (P1AC) is that it requires only a +single correspondence, in comparison to the traditional point-based approach +(P3P), significantly reducing the combinatorics in robust estimation. P1AC +provides a general solution that removes restrictive assumptions made in prior +work and is applicable to large-scale image-based localization. We propose a +minimal solution to the P1AC problem and evaluate our novel solver on synthetic +data, showing its numerical stability and performance under various types of +noise. On standard image-based localization benchmarks we show that P1AC +achieves more accurate results than the widely used P3P algorithm. Code for our +method is available at https://github.com/jonathanventura/P1AC/ . + +
+
+ comment: ICCV 2023 (with corrections in eqs. 6 and 13) +
+
+
+
+
+ + ♻ ☆ Autocharacterization: Automated and Scalable Semiconductor Property + Estimation from High-throughput Experiments using Computer Vision + + +
+ High-throughput materials synthesis methods have risen in popularity due to +their potential to accelerate the design and discovery of novel functional +materials, such as solution-processed semiconductors. After synthesis, key +material properties must be measured and characterized to validate discovery +and provide feedback to optimization cycles. However, with the boom in +development of high-throughput synthesis tools that champion production rates +up to $10^4$ samples per hour with flexible form factors, most sample +characterization methods are either slow (conventional rates of $10^1$ samples +per hour, approximately 1000x slower) or rigid (e.g., designed for +standard-size microplates), resulting in a bottleneck that impedes the +materials-design process. To overcome this challenge, we propose a set of +automated material property characterization (autocharacterization) tools that +leverage the adaptive, parallelizable, and scalable nature of computer vision +to accelerate the throughput of characterization by 85x compared to the +non-automated workflow. We demonstrate a generalizable composition mapping tool +for high-throughput synthesized binary material systems as well as two scalable +autocharacterization algorithms that (1) autonomously compute the band gap of +200 unique compositions in 6 minutes and (2) autonomously compute the degree of +degradation in 200 unique compositions in 20 minutes, generating ultra-high +compositional resolution trends of band gap and stability. We demonstrate that +the developed band gap and degradation detection autocharacterization methods +achieve 98.5% accuracy and 96.9% accuracy, respectively, on the +FA$_{1-x}$MA$_{x}$PbI$_3$, $0\leq x \leq 1$ perovskite semiconductor system. + +
+
+ comment: Manuscript 18 pages; Supplemental 20 pages +
+
+
+
+
+
+
+
+ + Information Retrieval 9 + +
+
+
+ + ☆ Résumé Parsing as Hierarchical Sequence Labeling: An Empirical Study RecSys + + +
+ Extracting information from résumés is typically formulated as a two-stage
+problem, where the document is first segmented into sections and then each
+section is processed individually to extract the target entities. Instead, we
+cast the whole problem as sequence labeling at two levels -- lines and tokens
+-- and study model architectures for solving both tasks simultaneously. We
+build high-quality résumé parsing corpora in English, French, Chinese, Spanish,
+German, Portuguese, and Swedish. Based on these corpora, we present
+experimental results that demonstrate the effectiveness of the proposed models
+for the information extraction task, outperforming approaches introduced in
+previous work. We conduct an ablation study of the proposed architectures. We
+also analyze both model performance and resource efficiency, and describe the
+trade-offs for model deployment in the context of a production environment.
+
+
+ comment: RecSys in HR'23: The 3rd Workshop on Recommender Systems for Human + Resources, in conjunction with the 17th ACM Conference on Recommender + Systems, September 18--22, 2023, Singapore, Singapore +
+
+
+
+
+ + ☆ Modeling Dislocation Dynamics Data Using Semantic Web Technologies + + +
+ Research in the field of Materials Science and Engineering focuses on the +design, synthesis, properties, and performance of materials. An important class +of materials that is widely investigated is crystalline materials, including +metals and semiconductors. Crystalline materials typically contain a distinct +type of defect called "dislocation". This defect significantly affects various +material properties, including strength, fracture toughness, and ductility. +Researchers have devoted significant effort in recent years to understanding +dislocation behavior through experimental characterization techniques and +simulations, e.g., dislocation dynamics simulations. This paper presents how +data from dislocation dynamics simulations can be modeled using semantic web +technologies by annotating the data with ontologies. We extend the already +existing Dislocation Ontology by adding missing concepts and aligning it with +two other domain-related ontologies (i.e., the Elementary Multi-perspective +Material Ontology and the Materials Design Ontology), allowing the dislocation +simulation data to be represented efficiently. Moreover, we show a real-world use +case by representing the discrete dislocation dynamics data as a knowledge +graph (DisLocKG) that illustrates the relationships within the data. We also +developed a SPARQL endpoint that offers extensive flexibility for querying +DisLocKG. +
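+ Querying such a knowledge graph through a SPARQL endpoint might look as follows; the endpoint URL, namespace, and class name are placeholders, not the actual DisLocKG deployment.
+from SPARQLWrapper import SPARQLWrapper, JSON
+
+sparql = SPARQLWrapper("http://example.org/dislockg/sparql")  # placeholder URL
+sparql.setReturnFormat(JSON)
+sparql.setQuery("""
+    PREFIX dislo: <http://example.org/dislocation-ontology#>
+    SELECT ?segment WHERE { ?segment a dislo:DislocationSegment } LIMIT 10
+""")
+results = sparql.query().convert()
+for row in results["results"]["bindings"]:
+    print(row["segment"]["value"])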
+
+
+
+
+ + ☆ Multi-behavior Recommendation with SVD Graph Neural Networks + + +
+ Graph Neural Networks (GNNs) have been extensively employed in the field of +recommender systems, offering users personalized recommendations and yielding +remarkable outcomes. Recently, GNNs incorporating contrastive learning have +demonstrated promising performance in handling the sparse data problem of +recommender systems. However, existing contrastive learning methods still +have limitations in addressing the cold-start problem and resisting noise +interference, especially for multi-behavior recommendation. To mitigate the +aforementioned issues, this work proposes a GNN-based multi-behavior +recommendation model, MB-SVD, that utilizes Singular Value Decomposition (SVD) +graphs to enhance model performance. In particular, MB-SVD considers user +preferences under different behaviors, improving recommendation effectiveness +while better addressing the cold-start problem. Our model introduces an +innovative methodology, which subsumes a multi-behavior contrastive learning +paradigm to discern the intricate interconnections among +heterogeneous manifestations of user behavior and generates SVD graphs to +automate the distillation of crucial multi-behavior self-supervised information +for robust graph augmentation. Furthermore, the SVD-based framework reduces the +embedding dimensions and computational load. Thorough experiments showcase +the strong performance of our proposed MB-SVD approach on multi-behavior +recommendation across diverse real-world datasets. +
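+ The core SVD-graph idea can be sketched as a low-rank reconstruction of an interaction matrix that serves as an augmentation view; this is a generic sketch with made-up sizes, not the MB-SVD implementation.
+import numpy as np
+from scipy.sparse import random as sparse_random
+from scipy.sparse.linalg import svds
+
+# Hypothetical user-item interactions for one behavior type (e.g., clicks).
+A = sparse_random(1000, 500, density=0.01, format="csr", random_state=0)
+
+# Rank-k truncated SVD; the low-rank reconstruction acts as a denoised
+# "SVD graph" that can be used as a contrastive augmentation view.
+k = 32
+U, s, Vt = svds(A, k=k)
+A_aug = (U * s) @ Vt  # dense low-rank approximation of the interaction graph
+print(A_aug.shape)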
+
+
+
+
+ + ☆ Towards the TopMost: A Topic Modeling System Toolkit + + +
+ Topic models have been proposed for decades with various applications and +recently refreshed by neural variational inference. However, these topic +models adopt entirely distinct datasets, implementations, and evaluation settings, +which hinders their quick utilization and fair comparison, and greatly +slows the research progress of topic models. To address these issues, in this +paper we propose a Topic Modeling System Toolkit (TopMost). Compared to +existing toolkits, TopMost stands out by covering a wider range of topic +modeling scenarios, including complete lifecycles with dataset pre-processing, +model training, testing, and evaluation. The highly cohesive and decoupled +modular design of TopMost enables quick utilization, fair comparisons, and +flexible extensions of different topic models. This can facilitate the research +and applications of topic models. Our code, tutorials, and documentation are +available at https://github.com/bobxwu/topmost. +
+
+
+
+
+ + ☆ ProMap: Datasets for Product Mapping in E-commerce + + +
+ The goal of product mapping is to decide whether two listings from two +different e-shops describe the same products. Existing datasets of matching and +non-matching pairs of products, however, often suffer from incomplete product +information or contain only very distant non-matching products. Therefore, +while predictive models trained on these datasets achieve good results on them, +in practice, they are unusable as they cannot distinguish very similar but +non-matching pairs of products. This paper introduces two new datasets for +product mapping: ProMapCz consisting of 1,495 Czech product pairs and ProMapEn +consisting of 1,555 English product pairs of matching and non-matching products +manually scraped from two pairs of e-shops. The datasets contain both images +and textual descriptions of the products, including their specifications, +making them one of the most complete datasets for product mapping. +Additionally, the non-matching products were selected in two phases, creating +two types of non-matches -- close non-matches and medium non-matches. Even the +medium non-matches are pairs of products that are much more similar than +non-matches in other datasets -- for example, they still need to have the same +brand and similar name and price. After simple data preprocessing, several +machine learning algorithms were trained on these and two other datasets to +demonstrate the complexity and completeness of the ProMap datasets. The ProMap datasets +are presented as a gold standard for further research on product mapping, +filling the gaps in existing datasets. +
+
+
+
+
+ + ☆ An Image Dataset for Benchmarking Recommender Systems with Raw Pixels + + +
+ Recommender systems (RS) have achieved significant success by leveraging +explicit identification (ID) features. However, the full potential of content +features, especially the pure image pixel features, remains relatively +unexplored. The limited availability of large, diverse, and content-driven +image recommendation datasets has hindered the use of raw images as item +representations. In this regard, we present PixelRec, a massive image-centric +recommendation dataset that includes approximately 200 million user-image +interactions, 30 million users, and 400,000 high-quality cover images. By +providing direct access to raw image pixels, PixelRec enables recommendation +models to learn item representations directly from them. To demonstrate its +utility, we begin by presenting the results of several classical pure ID-based +baseline models, termed IDNet, trained on PixelRec. Then, to show the +effectiveness of the dataset's image features, we substitute the itemID +embeddings (from IDNet) with a powerful vision encoder that represents items +using their raw image pixels. This new model is dubbed PixelNet. Our findings +indicate that even in standard, non-cold start recommendation settings where +IDNet is recognized as highly effective, PixelNet can already perform as well +as or even better than IDNet. Moreover, PixelNet has several other notable +advantages over IDNet, such as being more effective in cold-start and +cross-domain recommendation scenarios. These results underscore the importance +of visual features in PixelRec. We believe that PixelRec can serve as a +critical resource and testing ground for research on recommendation models that +emphasize image pixel content. The dataset, code, and leaderboard will be +available at https://github.com/website-pixelrec/PixelRec. +
+
+
+
+
+ + ☆ CONVERSER: Few-Shot Conversational Dense Retrieval with Synthetic Data + Generation SIGDIAL 2023 + + +
+ Conversational search provides a natural interface for information retrieval +(IR). Recent approaches have demonstrated promising results in applying dense +retrieval to conversational IR. However, training dense retrievers requires +large amounts of in-domain paired data. This hinders the development of +conversational dense retrievers, as abundant in-domain conversations are +expensive to collect. In this paper, we propose CONVERSER, a framework for +training conversational dense retrievers with at most 6 examples of in-domain +dialogues. Specifically, we utilize the in-context learning capability of large +language models to generate conversational queries given a passage in the +retrieval corpus. Experimental results on conversational retrieval benchmarks +OR-QuAC and TREC CAsT 19 show that the proposed CONVERSER achieves comparable +performance to fully-supervised models, demonstrating the effectiveness of our +proposed framework in few-shot conversational dense retrieval. All source code +and generated datasets are available at https://github.com/MiuLab/CONVERSER + +
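+ The in-context generation step can be pictured as assembling a few-shot prompt from a handful of in-domain (passage, query) pairs; the template wording and example below are assumptions, not the paper's exact prompt or model.
+def build_query_generation_prompt(examples, passage):
+    # Few-shot prompt asking an LLM to write a conversational query
+    # that the given passage answers.
+    parts = []
+    for ex in examples:
+        parts.append(f"Passage: {ex['passage']}\nConversational query: {ex['query']}\n")
+    parts.append(f"Passage: {passage}\nConversational query:")
+    return "\n".join(parts)
+
+demo = [{"passage": "The Giant's Causeway is in Northern Ireland.",
+         "query": "Where is it located?"}]
+print(build_query_generation_prompt(demo, "Mount Fuji is the highest peak in Japan."))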
+
+ comment: Accepted to SIGDIAL 2023 +
+
+
+
+
+ + ♻ ☆ Cost-optimal Seeding Strategy During a Botanical Pandemic in + Domesticated Fields + + +
+ Context: Botanical pandemics cause enormous economic damage and food shortages +around the globe. However, since botanical pandemics are here to stay in the +short-to-medium term, domesticated field owners can strategically seed their +fields to optimize each session's economic profit. Objective: Given the +pathogen's epidemiological properties, we aim to find an economically optimal +grid-based seeding strategy for field owners and policymakers. Methods: We +propose a novel epidemiological-economic mathematical model that describes the +economic profit from a field of plants during a botanical pandemic. We describe +the epidemiological dynamics using a spatio-temporal extended +Susceptible-Infected-Recovered epidemiological model with a non-linear output +epidemiological model. Results and Conclusions: We provide an algorithm to +obtain an optimal grid-formed seeding strategy to maximize economic profit, +given field and pathogen properties. In addition, we implement the proposed +model in realistic settings, analyzing the sensitivity of the economic profit +as a function of several epidemiological and economic properties. We show that +the recovery and basic infection rates have a similar economic influence. +Unintuitively, we show that in the context of a botanical pandemic, a larger farm +does not promise higher economic profit. Significance: Our results demonstrate +a significant benefit of using the proposed seeding strategy and shed more +light on the dynamics of the botanical pandemic in domesticated fields. +
+
+
+
+
+ + ♻ ☆ AKEM: Aligning Knowledge Base to Queries with Ensemble Model for Entity + Recognition and Linking + + +
+ This paper presents a novel approach to address the Entity Recognition and +Linking Challenge at NLPCC 2015. The task involves extracting named entity +mentions from short search queries and linking them to entities within a +reference Chinese knowledge base. To tackle this problem, we first expand the +existing knowledge base and utilize external knowledge to identify candidate +entities, thereby improving the recall rate. Next, we extract features from the +candidate entities and utilize Support Vector Regression and Multiple Additive +Regression Tree as scoring functions to filter the results. Additionally, we +apply rules to further refine the results and enhance precision. Our method is +computationally efficient and achieves an F1 score of 0.535. + +
+
+
+
+
+
+
+
+ + Machine Learning 115 + +
+
+
+ + ☆ Sight Beyond Text: Multi-Modal Training Enhances LLMs in Truthfulness + and Ethics + + +
+ Multi-modal large language models (MLLMs) are trained based on large language +models (LLM), with an enhanced capability to comprehend multi-modal inputs and +generate textual responses. While they excel in multi-modal tasks, the pure NLP +abilities of MLLMs are often underestimated and left untested. In this study, +we get out of the box and unveil an intriguing characteristic of MLLMs -- our +preliminary results suggest that visual instruction tuning, a prevailing +strategy for transitioning LLMs into MLLMs, unexpectedly and interestingly +helps models attain both improved truthfulness and ethical alignment in the +pure NLP context. For example, a visual-instruction-tuned LLaMA2 7B model +surpasses the performance of the LLaMA2-chat 7B model, fine-tuned with over one +million human annotations, on TruthfulQA-mc and Ethics benchmarks. Further +analysis reveals that the improved alignment can be attributed to the superior +instruction quality inherent to visual-text data. In releasing our code at +github.com/UCSC-VLAA/Sight-Beyond-Text, we aspire to foster further exploration +into the intrinsic value of visual-text synergies and, in a broader scope, +multi-modal interactions in alignment research. + +
+
+
+
+
+ + ☆ PILOT: A Pre-Trained Model-Based Continual Learning Toolbox + + +
+ While traditional machine learning can effectively tackle a wide range of +problems, it primarily operates within a closed-world setting, which presents +limitations when dealing with streaming data. As a solution, incremental +learning emerges to address real-world scenarios involving new data's arrival. +Recently, pre-training has made significant advancements and garnered the +attention of numerous researchers. The strong performance of these pre-trained +models (PTMs) presents a promising avenue for developing continual learning +algorithms that can effectively adapt to real-world scenarios. Consequently, +exploring the utilization of PTMs in incremental learning has become essential. +This paper introduces a pre-trained model-based continual learning toolbox +known as PILOT. On the one hand, PILOT implements some state-of-the-art +class-incremental learning algorithms based on pre-trained models, such as L2P, +DualPrompt, and CODA-Prompt. On the other hand, PILOT also fits typical +class-incremental learning algorithms (e.g., DER, FOSTER, and MEMO) within the +context of pre-trained models to evaluate their effectiveness. + +
+
+ comment: Code is available at https://github.com/sun-hailong/LAMDA-PILOT +
+
+
+
+
+ + ☆ Weakly-Supervised Multi-Task Learning for Audio-Visual Speaker + Verification + + +
+ In this paper, we present a methodology for achieving robust multimodal +person representations optimized for open-set audio-visual speaker +verification. Distance Metric Learning (DML) approaches have typically +dominated this problem space, owing to strong performance on new and unseen +classes. In our work, we explored multitask learning techniques to further +boost the performance of the DML approach and show that an auxiliary task with weak +labels can increase the compactness of the learned speaker representation. We +also extend the Generalized end-to-end loss (GE2E) to multimodal inputs and +demonstrate that it can achieve competitive performance in an audio-visual +space. Finally, we introduce a non-synchronous random audio-visual sampling +strategy during training that has been shown to improve generalization. Our +network achieves state-of-the-art performance for speaker verification, +reporting 0.244%, 0.252%, 0.441% Equal Error Rate (EER) on the three official +trial lists of VoxCeleb1-O/E/H, which are, to our knowledge, the best published +results on VoxCeleb1-E and VoxCeleb1-H. +
+
+
+
+
+ + ☆ Contrastive Deep Encoding Enables Uncertainty-aware + Machine-learning-assisted Histopathology + + +
+ Deep neural network models can learn clinically relevant features from +millions of histopathology images. However, generating high-quality annotations +to train such models for each hospital, each cancer type, and each diagnostic +task is prohibitively laborious. On the other hand, terabytes of training data +-- while lacking reliable annotations -- are readily available in the public +domain in some cases. In this work, we explore how these large datasets can be +consciously utilized to pre-train deep networks to encode informative +representations. We then fine-tune our pre-trained models on a fraction of +annotated training data to perform specific downstream tasks. We show that our +approach can reach the state-of-the-art (SOTA) for patch-level classification +with only 1-10% randomly selected annotations compared to other SOTA +approaches. Moreover, we propose an uncertainty-aware loss function to +quantify model confidence during inference. Quantified uncertainty helps +experts select the best instances to label for further training. Our +uncertainty-aware labeling reaches the SOTA with significantly fewer +annotations compared to random labeling. Last, we demonstrate how our +pre-trained encoders can surpass current SOTA for whole-slide image +classification with weak supervision. Our work lays the foundation for data- and +task-agnostic pre-trained deep networks with quantified uncertainty. +
+
+ comment: 18 pages, 8 figures +
+
+
+
+
+ + ☆ Data Augmentation via Subgroup Mixup for Improving Fairness + + +
+ In this work, we propose data augmentation via pairwise mixup across +subgroups to improve group fairness. Many real-world applications of machine +learning systems exhibit biases across certain groups due to +under-representation or training data that reflects societal biases. Inspired +by the successes of mixup for improving classification performance, we develop +a pairwise mixup scheme to augment training data and encourage fair and +accurate decision boundaries for all subgroups. Data augmentation for group +fairness allows us to add new samples of underrepresented groups to balance +subpopulations. Furthermore, our method allows us to use the generalization +ability of mixup to improve both fairness and accuracy. We compare our proposed +mixup to existing data augmentation and bias mitigation approaches on both +synthetic simulations and real-world benchmark fair classification data, +demonstrating that we are able to achieve fair outcomes with robust if not +improved accuracy. + +
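+ A minimal sketch of pairwise mixup across two subgroups (the Beta parameter, pairing rule, and toy data are assumptions, not the paper's exact scheme):
+import numpy as np
+
+rng = np.random.default_rng(0)
+
+def subgroup_mixup(X, y, groups, g_a, g_b, n_aug, alpha=0.4):
+    # Mix random pairs drawn from two different subgroups.
+    idx_a = rng.choice(np.flatnonzero(groups == g_a), size=n_aug)
+    idx_b = rng.choice(np.flatnonzero(groups == g_b), size=n_aug)
+    lam = rng.beta(alpha, alpha, size=(n_aug, 1))
+    X_new = lam * X[idx_a] + (1 - lam) * X[idx_b]
+    y_new = lam[:, 0] * y[idx_a] + (1 - lam[:, 0]) * y[idx_b]
+    return X_new, y_new
+
+X = rng.normal(size=(200, 2)); y = rng.integers(0, 2, 200).astype(float)
+g = rng.integers(0, 2, 200)  # two subgroups, 0 and 1
+X_aug, y_aug = subgroup_mixup(X, y, g, 0, 1, n_aug=50)
+print(X_aug.shape, y_aug.shape)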
+
+ comment: 5 pages, 2 figures, 1 table +
+
+
+
+
+ + ☆ Characterizing Speed Performance of Multi-Agent Reinforcement Learning + + +
+ Multi-Agent Reinforcement Learning (MARL) has achieved significant success in +large-scale AI systems and big-data applications such as smart grids, +surveillance, etc. Existing advancements in MARL algorithms focus on improving +the rewards obtained by introducing various mechanisms for inter-agent +cooperation. However, these optimizations are usually compute- and +memory-intensive, thus leading to suboptimal speed performance in end-to-end +training time. In this work, we analyze the speed performance (i.e., +latency-bounded throughput) as the key metric in MARL implementations. +Specifically, we first introduce a taxonomy of MARL algorithms from an +acceleration perspective categorized by (1) training scheme and (2) +communication method. Using our taxonomy, we identify three state-of-the-art +MARL algorithms - Multi-Agent Deep Deterministic Policy Gradient (MADDPG), +Target-oriented Multi-agent Communication and Cooperation (ToM2C), and +Networked Multi-Agent RL (NeurComm) - as target benchmark algorithms, and +provide a systematic analysis of their performance bottlenecks on a homogeneous +multi-core CPU platform. We justify the need for MARL latency-bounded +throughput to be a key performance metric in future literature while also +addressing opportunities for parallelization and acceleration. + +
+
+
+
+
+ + ☆ Mitigating Group Bias in Federated Learning for Heterogeneous Devices + + +
+ Federated Learning is emerging as a privacy-preserving model training +approach in distributed edge applications. However, most edge deployments are +heterogeneous in nature, i.e., their sensing capabilities and environments vary +across deployments. This edge heterogeneity violates the independence and +identical distribution (IID) property of local data across clients and produces +biased global models, i.e., models that contribute to unfair decision-making and +discrimination against a particular community or a group. Existing bias +mitigation techniques only focus on bias generated from label heterogeneity in +non-IID data without accounting for domain variations due to feature +heterogeneity and do not address the global group-fairness property. + Our work proposes a group-fair FL framework that minimizes group-bias while +preserving privacy and without resource utilization overhead. Our main idea is +to leverage average conditional probabilities to compute cross-domain group +\textit{importance weights} derived from heterogeneous training data to +optimize the performance of the worst-performing group using a modified +multiplicative weights update method. Additionally, we propose regularization +techniques to minimize the difference between the worst and best-performing +groups while using our thresholding mechanism to strike a balance +between bias reduction and group performance degradation. Our evaluation on +human emotion recognition and image classification benchmarks assesses the fair +decision-making of our framework in real-world heterogeneous settings. +
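+ The multiplicative-weights idea behind the worst-group optimization can be sketched as follows; the update rule, step size, and per-group losses here are generic assumptions, not the paper's exact method.
+import numpy as np
+
+def update_group_weights(weights, group_losses, eta=0.5):
+    # Groups with higher loss receive exponentially more weight,
+    # steering subsequent training toward the worst-performing group.
+    w = weights * np.exp(eta * group_losses)
+    return w / w.sum()
+
+weights = np.ones(3) / 3                    # three demographic groups
+group_losses = np.array([0.2, 0.9, 0.4])    # hypothetical per-group losses
+for _ in range(5):
+    weights = update_group_weights(weights, group_losses)
+print(weights.round(3))  # the hardest group now dominates the weighting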
+
+
+
+
+ + ☆ The Boundaries of Verifiable Accuracy, Robustness, and Generalisation in + Deep Learning + + +
+ In this work, we assess the theoretical limitations of determining guaranteed +stability and accuracy of neural networks in classification tasks. We consider +the classical distribution-agnostic framework and algorithms that minimise empirical +risk and are potentially subject to some weight regularisation. We show that +there is a large family of tasks for which computing and verifying ideal stable +and accurate neural networks in the above settings is extremely challenging, if +at all possible, even when such ideal solutions exist within the given class of +neural architectures. +
+
+
+
+
+ + ☆ Deep Quantum Graph Dreaming: Deciphering Neural Network Insights into + Quantum Experiments + + +
+ Despite their promise to facilitate new scientific discoveries, the +opaqueness of neural networks presents a challenge in interpreting the logic +behind their findings. Here, we use an eXplainable-AI (XAI) technique called +$inception$ or $deep$ $dreaming$, which was originally invented in machine learning +for computer vision. We use this technique to explore what neural networks +learn about quantum optics experiments. Our story begins by training a deep +neural network on the properties of quantum systems. Once trained, we "invert" +the neural network -- effectively asking how it imagines a quantum system with +a specific property, and how it would continuously modify the quantum system to +change a property. We find that the network can shift the initial distribution +of properties of the quantum system, and we can conceptualize the learned +strategies of the neural network. Interestingly, we find that, in the first +layers, the neural network identifies simple properties, while in the deeper +ones, it can identify complex quantum structures and even quantum entanglement. +This is reminiscent of long-understood properties known in computer vision, +which we now identify in a complex natural science task. Our approach could be +useful for developing new, more interpretable AI-based scientific +discovery techniques in quantum physics. +
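+ The "inversion" step is essentially activation maximization by gradient ascent on the input; the toy network below is a stand-in for the trained property predictor, not the authors' model.
+import torch
+
+# Stand-in for a network trained to predict a property of a quantum experiment.
+net = torch.nn.Sequential(torch.nn.Linear(8, 32), torch.nn.Tanh(),
+                          torch.nn.Linear(32, 1))
+
+x = torch.randn(1, 8, requires_grad=True)  # "imagined" system parameters
+opt = torch.optim.Adam([x], lr=0.05)
+for _ in range(200):
+    opt.zero_grad()
+    loss = -net(x).mean()   # gradient ascent on the predicted property
+    loss.backward()
+    opt.step()
+print(net(x).item())  # the input has been "dreamed" toward a higher property value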
+
+ comment: 10 pages, 6 figures. Comments welcome! +
+
+
+
+
+ + ☆ An Extreme Learning Machine-Based Method for Computational PDEs in + Higher Dimensions + + +
+ We present two effective methods for solving high-dimensional partial +differential equations (PDE) based on randomized neural networks. Motivated by +the universal approximation property of this type of networks, both methods +extend the extreme learning machine (ELM) approach from low to high dimensions. +With the first method the unknown solution field in $d$ dimensions is +represented by a randomized feed-forward neural network, in which the +hidden-layer parameters are randomly assigned and fixed while the output-layer +parameters are trained. The PDE and the boundary/initial conditions, as well as +the continuity conditions (for the local variant of the method), are enforced +on a set of random interior/boundary collocation points. The resultant linear +or nonlinear algebraic system, through its least squares solution, provides the +trained values for the network parameters. With the second method the +high-dimensional PDE problem is reformulated through a constrained expression +based on an Approximate variant of the Theory of Functional Connections +(A-TFC), which avoids the exponential growth in the number of terms of TFC as +the dimension increases. The free field function in the A-TFC constrained +expression is represented by a randomized neural network and is trained by a +procedure analogous to the first method. We present ample numerical simulations +for a number of high-dimensional linear/nonlinear stationary/dynamic PDEs to +demonstrate their performance. These methods can produce accurate solutions to +high-dimensional PDEs, in particular with their errors reaching levels not far +from the machine accuracy for relatively lower dimensions. Compared with the +physics-informed neural network (PINN) method, the current method is both +cost-effective and more accurate for high-dimensional PDEs. + +
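+ A one-dimensional toy version of the ELM idea (random, fixed tanh features; only the output weights are obtained by least squares on PDE and boundary collocation equations) is sketched below for u''(x) = f(x) with exact solution sin(pi*x); the feature scales and counts are arbitrary choices, and the authors' high-dimensional solver is considerably more involved.
+import numpy as np
+
+rng = np.random.default_rng(0)
+M = 200                                     # number of random tanh features
+W = rng.normal(scale=4.0, size=M); b = rng.normal(scale=4.0, size=M)
+
+def phi(x):                                 # feature matrix, shape (len(x), M)
+    return np.tanh(np.outer(x, W) + b)
+
+def phi_xx(x):                              # analytic second derivative of tanh features
+    t = np.tanh(np.outer(x, W) + b)
+    return (-2.0 * t * (1.0 - t**2)) * W**2
+
+xs = np.linspace(0.0, 1.0, 101)[1:-1]       # interior collocation points
+f = -np.pi**2 * np.sin(np.pi * xs)          # u'' = f, exact solution sin(pi*x)
+
+A = np.vstack([phi_xx(xs), phi(np.array([0.0, 1.0]))])  # PDE rows + boundary rows
+rhs = np.concatenate([f, [0.0, 0.0]])
+c, *_ = np.linalg.lstsq(A, rhs, rcond=None)              # train only output weights
+
+u_hat = phi(xs) @ c
+print(np.abs(u_hat - np.sin(np.pi * xs)).max())          # error vs. exact solution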
+
+ comment: 38 pages, 17 tables, 25 figures +
+
+
+
+
+ + ☆ Optimal transport distances for directed, weighted graphs: a case study + with cell-cell communication networks + + +
+ Comparing graphs by means of optimal transport has recently gained significant +attention, as the distances induced by optimal transport provide both a +principled metric between graphs and an interpretable description of the +associated changes between graphs in terms of a transport plan. As the lack of +symmetry introduces challenges in the typically considered formulations, +optimal transport distances for graphs have mostly been developed for +undirected graphs. Here, we propose two distance measures to compare directed +graphs based on variants of optimal transport: (i) an earth mover's distance +(Wasserstein) and (ii) a Gromov-Wasserstein (GW) distance. We evaluate these +two distances and discuss their relative performance for both simulated graph +data and real-world directed cell-cell communication graphs, inferred from +single-cell RNA-seq data. +
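+ With the POT library, an earth-mover's-style comparison of two graphs might look like this; the node features and uniform weights are placeholder choices, not the construction used in the paper.
+import numpy as np
+import ot  # Python Optimal Transport (POT)
+
+X1 = np.random.rand(5, 3)   # hypothetical node features of graph 1
+X2 = np.random.rand(7, 3)   # hypothetical node features of graph 2
+a = np.ones(5) / 5          # uniform node weights
+b = np.ones(7) / 7
+
+M = ot.dist(X1, X2)         # pairwise cost matrix (squared Euclidean by default)
+plan = ot.emd(a, b, M)      # optimal transport plan
+print((plan * M).sum())     # earth mover's (Wasserstein-style) cost between the graphs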
+
+ comment: 5 pages, 1 figure +
+
+
+
+
+ + ☆ Unsupervised Contrast-Consistent Ranking with Language Models + + +
+ Language models contain ranking-based knowledge and are powerful solvers of +in-context ranking tasks. For instance, they may have parametric knowledge +about the ordering of countries by size or may be able to rank reviews by +sentiment. Recent work focuses on pairwise, pointwise, and listwise prompting +techniques to elicit a language model's ranking knowledge. However, we find +that even with careful calibration and constrained decoding, prompting-based +techniques may not always be self-consistent in the rankings they produce. This +motivates us to explore an alternative approach that is inspired by an +unsupervised probing method called Contrast-Consistent Search (CCS). The idea +is to train a probing model guided by a logical constraint: a model's +representation of a statement and its negation must be mapped to contrastive +true-false poles consistently across multiple statements. We hypothesize that +similar constraints apply to ranking tasks where all items are related via +consistent pairwise or listwise comparisons. To this end, we extend the binary +CCS method to Contrast-Consistent Ranking (CCR) by adapting existing ranking +methods such as the Max-Margin Loss, Triplet Loss, and Ordinal Regression +objective. Our results confirm that, for the same language model, CCR probing +outperforms prompting and even performs on a par with prompting much larger +language models. + +
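+ One way to picture the ranking extension is a linear probe trained with a max-margin objective that pushes a statement's score above its negation's; the random tensors below stand in for frozen language-model representations, and this is a generic sketch rather than the paper's exact CCR objective.
+import torch
+
+d = 768
+pos = torch.randn(64, d)    # representations of statements
+neg = torch.randn(64, d)    # representations of their negations
+
+probe = torch.nn.Linear(d, 1)               # linear probe on frozen features
+opt = torch.optim.Adam(probe.parameters(), lr=1e-3)
+margin_loss = torch.nn.MarginRankingLoss(margin=1.0)
+
+for _ in range(100):
+    opt.zero_grad()
+    s_pos, s_neg = probe(pos).squeeze(-1), probe(neg).squeeze(-1)
+    # Push each statement's score above its negation's score by a margin,
+    # mapping statement/negation pairs to contrastive poles.
+    loss = margin_loss(s_pos, s_neg, torch.ones_like(s_pos))
+    loss.backward(); opt.step()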
+
+
+
+
+ + ☆ MASTERKEY: Practical Backdoor Attack Against Speaker Verification + Systems + + +
+ Speaker Verification (SV) is widely deployed in mobile systems to +authenticate legitimate users by using their voice traits. In this work, we +propose a backdoor attack MASTERKEY, to compromise the SV models. Different +from previous attacks, we focus on a real-world practical setting where the +attacker possesses no knowledge of the intended victim. To design MASTERKEY, we +investigate the limitation of existing poisoning attacks against unseen +targets. Then, we optimize a universal backdoor that is capable of attacking +arbitrary targets. Next, we embed the speaker's characteristics and semantics +information into the backdoor, making it imperceptible. Finally, we estimate +the channel distortion and integrate it into the backdoor. We validate our +attack on 6 popular SV models. Specifically, we poison a total of 53 models and +use our trigger to attack 16,430 enrolled speakers, composed of 310 target +speakers enrolled in 53 poisoned models. Our attack achieves 100% attack +success rate with a 15% poison rate. By decreasing the poison rate to 3%, the +attack success rate remains around 50%. We validate our attack in 3 real-world +scenarios and successfully demonstrate the attack through both over-the-air and +over-the-telephony-line scenarios. + +
+
+ comment: Accepted by Mobicom 2023 +
+
+
+
+
+ + ☆ Auto-Regressive Next-Token Predictors are Universal Learners + + +
+ Large language models display remarkable capabilities in logical and +mathematical reasoning, allowing them to solve complex tasks. Interestingly, +these abilities emerge in networks trained on the simple task of next-token +prediction. In this work, we present a theoretical framework for studying +auto-regressive next-token predictors. We demonstrate that even simple models +such as linear next-token predictors, trained on Chain-of-Thought (CoT) data, +can approximate any function efficiently computed by a Turing machine. We +introduce a new complexity measure -- length complexity -- which measures the +number of intermediate tokens in a CoT sequence required to approximate some +target function, and analyze the interplay between length complexity and other +notions of complexity. Finally, we show experimentally that simple next-token +predictors, such as linear networks and shallow Multi-Layer Perceptrons (MLPs), +display non-trivial performance on text generation and arithmetic tasks. Our +results demonstrate that the power of language models can be attributed, to a +great extent, to the auto-regressive next-token training scheme, and not +necessarily to a particular choice of architecture. + +
+
+
+
+
+ + ☆ DNNShifter: An Efficient DNN Pruning System for Edge Computing + + +
+ Deep neural networks (DNNs) underpin many machine learning applications. +Production quality DNN models achieve high inference accuracy by training +millions of DNN parameters, which has a significant resource footprint. This +presents a challenge for resources operating at the extreme edge of the +network, such as mobile and embedded devices that have limited computational +and memory resources. To address this, models are pruned to create lightweight, +more suitable variants for these devices. Existing pruning methods are unable +to produce models of similar quality to their unpruned counterparts +without significant time costs and overheads, or are limited to offline use +cases. Our work rapidly derives suitable model variants while maintaining the +accuracy of the original model. The model variants can be swapped quickly when +system and network conditions change to match workload demand. This paper +presents DNNShifter, an end-to-end DNN training, spatial pruning, and model +switching system that addresses the challenges mentioned above. At the heart of +DNNShifter is a novel methodology that prunes sparse models using structured +pruning. The pruned model variants generated by DNNShifter are smaller in size +and thus faster than dense and sparse model predecessors, making them suitable +for inference at the edge while retaining accuracy close to that of the +original dense model. DNNShifter generates a portfolio of model variants that +can be swiftly interchanged depending on operational conditions. DNNShifter +produces pruned model variants up to 93x faster than conventional training +methods. Compared to sparse models, the pruned model variants are up to 5.14x +smaller and have a 1.67x inference latency speedup, with no compromise to +sparse model accuracy. In addition, DNNShifter has up to 11.9x lower overhead +for switching models and up to 3.8x lower memory utilisation than existing +approaches. +
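+ Structured pruning of whole filters, as opposed to unstructured sparsity, is what lets a pruned variant become physically smaller and faster; a generic PyTorch illustration (not DNNShifter's pipeline) is below.
+import torch
+import torch.nn.utils.prune as prune
+
+conv = torch.nn.Conv2d(64, 128, kernel_size=3)
+
+# Remove 50% of output channels by L2 norm; whole filters disappear.
+prune.ln_structured(conv, name="weight", amount=0.5, n=2, dim=0)
+prune.remove(conv, "weight")  # bake the pruning mask into the weights
+
+kept = (conv.weight.abs().sum(dim=(1, 2, 3)) > 0).sum().item()
+print(f"{kept}/128 filters kept")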
+
+ comment: 14 pages, 7 figures, 5 tables +
+
+
+
+
+ + ☆ Setting the Right Expectations: Algorithmic Recourse Over Time + + +
+ Algorithmic systems are often called upon to assist in high-stakes decision +making. In light of this, algorithmic recourse, the principle wherein +individuals should be able to take action against an undesirable outcome made +by an algorithmic system, is receiving growing attention. The bulk of the +literature on algorithmic recourse to-date focuses primarily on how to provide +recourse to a single individual, overlooking a critical element: the effects of +a continuously changing context. Disregarding these effects on recourse is a +significant oversight, since, in almost all cases, recourse consists of an +individual making a first, unfavorable attempt, and then being given an +opportunity to make one or several attempts at a later date - when the context +might have changed. This can create false expectations, as initial recourse +recommendations may become less reliable over time due to model drift and +competition for access to the favorable outcome between individuals. + In this work we propose an agent-based simulation framework for studying the +effects of a continuously changing environment on algorithmic recourse. In +particular, we identify two main effects that can alter the reliability of +recourse for individuals represented by the agents: (1) competition with other +agents acting upon recourse, and (2) competition with new agents entering the +environment. Our findings highlight that only a small set of specific +parameterizations result in algorithmic recourse that is reliable for agents +over time. Consequently, we argue that substantial additional work is needed to +understand recourse reliability over time, and to develop recourse methods that +reward agents' effort. + +
+
+
+
+
+ + ☆ Implicit Neural Multiple Description for DNA-based data storage + + +
+ DNA exhibits remarkable potential as a data storage solution due to its +impressive storage density and long-term stability, stemming from its inherent +biomolecular structure. However, developing this novel medium comes with its +own set of challenges, particularly in addressing errors arising from storage +and biological manipulations. These challenges are further conditioned by the +structural constraints of DNA sequences and cost considerations. In response to +these limitations, we have pioneered a novel compression scheme and a +cutting-edge Multiple Description Coding (MDC) technique utilizing neural +networks for DNA data storage. Our MDC method introduces an innovative approach +to encoding data into DNA, specifically designed to withstand errors +effectively. Notably, our new compression scheme outperforms classic image +compression methods for DNA data storage. Furthermore, our approach exhibits +superiority over conventional MDC methods reliant on auto-encoders. Its +distinctive strengths lie in its ability to bypass the need for extensive model +training and its enhanced adaptability for fine-tuning redundancy levels. +Experimental results demonstrate that our solution competes favorably with the +latest DNA data storage methods in the field, offering superior compression +rates and robust noise resilience. +
+
+ comment: Xavier Pic and Trung Hieu Le are both equal contributors and primary + authors +
+
+
+
+
+ + ☆ Effect of hyperparameters on variable selection in random forests + + +
+ Random forests (RFs) are well suited for prediction modeling and variable +selection in high-dimensional omics studies. The effect of hyperparameters of +the RF algorithm on prediction performance and variable importance estimation +has previously been investigated. However, how hyperparameters impact RF-based +variable selection remains unclear. We evaluate their effects on the Vita and the +Boruta variable selection procedures based on two simulation studies utilizing +theoretical distributions and empirical gene expression data. We assess the +ability of the procedures to select important variables (sensitivity) while +controlling the false discovery rate (FDR). Our results show that the +proportion of splitting candidate variables (mtry.prop) and the sample fraction +(sample.fraction) for the training dataset influence the selection procedures +more than the drawing strategy of the training datasets and the minimal +terminal node size. A suitable setting of the RF hyperparameters depends on the +correlation structure in the data. For weakly correlated predictor variables, +the default value of mtry is optimal, but smaller values of sample.fraction +result in larger sensitivity. In contrast, the difference in sensitivity of the +optimal compared to the default value of sample.fraction is negligible for +strongly correlated predictor variables, whereas smaller values than the +default are better in the other settings. In conclusion, the default values of +the hyperparameters will not always be suitable for identifying important +variables. Thus, adequate values differ depending on whether the aim of the +study is optimizing prediction performance or variable selection. +
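+ In scikit-learn terms, the two influential hyperparameters correspond roughly to max_features (mtry.prop) and max_samples (sample.fraction); the sketch below only illustrates varying them on synthetic data and does not reproduce the Vita or Boruta selection procedures.
+from sklearn.datasets import make_classification
+from sklearn.ensemble import RandomForestClassifier
+
+X, y = make_classification(n_samples=300, n_features=500, n_informative=10,
+                           random_state=1)
+
+for max_features, max_samples in [(0.1, 0.5), ("sqrt", None)]:
+    rf = RandomForestClassifier(n_estimators=500, max_features=max_features,
+                                max_samples=max_samples, bootstrap=True,
+                                random_state=1).fit(X, y)
+    top = rf.feature_importances_.argsort()[::-1][:10]  # candidate "important" variables
+    print(max_features, max_samples, sorted(top))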
+
+ comment: 18 pages, 2 figures + 2 figures in appendix, 3 tables +
+
+
+
+
+ + ☆ Collectionless Artificial Intelligence + + +
+ By and large, the professional handling of huge data collections is regarded +as a fundamental ingredient of the progress of machine learning and of its +spectacular results in related disciplines, with a growing agreement on risks +connected to the centralization of such data collections. This paper sustains +the position that the time has come for thinking of new learning protocols +where machines conquer cognitive skills in a truly human-like context centered +on environmental interactions. This comes with specific restrictions on the +learning protocol according to the collectionless principle, which states that, +at each time instant, data acquired from the environment is processed with the +purpose of contributing to updating the current internal representation of the +environment, and that the agent is not given the privilege of recording the +temporal stream. Basically, there is no permission to store the temporal +information coming from the sensors, thus promoting the development of +self-organized memorization skills at a more abstract level, instead of relying +on bare storage to simulate learning dynamics that are typical of offline +learning algorithms. This purposely extreme position is intended to stimulate +the development of machines that learn to dynamically organize the information +by following human-based schemes. The proposition of this challenge suggests +developing new foundations on computational processes of learning and reasoning +that might open the doors to a truly orthogonal competitive track on AI +technologies that avoid data accumulation by design, thus offering a framework +which is better suited with regard to privacy issues, control, and customizability. +Finally, pushing towards massively distributed computation, the collectionless +approach to AI will likely reduce the concentration of power in companies and +governments, thus better facing geopolitical issues. +
+
+
+
+
+ + ☆ Modeling Dislocation Dynamics Data Using Semantic Web Technologies + + +
+ Research in the field of Materials Science and Engineering focuses on the +design, synthesis, properties, and performance of materials. An important class +of materials that is widely investigated is crystalline materials, including +metals and semiconductors. Crystalline materials typically contain a distinct +type of defect called "dislocation". This defect significantly affects various +material properties, including strength, fracture toughness, and ductility. +Researchers have devoted significant effort in recent years to understanding +dislocation behavior through experimental characterization techniques and +simulations, e.g., dislocation dynamics simulations. This paper presents how +data from dislocation dynamics simulations can be modeled using semantic web +technologies by annotating the data with ontologies. We extend the already +existing Dislocation Ontology by adding missing concepts and aligning it with +two other domain-related ontologies (i.e., the Elementary Multi-perspective +Material Ontology and the Materials Design Ontology), allowing the dislocation +simulation data to be represented efficiently. Moreover, we show a real-world use +case by representing the discrete dislocation dynamics data as a knowledge +graph (DisLocKG) that illustrates the relationships within the data. We also +developed a SPARQL endpoint that offers extensive flexibility for querying +DisLocKG. +
+
+
+
+
+ + ☆ Investigating the Impact of Action Representations in Policy Gradient + Algorithms ICRA 2023 + + +
+ Reinforcement learning~(RL) is a versatile framework for learning to solve +complex real-world tasks. However, influences on the learning performance of RL +algorithms are often poorly understood in practice. We discuss different +analysis techniques and assess their effectiveness for investigating the impact +of action representations in RL. Our experiments demonstrate that the action +representation can significantly influence the learning performance on popular +RL benchmark tasks. The analysis results indicate that some of the performance +differences can be attributed to changes in the complexity of the optimization +landscape. Finally, we discuss open challenges of analysis techniques for RL +algorithms. + +
+
+ comment: Published at the Workshop on effective Representations, Abstractions, + and Priors for Robot Learning (RAP4Robots) at ICRA 2023 +
+
+
+
+
+ + ☆ Continual Learning with Dirichlet Generative-based Rehearsal + + +
+ Recent advancements in data-driven task-oriented dialogue systems (ToDs) +struggle with incremental learning due to computational constraints and +time-consuming issues. Continual Learning (CL) attempts to solve this by +avoiding intensive pre-training, but it faces the problem of catastrophic +forgetting (CF). While generative-based rehearsal CL methods have made +significant strides, generating pseudo samples that accurately reflect the +underlying task-specific distribution is still a challenge. In this paper, we +present Dirichlet Continual Learning (DCL), a novel generative-based rehearsal +strategy for CL. Unlike the traditionally used Gaussian latent variable in the +Conditional Variational Autoencoder (CVAE), DCL leverages the flexibility and +versatility of the Dirichlet distribution to model the latent prior variable. +This enables it to efficiently capture sentence-level features of previous +tasks and effectively guide the generation of pseudo samples. In addition, we +introduce Jensen-Shannon Knowledge Distillation (JSKD), a robust logit-based +knowledge distillation method that enhances knowledge transfer during pseudo +sample generation. Our experiments confirm the efficacy of our approach in both +intent detection and slot-filling tasks, outperforming state-of-the-art +methods. + +
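+ The Jensen-Shannon flavor of distillation can be sketched as a symmetric alternative to the usual KL term; this is the generic JS divergence between student and teacher distributions, not necessarily the paper's exact JSKD formulation.
+import torch
+import torch.nn.functional as F
+
+def js_distillation_loss(student_logits, teacher_logits):
+    # Jensen-Shannon divergence between student and teacher output distributions.
+    p = F.softmax(student_logits, dim=-1)
+    q = F.softmax(teacher_logits, dim=-1)
+    m = 0.5 * (p + q)
+    kl = lambda a, b: (a * (a.clamp_min(1e-8).log() - b.clamp_min(1e-8).log())).sum(-1)
+    return (0.5 * kl(p, m) + 0.5 * kl(q, m)).mean()
+
+s, t = torch.randn(4, 10), torch.randn(4, 10)
+print(js_distillation_loss(s, t))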
+
+
+
+
+ + ☆ Towards the TopMost: A Topic Modeling System Toolkit + + +
+ Topic models have been proposed for decades with various applications and +recently refreshed by neural variational inference. However, these topic +models adopt entirely distinct datasets, implementations, and evaluation settings, +which hinders their quick utilization and fair comparison, and greatly +slows the research progress of topic models. To address these issues, in this +paper we propose a Topic Modeling System Toolkit (TopMost). Compared to +existing toolkits, TopMost stands out by covering a wider range of topic +modeling scenarios, including complete lifecycles with dataset pre-processing, +model training, testing, and evaluation. The highly cohesive and decoupled +modular design of TopMost enables quick utilization, fair comparisons, and +flexible extensions of different topic models. This can facilitate the research +and applications of topic models. Our code, tutorials, and documentation are +available at https://github.com/bobxwu/topmost. +
+
+
+
+
+ + ☆ Domain-Aware Augmentations for Unsupervised Online General Continual + Learning BMVC'23 + + +
+ Continual Learning has been challenging, especially when dealing with +unsupervised scenarios such as Unsupervised Online General Continual Learning +(UOGCL), where the learning agent has no prior knowledge of class boundaries or +task change information. While previous research has focused on reducing +forgetting in supervised setups, recent studies have shown that self-supervised +learners are more resilient to forgetting. This paper proposes a novel approach +that enhances memory usage for contrastive learning in UOGCL by defining and +using stream-dependent data augmentations together with some implementation +tricks. Our proposed method is simple yet effective, achieves state-of-the-art +results compared to other unsupervised approaches in all considered setups, and +reduces the gap between supervised and unsupervised continual learning. Our +domain-aware augmentation procedure can be adapted to other replay-based +methods, making it a promising strategy for continual learning. + +
+
+ comment: Accepted to BMVC'23 +
+
+
+
+
+ + ☆ MagiCapture: High-Resolution Multi-Concept Portrait Customization + + +
+ Large-scale text-to-image models including Stable Diffusion are capable of +generating high-fidelity photorealistic portrait images. There is an active +research area dedicated to personalizing these models, aiming to synthesize +specific subjects or styles using provided sets of reference images. However, +despite the plausible results from these personalization methods, they tend to +produce images that often fall short of realism and are not yet on a +commercially viable level. This is particularly noticeable in portrait image +generation, where any unnatural artifact in human faces is easily discernible +due to our inherent human bias. To address this, we introduce MagiCapture, a +personalization method for integrating subject and style concepts to generate +high-resolution portrait images using just a few subject and style references. +For instance, given a handful of random selfies, our fine-tuned model can +generate high-quality portrait images in specific styles, such as passport or +profile photos. The main challenge with this task is the absence of ground +truth for the composed concepts, leading to a reduction in the quality of the +final output and an identity shift of the source subject. To address these +issues, we present a novel Attention Refocusing loss coupled with auxiliary +priors, both of which facilitate robust learning within this weakly supervised +learning setting. Our pipeline also includes additional post-processing steps +to ensure the creation of highly realistic outputs. MagiCapture outperforms +other baselines in both quantitative and qualitative evaluations and can also +be generalized to other non-human objects. + +
+
+ comment: 8 pages, 7 figures +
+
+
+
+
+ + ☆ Keep It SimPool: Who Said Supervised Transformers Suffer from Attention + Deficit? ICCV 2023 + + +
+ Convolutional networks and vision transformers have different forms of +pairwise interactions, pooling across layers and pooling at the end of the +network. Does the latter really need to be different? As a by-product of +pooling, vision transformers provide spatial attention for free, but this is +most often of low quality unless self-supervised, which is not well studied. Is +supervision really the problem? + In this work, we develop a generic pooling framework and then we formulate a +number of existing methods as instantiations. By discussing the properties of +each group of methods, we derive SimPool, a simple attention-based pooling +mechanism as a replacement of the default one for both convolutional and +transformer encoders. We find that, whether supervised or self-supervised, this +improves performance on pre-training and downstream tasks and provides +attention maps delineating object boundaries in all cases. One could thus call +SimPool universal. To our knowledge, we are the first to obtain attention maps +in supervised transformers of at least as good quality as self-supervised, +without explicit losses or modifying the architecture. Code at: +https://github.com/billpsomas/simpool. + +
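+ The flavor of attention-based pooling can be sketched with a mean-token query attending over patch tokens; this is a generic illustration of the idea, and SimPool's actual formulation differs in its details.
+import torch
+
+def simple_attention_pool(tokens):
+    # tokens: (B, N, D) patch tokens; use the mean token as a query and
+    # return an attention-weighted sum plus the attention map.
+    q = tokens.mean(dim=1, keepdim=True)                    # (B, 1, D)
+    attn = torch.softmax(q @ tokens.transpose(1, 2) / tokens.shape[-1] ** 0.5, dim=-1)
+    return (attn @ tokens).squeeze(1), attn.squeeze(1)      # (B, D), (B, N)
+
+x = torch.randn(2, 196, 384)  # e.g. ViT-S patch tokens
+pooled, attn_map = simple_attention_pool(x)
+print(pooled.shape, attn_map.shape)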
+
+ comment: ICCV 2023. Code and models: https://github.com/billpsomas/simpool +
+
+
+
+
+ + ☆ ProMap: Datasets for Product Mapping in E-commerce + + +
+ The goal of product mapping is to decide whether two listings from two +different e-shops describe the same products. Existing datasets of matching and +non-matching pairs of products, however, often suffer from incomplete product +information or contain only very distant non-matching products. Therefore, +while predictive models trained on these datasets achieve good results on them, +in practice, they are unusable as they cannot distinguish very similar but +non-matching pairs of products. This paper introduces two new datasets for +product mapping: ProMapCz consisting of 1,495 Czech product pairs and ProMapEn +consisting of 1,555 English product pairs of matching and non-matching products +manually scraped from two pairs of e-shops. The datasets contain both images +and textual descriptions of the products, including their specifications, +making them one of the most complete datasets for product mapping. +Additionally, the non-matching products were selected in two phases, creating +two types of non-matches -- close non-matches and medium non-matches. Even the +medium non-matches are pairs of products that are much more similar than +non-matches in other datasets -- for example, they still need to have the same +brand and similar name and price. After simple data preprocessing, several +machine learning algorithms were trained on these and two other datasets to +demonstrate the complexity and completeness of the ProMap datasets. The ProMap datasets +are presented as a gold standard for further research on product mapping, +filling the gaps in existing datasets. +
+
+
+
+
+ + ☆ Dynamic control of self-assembly of quasicrystalline structures through + reinforcement learning + + +
+ We propose reinforcement learning to control the dynamical self-assembly of +the dodecagonal quasicrystal (DDQC) from patchy particles. The patchy particles +have anisotropic interactions with other particles and form DDQC. However, +their structures at steady states are significantly influenced by the kinetic +pathways of their structural formation. We estimate the best policy of +temperature control trained by the Q-learning method and demonstrate that we +can generate DDQC with few defects using the estimated policy. The temperature +schedule obtained by reinforcement learning can reproduce the desired structure +more efficiently than the conventional pre-fixed temperature schedule, such as +annealing. To clarify the success of the learning, we also analyse a simple +model describing the kinetics of structural changes through the motion in a +triple-well potential. We have found that reinforcement learning autonomously +discovers the critical temperature at which structural fluctuations enhance the +chance of forming a globally stable state. The estimated policy guides the +system toward the critical temperature to assist the formation of DDQC. + +
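+ The reinforcement-learning component can be pictured as tabular Q-learning over coarse structural states and temperature moves; the toy environment below is an invented stand-in, not the patchy-particle simulation.
+import numpy as np
+
+rng = np.random.default_rng(0)
+n_states, n_actions = 10, 3          # coarse order-parameter bins x temperature moves
+Q = np.zeros((n_states, n_actions))
+alpha, gamma, eps = 0.1, 0.95, 0.1
+
+def step(s, a):
+    # Hypothetical dynamics: reward only in the fully ordered state.
+    s_next = int(np.clip(s + (a - 1) + rng.integers(-1, 2), 0, n_states - 1))
+    return s_next, float(s_next == n_states - 1)
+
+s = 0
+for _ in range(5000):
+    a = rng.integers(n_actions) if rng.random() < eps else int(Q[s].argmax())
+    s_next, r = step(s, a)
+    Q[s, a] += alpha * (r + gamma * Q[s_next].max() - Q[s, a])  # Q-learning update
+    s = s_next
+print(Q.argmax(axis=1))  # greedy temperature move per state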
+
+ comment: 10 pages, 11 figures +
+
+
+
+
+ + ☆ Supervised Machine Learning and Physics based Machine Learning approach + for prediction of peak temperature distribution in Additive Friction Stir + Deposition of Aluminium Alloy + + +
+ Additive friction stir deposition (AFSD) is a novel solid-state additive +manufacturing technique that circumvents issues of porosity, cracking, and +properties anisotropy that plague traditional powder bed fusion and directed +energy deposition approaches. However, correlations between process parameters, +thermal profiles, and resulting microstructure in AFSD remain poorly +understood. This hinders process optimization for properties. This work employs +a cutting-edge framework combining supervised machine learning (SML) and +physics-informed neural networks (PINNs) to predict peak temperature +distribution in AFSD from process parameters. Eight regression algorithms were +implemented for SML modeling, while four PINNs leveraged governing equations +for transport, wave propagation, heat transfer, and quantum mechanics. Across +multiple statistical measures, ensemble techniques like gradient boosting +proved superior for SML, with lowest MSE of 165.78. The integrated ML approach +was also applied to classify deposition quality from process factors, with +logistic regression delivering robust accuracy. By fusing data-driven learning +and fundamental physics, this dual methodology provides comprehensive insights +into tailoring microstructure through thermal management in AFSD. The work +demonstrates the power of bridging statistical and physics-based modeling for +elucidating AM process-property relationships. + +
+
+
+
+
+ + ☆ Safe Reinforcement Learning with Dual Robustness + + +
+ Reinforcement learning (RL) agents are vulnerable to adversarial +disturbances, which can deteriorate task performance or compromise safety +specifications. Existing methods either address safety requirements under the +assumption of no adversary (e.g., safe RL) or only focus on robustness against +performance adversaries (e.g., robust RL). Learning one policy that is both +safe and robust remains a challenging open problem. The difficulty is how to +tackle two intertwined aspects in the worst cases: feasibility and optimality. +Optimality is only valid inside a feasible region, while identification of +maximal feasible region must rely on learning the optimal policy. To address +this issue, we propose a systematic framework to unify safe RL and robust RL, +including problem formulation, iteration scheme, convergence analysis and +practical algorithm design. This unification is built upon constrained +two-player zero-sum Markov games. A dual policy iteration scheme is proposed, +which simultaneously optimizes a task policy and a safety policy. The +convergence of this iteration scheme is proved. Furthermore, we design a deep +RL algorithm for practical implementation, called dually robust actor-critic +(DRAC). The evaluations with safety-critical benchmarks demonstrate that DRAC +achieves high performance and persistent safety under all scenarios (no +adversary, safety adversary, performance adversary), outperforming all +baselines significantly. + +
+
+
+
+
+ + ☆ UniBrain: Universal Brain MRI Diagnosis with Hierarchical + Knowledge-enhanced Pre-training + + +
+ Magnetic resonance imaging~(MRI) has played a crucial role in brain disease +diagnosis, for which a range of computer-aided artificial intelligence methods +have been proposed. However, early explorations usually focus on a +limited set of brain diseases in a single study and train models on +small-scale data, creating a generalization bottleneck. Towards a more +effective and scalable paradigm, we propose a hierarchical knowledge-enhanced +pre-training framework for universal brain MRI diagnosis, termed +UniBrain. Specifically, UniBrain leverages a large-scale dataset of 24,770 +imaging-report pairs from routine diagnostics. Different from previous +pre-training techniques that target unitary visual or textual features, or rely on +brute-force alignment between vision and language, we leverage the +unique characteristics of report information at different granularities to build a +hierarchical alignment mechanism, which improves the efficiency of feature +learning. Our UniBrain is validated on three real-world datasets with severe +class imbalance and the public BraTS2019 dataset. It not only consistently +outperforms all state-of-the-art diagnostic methods by a large margin and +provides superior grounding performance, but also performs comparably to +expert radiologists on certain disease types. +
+
+
+
+
+ + ☆ Comparative Analysis of Contextual Relation Extraction based on Deep + Learning Models + + +
+ Contextual Relation Extraction (CRE) is mainly used for constructing a knowledge graph with the help of an ontology. It supports tasks such as semantic search, query answering, and textual entailment. Relation extraction identifies entities in raw text and the relations among them. An efficient and accurate CRE system is essential for creating domain knowledge in the biomedical industry. Existing Machine Learning and Natural Language Processing (NLP) techniques are not well suited to efficiently predicting complex relations from sentences that contain more than two relations and unspecified entities. In this work, deep learning techniques have been used to identify the appropriate semantic relation based on the context from multiple sentences. Even though various machine learning models have been used for relation extraction, they provide better results only for binary relations, i.e., relations that occur between exactly two entities in a sentence. Machine learning models are also not suited to complex sentences containing words with multiple meanings. To address these issues, hybrid deep learning models have been used to extract relations from complex sentences effectively. This paper presents a comparative analysis of the various deep learning models used for relation extraction. + +
+
+
 comment: This paper was presented at the International Conference on FOSS Approaches towards Computational Intelligence and Language Technology, February 2023, Thiruvananthapuram
+
+
+
+
+ + ☆ FedDIP: Federated Learning with Extreme Dynamic Pruning and Incremental + Regularization ICDM 2023 + + +
+ Federated Learning (FL) has been successfully adopted for distributed +training and inference of large-scale Deep Neural Networks (DNNs). However, +DNNs are characterized by an extremely large number of parameters, thus, +yielding significant challenges in exchanging these parameters among +distributed nodes and managing the memory. Although recent DNN compression +methods (e.g., sparsification, pruning) tackle such challenges, they do not +holistically consider an adaptively controlled reduction of parameter exchange +while maintaining high accuracy levels. We, therefore, contribute with a novel +FL framework (coined FedDIP), which combines (i) dynamic model pruning with +error feedback to eliminate redundant information exchange, which contributes +to significant performance improvement, with (ii) incremental regularization +that can achieve \textit{extreme} sparsity of models. We provide convergence +analysis of FedDIP and report on a comprehensive performance and comparative +assessment against state-of-the-art methods using benchmark data sets and DNN +models. Our results showcase that FedDIP not only controls the model sparsity +but efficiently achieves similar or better performance compared to other model +pruning methods adopting incremental regularization during distributed model +training. The code is available at: https://github.com/EricLoong/feddip. + +
+
+ comment: Accepted for publication at ICDM 2023 (Full version in arxiv). The + associated code is available at https://github.com/EricLoong/feddip +
+
+
+
+
+ + ☆ Uncertainty-aware Traffic Prediction under Missing Data ICDM + + +
+ Traffic prediction is a crucial topic because of its broad range of applications in the transportation domain. Recently, various studies have achieved promising results. However, most studies assume that prediction locations have complete, or at least partial, historical records, and so they cannot be extended to locations without historical records. In real-life scenarios, sensor deployment could be limited due to budget constraints and installation availability, which makes most current models inapplicable. Although a few studies have tried to impute traffic states at missing locations, these methods need data observed simultaneously at locations with sensors, making them unsuitable for prediction tasks. Another drawback is the lack of uncertainty measurement in prediction, making prior works unsuitable for risk-sensitive tasks or those involving decision-making. To fill this gap, inspired by previous inductive graph neural networks, this work proposes an uncertainty-aware framework with the ability to 1) extend prediction to missing locations with no historical records, significantly expanding the spatial coverage of prediction while reducing sensor deployment, and 2) generate probabilistic predictions with uncertainty quantification to help manage risk and decision-making in downstream tasks. Through extensive experiments on real-life datasets, the results show that our method achieves promising performance on prediction tasks, and that the uncertainty quantification gives consistent results that correlate strongly with whether a location has historical data. We also show that our model can support sensor deployment in the transportation field, achieving higher accuracy with a limited sensor deployment budget. + +
+
+ comment: 11 pages, 3 figures, Accepted as a short paper of IEEE International + Conference on Data Mining (ICDM) 2023 +
+
+
+
+
+ + ☆ Cognitive Mirage: A Review of Hallucinations in Large Language Models + + +
+ As large language models continue to develop in the field of AI, text generation systems are susceptible to a worrisome phenomenon known as hallucination. In this study, we summarize recent compelling insights into hallucinations in LLMs. We present a novel taxonomy of hallucinations across various text generation tasks, and provide theoretical insights, detection methods, and improvement approaches. Based on this, future research directions are proposed. Our contributions are threefold: (1) We provide a detailed and complete taxonomy for hallucinations appearing in text generation tasks; (2) We provide theoretical analyses of hallucinations in LLMs and summarize existing detection and improvement methods; (3) We propose several research directions that can be developed in the future. As hallucinations garner significant attention from the community, we will maintain updates on relevant research progress. + +
+
+ comment: work in progress; 21 pages +
+
+
+
+
+ + ☆ Electricity Demand Forecasting through Natural Language Processing with + Long Short-Term Memory Networks + + +
+ Electricity demand forecasting is a well-established research field. Usually, this task is performed using historical loads, weather forecasts, calendar information, and known major events. Recently, attention has been given to the possible use of new sources of information from textual news in order to improve the performance of these predictions. This paper proposes a Long Short-Term Memory (LSTM) network incorporating textual news features that successfully predicts UK national electricity demand in both deterministic and probabilistic settings. The study finds that public sentiment and word vector representations related to transport and geopolitics have time-continuity effects on electricity demand. The experimental results show that the LSTM with textual features improves by more than 3% compared to the pure LSTM benchmark and by close to 10% over the official benchmark. Furthermore, the proposed model effectively reduces forecasting uncertainty by narrowing the confidence interval and bringing the forecast distribution closer to the truth. + +
+
+ comment: 5 pages, 3 figures, 2023 IEEE PES Innovative Smart Grid Technologies + Conference Europe (ISGT-Europe) +
+
+
+
+
+ + ☆ Scalable neural network models and terascale datasets for particle-flow + reconstruction + + +
+ We study scalable machine learning models for full event reconstruction in +high-energy electron-positron collisions based on a highly granular detector +simulation. Particle-flow (PF) reconstruction can be formulated as a supervised +learning task using tracks and calorimeter clusters or hits. We compare a graph +neural network and kernel-based transformer and demonstrate that both avoid +quadratic memory allocation and computational cost while achieving realistic PF +reconstruction. We show that hyperparameter tuning on a supercomputer +significantly improves the physics performance of the models. We also +demonstrate that the resulting model is highly portable across hardware +processors, supporting Nvidia, AMD, and Intel Habana cards. Finally, we +demonstrate that the model can be trained on highly granular inputs consisting +of tracks and calorimeter hits, resulting in a competitive physics performance +with the baseline. Datasets and software to reproduce the studies are published +following the findable, accessible, interoperable, and reusable (FAIR) +principles. + +
+
+ comment: 19 pages, 7 figures +
+
+
+
+
+ + ☆ Fundamental Limits of Deep Learning-Based Binary Classifiers Trained + with Hinge Loss + + +
+ Although deep learning (DL) has led to several breakthroughs in many +disciplines as diverse as chemistry, computer science, electrical engineering, +mathematics, medicine, neuroscience, and physics, a comprehensive understanding +of why and how DL is empirically successful remains fundamentally elusive. To +attack this fundamental problem and unravel the mysteries behind DL's empirical +successes, significant innovations toward a unified theory of DL have been +made. These innovations encompass nearly fundamental advances in optimization, +generalization, and approximation. Despite these advances, however, no work to +date has offered a way to quantify the testing performance of a DL-based +algorithm employed to solve a pattern classification problem. To overcome this +fundamental challenge in part, this paper exposes the fundamental testing +performance limits of DL-based binary classifiers trained with hinge loss. For +binary classifiers that are based on deep rectified linear unit (ReLU) +feedforward neural networks (FNNs) and ones that are based on deep FNNs with +ReLU and Tanh activation, we derive their respective novel asymptotic testing +performance limits. The derived testing performance limits are validated by +extensive computer experiments. + +
+
+
+
+
+ + ☆ MTD: Multi-Timestep Detector for Delayed Streaming Perception + + +
+ Autonomous driving systems require real-time environmental perception to ensure user safety and experience. Streaming perception is the task of reporting the current state of the world, and it is used to evaluate the delay and accuracy of autonomous driving systems. In real-world applications, factors such as hardware limitations and high temperatures inevitably cause delays in autonomous driving systems, resulting in an offset between the model output and the world state. To solve this problem, this paper proposes the Multi-Timestep Detector (MTD), an end-to-end detector which uses dynamic routing for multi-branch future prediction, giving the model the ability to resist delay fluctuations. A Delay Analysis Module (DAM) is proposed to optimize the existing delay sensing method, continuously monitoring the model inference stack and calculating the delay trend. Moreover, a novel Timestep Branch Module (TBM) is constructed, which includes static flow and adaptive flow to adaptively predict specific timesteps according to the delay trend. The proposed method has been evaluated on the Argoverse-HD dataset, and the experimental results show that it achieves state-of-the-art performance across various delay settings. + +
+
+ comment: 12 pages, accepted by PRCV 2023 (The 6th Chinese Conference on + Pattern Recognition and Computer Vision) +
+
+
+
+
+ + ☆ MCNS: Mining Causal Natural Structures Inside Time Series via A Novel + Internal Causality Scheme + + +
+ Causal inference permits us to discover covert relationships among various variables in time series. However, in most existing works, the variables in question are the dimensions of the series. The causality between dimensions can be superficial, which hinders the comprehension of internal relationships and limits the benefit of the causal graph to neural networks (NNs). In this paper, we find that causality exists not only outside but also inside the time series, because it reflects a succession of events in the real world. This inspires us to seek relationships between internal subsequences. However, the challenges lie in discovering causality from subsequences and in utilizing the causal natural structures to improve NNs. To address these challenges, we propose a novel framework called Mining Causal Natural Structure (MCNS), which is automatic and domain-agnostic and helps to find the causal natural structures inside time series via an internal causality scheme. We evaluate the MCNS framework, and NNs infused with MCNS, on time series classification tasks. Experimental results illustrate that this infusion, by refining attention, shape-selection classification, and dataset pruning, drives the NN, and even the data itself, towards better accuracy and interpretability. In addition, MCNS provides an in-depth, solid summary of the time series and datasets. + +
+
+ comment: 9 pages, 6 figures +
+
+
+
+
+ + ☆ Deep Nonparametric Convexified Filtering for Computational Photography, + Image Synthesis and Adversarial Defense + + +
+ We aim to provide a general framework for computational photography that recovers the real scene from imperfect images, via Deep Nonparametric Convexified Filtering (DNCF). It consists of a nonparametric deep network that resembles the physical equations behind image formation, such as denoising, super-resolution, inpainting, and flash. DNCF has no parameterization dependent on training data and therefore has strong generalization and robustness to adversarial image manipulation. During inference, we also encourage the network parameters to be nonnegative and create a bi-convex function of the input and parameters, which admits second-order optimization algorithms under limited running time, yielding a 10X acceleration over Deep Image Prior. With these tools, we empirically verify its capability to defend image classification deep networks against adversarial attack algorithms in real time. + +
+
+
+
+
+ + ☆ Bias Amplification Enhances Minority Group Performance + + +
+ Neural networks produced by standard training are known to suffer from poor accuracy on rare subgroups despite achieving high accuracy on average, due to the correlations between certain spurious features and labels. Previous approaches based on worst-group loss minimization (e.g., Group-DRO) are effective in improving worst-group accuracy but require expensive group annotations for all the training samples. In this paper, we focus on the more challenging and realistic setting where group annotations are only available on a small validation set or are not available at all. We propose BAM, a novel two-stage training algorithm: in the first stage, the model is trained using a bias amplification scheme via introducing a learnable auxiliary variable for each training sample; in the second stage, we upweight the samples that the bias-amplified model misclassifies, and then continue training the same model on the reweighted dataset. Empirically, BAM achieves competitive performance compared with existing methods evaluated on spurious correlation benchmarks in computer vision and natural language processing. Moreover, we find a simple stopping criterion based on the minimum class accuracy difference that can remove the need for group annotations, with little or no loss in worst-group accuracy. We perform extensive analyses and ablations to verify the effectiveness and robustness of our algorithm under varying class and group imbalance ratios. + +
+
+ comment: 21 pages, 14 figures +
+
+
+
+
+ + ☆ Crystal structure prediction using neural network potential and + age-fitness Pareto genetic algorithm + + +
+ While crystal structure prediction (CSP) remains a longstanding challenge, we introduce ParetoCSP, a novel algorithm for CSP, which combines a multi-objective genetic algorithm (MOGA) with a neural network inter-atomic potential (IAP) model to find energetically optimal crystal structures given chemical compositions. We enhance the NSGA-III algorithm by incorporating the genotypic age as an independent optimization criterion and employ the M3GNet universal IAP to guide the GA search. Compared to GN-OA, a state-of-the-art neural-potential-based CSP algorithm, ParetoCSP demonstrated significantly better predictive capabilities, outperforming it by a factor of $2.562$ across $55$ diverse benchmark structures, as evaluated by seven performance metrics. Trajectory analysis of the traversed structures of all algorithms shows that ParetoCSP generated more valid structures than the other algorithms, which helped guide the GA to search more effectively for the optimal structures. + +
+
+
+
+
+ + ☆ Predicting Fatigue Crack Growth via Path Slicing and Re-Weighting + + +
+ Predicting potential risks associated with the fatigue of key structural +components is crucial in engineering design. However, fatigue often involves +entangled complexities of material microstructures and service conditions, +making diagnosis and prognosis of fatigue damage challenging. We report a +statistical learning framework to predict the growth of fatigue cracks and the +life-to-failure of the components under loading conditions with uncertainties. +Digital libraries of fatigue crack patterns and the remaining life are +constructed by high-fidelity physical simulations. Dimensionality reduction and +neural network architectures are then used to learn the history dependence and +nonlinearity of fatigue crack growth. Path-slicing and re-weighting techniques +are introduced to handle the statistical noises and rare events. The predicted +fatigue crack patterns are self-updated and self-corrected by the evolving +crack patterns. The end-to-end approach is validated by representative examples +with fatigue cracks in plates, which showcase the digital-twin scenario in +real-time structural health monitoring and fatigue life prediction for +maintenance management decision-making. + +
+
+
+
+
+ + ☆ VLSlice: Interactive Vision-and-Language Slice Discovery ICCV 2023 + + +
+ Recent work in vision-and-language demonstrates that large-scale pretraining +can learn generalizable models that are efficiently transferable to downstream +tasks. While this may improve dataset-scale aggregate metrics, analyzing +performance around hand-crafted subgroups targeting specific bias dimensions +reveals systemic undesirable behaviors. However, this subgroup analysis is +frequently stalled by annotation efforts, which require extensive time and +resources to collect the necessary data. Prior art attempts to automatically +discover subgroups to circumvent these constraints but typically leverages +model behavior on existing task-specific annotations and rapidly degrades on +more complex inputs beyond "tabular" data, none of which study +vision-and-language models. This paper presents VLSlice, an interactive system +enabling user-guided discovery of coherent representation-level subgroups with +consistent visiolinguistic behavior, denoted as vision-and-language slices, +from unlabeled image sets. We show that VLSlice enables users to quickly +generate diverse high-coherency slices in a user study (n=22) and release the +tool publicly. + +
+
+ comment: Conference paper at ICCV 2023. 17 pages, 11 figures. + https://ericslyman.com/vlslice/ +
+
+
+
+
+ + ☆ Tackling the Non-IID Issue in Heterogeneous Federated Learning by + Gradient Harmonization + + +
+ Federated learning (FL) is a privacy-preserving paradigm for collaboratively +training a global model from decentralized clients. However, the performance of +FL is hindered by non-independent and identically distributed (non-IID) data +and device heterogeneity. In this work, we revisit this key challenge through +the lens of gradient conflicts on the server side. Specifically, we first +investigate the gradient conflict phenomenon among multiple clients and reveal +that stronger heterogeneity leads to more severe gradient conflicts. To tackle +this issue, we propose FedGH, a simple yet effective method that mitigates +local drifts through Gradient Harmonization. This technique projects one +gradient vector onto the orthogonal plane of the other within conflicting +client pairs. Extensive experiments demonstrate that FedGH consistently +enhances multiple state-of-the-art FL baselines across diverse benchmarks and +non-IID scenarios. Notably, FedGH yields more significant improvements in +scenarios with stronger heterogeneity. As a plug-and-play module, FedGH can be +seamlessly integrated into any FL framework without requiring hyperparameter +tuning. + +
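+
+ As a rough illustration of the projection step described above (not the authors' code; the function name and the use of NumPy on flattened gradients are assumptions), a conflicting pair of client gradients can be harmonized as follows, with each gradient projected onto the plane orthogonal to the other:
+
+     import numpy as np
+
+     def harmonize(g_i, g_j):
+         # Gradient-harmonization sketch: if two client gradients conflict
+         # (negative inner product), project each onto the plane orthogonal
+         # to the other before server-side aggregation.
+         gi, gj = g_i.copy(), g_j.copy()
+         if np.dot(g_i, g_j) < 0:
+             gi = g_i - np.dot(g_i, g_j) / np.dot(g_j, g_j) * g_j
+             gj = g_j - np.dot(g_j, g_i) / np.dot(g_i, g_i) * g_i
+         return gi, gj
+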
+
+
+
+
+ + ☆ Attention Loss Adjusted Prioritized Experience Replay + + +
+ Prioritized Experience Replay (PER) is a technique in deep reinforcement learning that selects more informative experience samples in order to improve the training rate of the neural network. However, the non-uniform sampling used in PER inevitably shifts the state-action space distribution and introduces estimation error into the Q-value function. In this paper, an Attention Loss Adjusted Prioritized (ALAP) Experience Replay algorithm is proposed, which integrates an improved self-attention network with a double-sampling mechanism to fit the hyperparameter that regulates the importance-sampling weights, eliminating the estimation error caused by PER. To verify the effectiveness and generality of the algorithm, ALAP is tested with value-function-based, policy-gradient-based, and multi-agent reinforcement learning algorithms in OpenAI Gym, and comparison studies verify the advantage and efficiency of the proposed training framework. + +
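+
+ For background, the importance-sampling weights that ALAP regulates are the standard PER weights; a minimal sketch of generic PER (not the ALAP attention network, with hypothetical names) is:
+
+     import numpy as np
+
+     def per_probs_and_weights(priorities, alpha=0.6, beta=0.4):
+         # Standard prioritized experience replay (priorities assumed positive):
+         # sampling probabilities P(i) ~ p_i^alpha and
+         # importance-sampling weights w_i = (N * P(i))^(-beta).
+         p = np.asarray(priorities) ** alpha
+         probs = p / p.sum()
+         w = (len(p) * probs) ** (-beta)
+         return probs, w / w.max()   # weights normalized for stability
+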
+
+
+
+
+ + ☆ Federated PAC-Bayesian Learning on Non-IID data + + +
+ Existing research has either adapted the Probably Approximately Correct (PAC) Bayesian framework for federated learning (FL) or used information-theoretic PAC-Bayesian bounds while introducing their theorems, but few works consider the non-IID challenges in FL. Our work presents the first non-vacuous federated PAC-Bayesian bound tailored for non-IID local data. This bound assumes unique prior knowledge for each client and variable aggregation weights. We also introduce an objective function and an innovative Gibbs-based algorithm for the optimization of the derived bound. The results are validated on real-world datasets. + +
+
+
+
+
+ + ☆ Generalizable improvement of the Spalart-Allmaras model through + assimilation of experimental data + + +
+ This study focuses on the use of model and data fusion for improving the Spalart-Allmaras (SA) closure model for Reynolds-averaged Navier-Stokes solutions of separated flows. In particular, our goal is to develop models that not only assimilate sparse experimental data to improve performance in computational models, but also generalize to unseen cases by recovering classical SA behavior. We achieve our goals using data assimilation, namely the Ensemble Kalman Filtering (EnKF) approach, to calibrate the coefficients of the SA model for separated flows. A holistic calibration strategy is implemented via a parameterization of the production, diffusion, and destruction terms. This calibration relies on the assimilation of experimentally collected velocity profiles, skin friction, and pressure coefficients for separated flows. Despite using observational data from a single flow condition around a backward-facing step (BFS), the recalibrated SA model demonstrates generalization to other separated flows, including cases such as the 2D bump and the modified BFS. Significant improvement is observed in the quantities of interest, i.e., the skin friction coefficient ($C_f$) and pressure coefficient ($C_p$), for each flow tested. Finally, it is also demonstrated that the newly proposed model recovers SA proficiency for external, unseparated flows, such as flow around a NACA-0012 airfoil, without any danger of extrapolation, and that the individually calibrated terms in the SA model target specific flow physics: the calibrated production term improves the recirculation zone while the destruction term improves the recovery zone. + +
+
+
+
+
+ + ☆ Sound field decomposition based on two-stage neural networks + + +
+ A method for sound field decomposition based on neural networks is proposed. +The method comprises two stages: a sound field separation stage and a +single-source localization stage. In the first stage, the sound pressure at +microphones synthesized by multiple sources is separated into one excited by +each sound source. In the second stage, the source location is obtained as a +regression from the sound pressure at microphones consisting of a single sound +source. The estimated location is not affected by discretization because the +second stage is designed as a regression rather than a classification. Datasets +are generated by simulation using Green's function, and the neural network is +trained for each frequency. Numerical experiments reveal that, compared with +conventional methods, the proposed method can achieve higher +source-localization accuracy and higher sound-field-reconstruction accuracy. + +
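+
+ The simulated training data mentioned above rely on Green's functions; for a point source in free space, the Helmholtz Green's function takes the standard form below (a generic sketch; the paper's exact simulation setup is not reproduced, and the function name is hypothetical):
+
+     import numpy as np
+
+     def free_field_greens(r, k):
+         # 3-D free-field Green's function G(r) = exp(i*k*r) / (4*pi*r),
+         # commonly used to synthesize microphone pressures from point sources.
+         return np.exp(1j * k * r) / (4.0 * np.pi * r)
+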
+
+ comment: 31 pages, 16 figures +
+
+
+
+
+ + ☆ Generalizable Neural Fields as Partially Observed Neural Processes ICCV 2023 + + +
+ Neural fields, which represent signals as a function parameterized by a neural network, are a promising alternative to traditional discrete vector or grid-based representations. Compared to discrete representations, neural representations scale well with increasing resolution, are continuous, and can be many times differentiable. However, given a dataset of signals that we would like to represent, having to optimize a separate neural field for each signal is inefficient and cannot capitalize on shared information or structures among signals. Existing generalization methods view this as a meta-learning problem and employ gradient-based meta-learning to learn an initialization which is then fine-tuned with test-time optimization, or learn hypernetworks to produce the weights of a neural field. We instead propose a new paradigm that views the large-scale training of neural representations as part of a partially observed neural process framework, and leverage neural process algorithms to solve this task. We demonstrate that this approach outperforms both state-of-the-art gradient-based meta-learning approaches and hypernetwork approaches. + +
+
+ comment: To appear ICCV 2023 +
+
+
+
+
+ + ☆ Dissipative Imitation Learning for Discrete Dynamic Output Feedback + Control with Sparse Data Sets + + +
+ Imitation learning enables the synthesis of controllers for complex +objectives and highly uncertain plant models. However, methods to provide +stability guarantees to imitation learned controllers often rely on large +amounts of data and/or known plant models. In this paper, we explore an +input-output (IO) stability approach to dissipative imitation learning, which +achieves stability with sparse data sets and with little known about the plant +model. A closed-loop stable dynamic output feedback controller is learned using +expert data, a coarse IO plant model, and a new constraint to enforce +dissipativity on the learned controller. While the learning objective is +nonconvex, iterative convex overbounding (ICO) and projected gradient descent +(PGD) are explored as methods to successfully learn the controller. This new +imitation learning method is applied to two unknown plants and compared to +traditionally learned dynamic output feedback controller and neural network +controller. With little knowledge of the plant model and a small data set, the +dissipativity constrained learned controller achieves closed loop stability and +successfully mimics the behavior of the expert controller, while other methods +often fail to maintain stability and achieve good performance. + +
+
+
+
+
+ + ☆ Offline Prompt Evaluation and Optimization with Inverse Reinforcement + Learning + + +
+ The recent advances in the development of Large Language Models (LLMs) like ChatGPT have achieved remarkable performance by leveraging human expertise. Yet, fully eliciting LLMs' potential for complex tasks requires navigating the vast search space of natural language prompts. While prompt engineering has shown promise, the requisite human-crafted prompts in trial-and-error attempts and the associated costs pose significant challenges. Crucially, the efficiency of prompt optimization hinges on the costly procedure of prompt evaluation. This work introduces Prompt-OIRL, an approach rooted in offline inverse reinforcement learning that seeks to bridge the gap between effective prompt evaluation and affordability. Our method draws on offline datasets from expert evaluations, employing inverse RL to derive a reward model for offline, query-dependent prompt evaluation. The advantages of Prompt-OIRL are manifold: it predicts prompt performance, is cost-efficient, produces human-readable results, and efficiently navigates the prompt space. We validate our method across four LLMs and three arithmetic datasets, highlighting its potential as a robust and effective tool for offline prompt evaluation and optimization. Our code as well as the offline datasets are released, and we highlight that Prompt-OIRL can be reproduced within a few hours on a single laptop using only a CPU. + +
+
+
+
+
+ + ☆ Out of Distribution Detection via Domain-Informed Gaussian Process State + Space Models + + +
+ In order for robots to safely navigate in unseen scenarios using +learning-based methods, it is important to accurately detect +out-of-training-distribution (OoD) situations online. Recently, Gaussian +process state-space models (GPSSMs) have proven useful to discriminate +unexpected observations by comparing them against probabilistic predictions. +However, the capability for the model to correctly distinguish between in- and +out-of-training distribution observations hinges on the accuracy of these +predictions, primarily affected by the class of functions the GPSSM kernel can +represent. In this paper, we propose (i) a novel approach to embed existing +domain knowledge in the kernel and (ii) an OoD online runtime monitor, based on +receding-horizon predictions. Domain knowledge is assumed given as a dataset +collected either in simulation or using a nominal model. Numerical results show +that the informed kernel yields better regression quality with smaller +datasets, as compared to standard kernel choices. We demonstrate the +effectiveness of the OoD monitor on a real quadruped navigating an indoor +setting, which reliably classifies previously unseen terrains. + +
+
+ comment: 7 pages, 4 figures +
+
+
+
+
+ + ☆ ConR: Contrastive Regularizer for Deep Imbalanced Regression + + +
+ Imbalanced distributions are ubiquitous in real-world data. They challenge Deep Neural Networks to represent the minority labels and avoid bias towards majority labels. The extensive body of work on imbalanced learning addresses categorical label spaces but fails to extend effectively to regression problems, where the label space is continuous. Conversely, local and global correlations among continuous labels provide valuable insights towards effectively modelling relationships in feature space. In this work, we propose ConR, a contrastive regularizer that models global and local label similarities in feature space and prevents the features of minority samples from being collapsed into their majority neighbours. Using the similarity of predictions as an indicator of feature similarity, ConR discerns the disagreements between the label space and the feature space and imposes a penalty on these disagreements. ConR respects the continuous nature of the label space with two main contrastive strategies: incorrect proximities are penalized in proportion to label similarities, and correct ones are encouraged to model local similarities. ConR consolidates essential considerations into a generic, easy-to-integrate, and efficient method that effectively addresses deep imbalanced regression. Moreover, ConR is orthogonal to existing approaches and smoothly extends to uni- and multi-dimensional label spaces. Our comprehensive experiments show that ConR significantly boosts the performance of all the state-of-the-art methods on three large-scale deep imbalanced regression benchmarks. Our code is publicly available at https://github.com/BorealisAI/ConR. + +
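+
+ A minimal sketch of a label-similarity-weighted contrastive penalty in the spirit described above (illustrative only, assuming PyTorch, 1-D continuous labels, and hypothetical names; this is not the exact ConR loss):
+
+     import torch
+     import torch.nn.functional as F
+
+     def label_aware_penalty(feats, labels):
+         # Penalize pairs whose features are similar although their continuous
+         # labels are far apart; pairs with close labels are left unpenalized.
+         f = F.normalize(feats, dim=1)
+         feat_sim = (f @ f.t()).clamp(min=0)                 # cosine similarity
+         label_dist = (labels[:, None] - labels[None, :]).abs()
+         w = label_dist / (label_dist.max() + 1e-8)          # 0 for similar labels
+         off_diag = ~torch.eye(len(labels), dtype=torch.bool)
+         return (w * feat_sim)[off_diag].mean()
+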
+
+
+
+
+ + ☆ Tackling the dimensions in imaging genetics with CLUB-PLS + + +
+ A major challenge in imaging genetics and similar fields is to link high-dimensional data in one domain, e.g., genetic data, to high-dimensional data in a second domain, e.g., brain imaging data. The standard approach in the area is mass univariate analysis across genetic factors and imaging phenotypes, which entails executing one genome-wide association study (GWAS) for each pre-defined imaging measure. Although this approach has been tremendously successful, one shortcoming is that phenotypes must be pre-defined. Consequently, effects that are not confined to pre-selected regions of interest or that reflect larger brain-wide patterns can easily be missed. In this work we introduce a Partial Least Squares (PLS)-based framework, which we term Cluster-Bootstrap PLS (CLUB-PLS), that can work with large input dimensions in both domains as well as with large sample sizes. One key element of the framework is the use of the cluster bootstrap to provide robust statistics for single input features in both domains. We applied CLUB-PLS to investigate the genetic basis of surface area and cortical thickness in a sample of 33,000 subjects from the UK Biobank. We found 107 genome-wide significant locus-phenotype pairs that are linked to 386 different genes. We found that the vast majority of these loci could be technically validated at a high rate: using classic GWAS or Genome-Wide Inferred Statistics (GWIS), we found that 85 locus-phenotype pairs exceeded the genome-wide suggestive (P<1e-05) threshold. + +
+
+ comment: 12 pages, 4 Figures, 2 Tables +
+
+
+
+
+ + ☆ Efficient Learning of PDEs via Taylor Expansion and Sparse Decomposition + into Value and Fourier Domains + + +
+ Accelerating the learning of Partial Differential Equations (PDEs) from +experimental data will speed up the pace of scientific discovery. Previous +randomized algorithms exploit sparsity in PDE updates for acceleration. However +such methods are applicable to a limited class of decomposable PDEs, which have +sparse features in the value domain. We propose Reel, which accelerates the +learning of PDEs via random projection and has much broader applicability. Reel +exploits the sparsity by decomposing dense updates into sparse ones in both the +value and frequency domains. This decomposition enables efficient learning when +the source of the updates consists of gradually changing terms across large +areas (sparse in the frequency domain) in addition to a few rapid updates +concentrated in a small set of "interfacial" regions (sparse in the value +domain). Random projection is then applied to compress the sparse signals for +learning. To expand the model applicability, Taylor series expansion is used in +Reel to approximate the nonlinear PDE updates with polynomials in the +decomposable form. Theoretically, we derive a constant factor approximation +between the projected loss function and the original one with poly-logarithmic +number of projected dimensions. Experimentally, we provide empirical evidence +that our proposed Reel can lead to faster learning of PDE models (70-98% +reduction in training time when the data is compressed to 1% of its original +size) with comparable quality as the non-compressed models. + +
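+
+ A toy sketch of the decomposition idea described above (illustrative only; the 1-D setting and variable names are assumptions, not the authors' code): a dense update is split into a part that is sparse in the frequency domain and a localized residual that is roughly sparse in the value domain, and the result is compressed with a random projection:
+
+     import numpy as np
+
+     rng = np.random.default_rng(0)
+     n = 1024
+     x = np.linspace(0.0, 1.0, n)
+     u = np.sin(2 * np.pi * x) + (np.abs(x - 0.5) < 0.01)  # smooth field + sharp interface
+
+     U = np.fft.rfft(u)
+     keep = np.argsort(np.abs(U))[-8:]           # few dominant Fourier modes
+     U_sparse = np.zeros_like(U)
+     U_sparse[keep] = U[keep]
+     smooth = np.fft.irfft(U_sparse, n=n)        # sparse in the frequency domain
+     residual = u - smooth                       # localized near the "interfacial" region
+
+     P = rng.normal(size=(64, n)) / np.sqrt(64)  # random projection for compression
+     z = P @ residual
+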
+
+
+
+
+ + ☆ Efficient quantum recurrent reinforcement learning via quantum reservoir + computing + + +
+ Quantum reinforcement learning (QRL) has emerged as a framework to solve sequential decision-making tasks, showcasing empirical quantum advantages. A notable development is the use of quantum recurrent neural networks (QRNNs) for memory-intensive tasks such as partially observable environments. However, QRL models incorporating QRNNs encounter challenges such as inefficient training, given that the computation of gradients in a QRNN is both computationally expensive and time-consuming. This work presents a novel approach to address this challenge by constructing QRL agents utilizing QRNN-based reservoirs, specifically employing quantum long short-term memory (QLSTM). QLSTM parameters are randomly initialized and fixed without training. The model is trained using the asynchronous advantage actor-critic (A3C) algorithm. Through numerical simulations, we validate the efficacy of our QLSTM-Reservoir RL framework. Its performance is assessed on standard benchmarks, demonstrating comparable results to a fully trained QLSTM RL model with identical architecture and training settings. + +
+
+
+
+
+ + ☆ Reliability-based cleaning of noisy training labels with inductive + conformal prediction in multi-modal biomedical data mining + + +
+ Accurately labeling biomedical data presents a challenge. Traditional +semi-supervised learning methods often under-utilize available unlabeled data. +To address this, we propose a novel reliability-based training data cleaning +method employing inductive conformal prediction (ICP). This method capitalizes +on a small set of accurately labeled training data and leverages ICP-calculated +reliability metrics to rectify mislabeled data and outliers within vast +quantities of noisy training data. The efficacy of the method is validated +across three classification tasks within distinct modalities: filtering +drug-induced-liver-injury (DILI) literature with title and abstract, predicting +ICU admission of COVID-19 patients through CT radiomics and electronic health +records, and subtyping breast cancer using RNA-sequencing data. Varying levels +of noise to the training labels were introduced through label permutation. +Results show significant enhancements in classification performance: accuracy +enhancement in 86 out of 96 DILI experiments (up to 11.4%), AUROC and AUPRC +enhancements in all 48 COVID-19 experiments (up to 23.8% and 69.8%), and +accuracy and macro-average F1 score improvements in 47 out of 48 RNA-sequencing +experiments (up to 74.6% and 89.0%). Our method offers the potential to +substantially boost classification performance in multi-modal biomedical +machine learning tasks. Importantly, it accomplishes this without necessitating +an excessive volume of meticulously curated training data. + +
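+
+ A minimal sketch of the inductive-conformal-prediction idea used above (generic ICP, not the paper's pipeline; names are hypothetical): nonconformity scores from a clean calibration set yield a p-value for each noisy training label, and labels with very low p-values become candidates for correction or removal.
+
+     import numpy as np
+
+     def icp_p_values(cal_scores, test_scores):
+         # p(s) = (#{calibration scores >= s} + 1) / (n + 1)
+         cal = np.sort(np.asarray(cal_scores))
+         n = len(cal)
+         idx = np.searchsorted(cal, np.asarray(test_scores), side="left")
+         return (n - idx + 1) / (n + 1)
+
+     # Example policy: flag training points whose assigned-label p-value is below 0.05.
+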
+
+
+
+
+ + ☆ Traveling Words: A Geometric Interpretation of Transformers + + +
+ Transformers have significantly advanced the field of natural language +processing, but comprehending their internal mechanisms remains a challenge. In +this paper, we introduce a novel geometric perspective that elucidates the +inner mechanisms of transformer operations. Our primary contribution is +illustrating how layer normalization confines the latent features to a +hyper-sphere, subsequently enabling attention to mold the semantic +representation of words on this surface. This geometric viewpoint seamlessly +connects established properties such as iterative refinement and contextual +embeddings. We validate our insights by probing a pre-trained 124M parameter +GPT-2 model. Our findings reveal clear query-key attention patterns in early +layers and build upon prior observations regarding the subject-specific nature +of attention heads at deeper layers. Harnessing these geometric insights, we +present an intuitive understanding of transformers, depicting them as processes +that model the trajectory of word particles along the hyper-sphere. + +
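+
+ The hyper-sphere picture above follows from layer normalization itself: with unit gain and zero bias, a normalized d-dimensional vector has norm close to sqrt(d). A small sanity check (a sketch, not the paper's code):
+
+     import numpy as np
+
+     def layer_norm(x, eps=1e-5):
+         # Plain layer normalization (gain = 1, bias = 0).
+         return (x - x.mean()) / np.sqrt(x.var() + eps)
+
+     x = np.random.randn(768)
+     h = layer_norm(x)
+     print(np.linalg.norm(h), np.sqrt(768))  # both ~27.7: h lies on a sphere of radius sqrt(d)
+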
+
+
+
+
+ + ☆ User Training with Error Augmentation for Electromyogram-based Gesture + Classification + + +
+ We designed and tested a system for real-time control of a user interface by +extracting surface electromyographic (sEMG) activity from eight electrodes in a +wrist-band configuration. sEMG data were streamed into a machine-learning +algorithm that classified hand gestures in real-time. After an initial model +calibration, participants were presented with one of three types of feedback +during a human-learning stage: veridical feedback, in which predicted +probabilities from the gesture classification algorithm were displayed without +alteration, modified feedback, in which we applied a hidden augmentation of +error to these probabilities, and no feedback. User performance was then +evaluated in a series of minigames, in which subjects were required to use +eight gestures to manipulate their game avatar to complete a task. Experimental +results indicated that, relative to baseline, the modified feedback condition +led to significantly improved accuracy and improved gesture class separation. +These findings suggest that real-time feedback in a gamified user interface +with manipulation of feedback may enable intuitive, rapid, and accurate task +acquisition for sEMG-based gesture recognition applications. + +
+
+ comment: 10 pages, 10 figures +
+
+
+
+
+ + ☆ Unbiased Face Synthesis With Diffusion Models: Are We There Yet? + + +
+ Text-to-image diffusion models have achieved widespread popularity due to +their unprecedented image generation capability. In particular, their ability +to synthesize and modify human faces has spurred research into using generated +face images in both training data augmentation and model performance +assessments. In this paper, we study the efficacy and shortcomings of +generative models in the context of face generation. Utilizing a combination of +qualitative and quantitative measures, including embedding-based metrics and +user studies, we present a framework to audit the characteristics of generated +faces conditioned on a set of social attributes. We applied our framework on +faces generated through state-of-the-art text-to-image diffusion models. We +identify several limitations of face image generation that include faithfulness +to the text prompt, demographic disparities, and distributional shifts. +Furthermore, we present an analytical model that provides insights into how +training data selection contributes to the performance of generative models. + +
+
+
+
+
+ + ☆ Safe and Accelerated Deep Reinforcement Learning-based O-RAN Slicing: A + Hybrid Transfer Learning Approach + + +
+ The open radio access network (O-RAN) architecture supports intelligent +network control algorithms as one of its core capabilities. Data-driven +applications incorporate such algorithms to optimize radio access network (RAN) +functions via RAN intelligent controllers (RICs). Deep reinforcement learning +(DRL) algorithms are among the main approaches adopted in the O-RAN literature +to solve dynamic radio resource management problems. However, despite the +benefits introduced by the O-RAN RICs, the practical adoption of DRL algorithms +in real network deployments falls behind. This is primarily due to the slow +convergence and unstable performance exhibited by DRL agents upon deployment +and when facing previously unseen network conditions. In this paper, we address +these challenges by proposing transfer learning (TL) as a core component of the +training and deployment workflows for the DRL-based closed-loop control of +O-RAN functionalities. To this end, we propose and design a hybrid TL-aided +approach that leverages the advantages of both policy reuse and distillation TL +methods to provide safe and accelerated convergence in DRL-based O-RAN slicing. +We conduct a thorough experiment that accommodates multiple services, including +real VR gaming traffic to reflect practical scenarios of O-RAN slicing. We also +propose and implement policy reuse and distillation-aided DRL and non-TL-aided +DRL as three separate baselines. The proposed hybrid approach shows at least: +7.7% and 20.7% improvements in the average initial reward value and the +percentage of converged scenarios, and a 64.6% decrease in reward variance +while maintaining fast convergence and enhancing the generalizability compared +with the baselines. + +
+
+ comment: This paper has been accepted for publication in a future issue of + IEEE Journal on Selected Areas in Communications (JSAC) +
+
+
+
+
+ + ☆ Simultaneous inference for generalized linear models with unmeasured + confounders + + +
+ Tens of thousands of simultaneous hypothesis tests are routinely performed in +genomic studies to identify differentially expressed genes. However, due to +unmeasured confounders, many standard statistical approaches may be +substantially biased. This paper investigates the large-scale hypothesis +testing problem for multivariate generalized linear models in the presence of +confounding effects. Under arbitrary confounding mechanisms, we propose a +unified statistical estimation and inference framework that harnesses +orthogonal structures and integrates linear projections into three key stages. +It first leverages multivariate responses to separate marginal and uncorrelated +confounding effects, recovering the confounding coefficients' column space. +Subsequently, latent factors and primary effects are jointly estimated, +utilizing $\ell_1$-regularization for sparsity while imposing orthogonality +onto confounding coefficients. Finally, we incorporate projected and weighted +bias-correction steps for hypothesis testing. Theoretically, we establish +various effects' identification conditions and non-asymptotic error bounds. We +show effective Type-I error control of asymptotic $z$-tests as sample and +response sizes approach infinity. Numerical experiments demonstrate that the +proposed method controls the false discovery rate by the Benjamini-Hochberg +procedure and is more powerful than alternative methods. By comparing +single-cell RNA-seq counts from two groups of samples, we demonstrate the +suitability of adjusting confounding effects when significant covariates are +absent from the model. + +
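+
+ The Benjamini-Hochberg step-up procedure referenced above is standard; a compact sketch (not the authors' implementation) is:
+
+     import numpy as np
+
+     def benjamini_hochberg(pvals, q=0.05):
+         # Reject the hypotheses with the k smallest p-values, where k is the
+         # largest index with p_(k) <= k * q / m.
+         p = np.asarray(pvals)
+         m = len(p)
+         order = np.argsort(p)
+         passed = p[order] <= q * np.arange(1, m + 1) / m
+         k = passed.nonzero()[0].max() + 1 if passed.any() else 0
+         reject = np.zeros(m, dtype=bool)
+         reject[order[:k]] = True
+         return reject
+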
+
+ comment: 61 pages, 8 figures +
+
+
+
+
+ + ☆ All you need is spin: SU(2) equivariant variational quantum circuits + based on spin networks + + +
+ Variational algorithms require architectures that naturally constrain the +optimisation space to run efficiently. In geometric quantum machine learning, +one achieves this by encoding group structure into parameterised quantum +circuits to include the symmetries of a problem as an inductive bias. However, +constructing such circuits is challenging as a concrete guiding principle has +yet to emerge. In this paper, we propose the use of spin networks, a form of +directed tensor network invariant under a group transformation, to devise SU(2) +equivariant quantum circuit ans\"atze -- circuits possessing spin rotation +symmetry. By changing to the basis that block diagonalises SU(2) group action, +these networks provide a natural building block for constructing parameterised +equivariant quantum circuits. We prove that our construction is mathematically +equivalent to other known constructions, such as those based on twirling and +generalised permutations, but more direct to implement on quantum hardware. The +efficacy of our constructed circuits is tested by solving the ground state +problem of SU(2) symmetric Heisenberg models on the one-dimensional triangular +lattice and on the Kagome lattice. Our results highlight that our equivariant +circuits boost the performance of quantum variational algorithms, indicating +broader applicability to other real-world problems. + +
+
+ comment: 36+14 pages +
+
+
+
+
+ + ☆ Autotuning Apache TVM-based Scientific Applications Using Bayesian + Optimization + + +
+ Apache TVM (Tensor Virtual Machine), an open source machine learning compiler +framework designed to optimize computations across various hardware platforms, +provides an opportunity to improve the performance of dense matrix +factorizations such as LU (Lower Upper) decomposition and Cholesky +decomposition on GPUs and AI (Artificial Intelligence) accelerators. In this +paper, we propose a new TVM autotuning framework using Bayesian Optimization +and use the TVM tensor expression language to implement linear algebra kernels +such as LU, Cholesky, and 3mm. We use these scientific computation kernels to +evaluate the effectiveness of our methods on a GPU cluster, called Swing, at +Argonne National Laboratory. We compare the proposed autotuning framework with +the TVM autotuning framework AutoTVM with four tuners and find that our +framework outperforms AutoTVM in most cases. + +
+
+
+
+
+ + ☆ EarthPT: a foundation model for Earth Observation NeurIPS + + +
+ We introduce EarthPT -- an Earth Observation (EO) pretrained transformer. +EarthPT is a 700 million parameter decoding transformer foundation model +trained in an autoregressive self-supervised manner and developed specifically +with EO use-cases in mind. We demonstrate that EarthPT is an effective +forecaster that can accurately predict future pixel-level surface reflectances +across the 400-2300 nm range well into the future. For example, forecasts of +the evolution of the Normalised Difference Vegetation Index (NDVI) have a +typical error of approximately 0.05 (over a natural range of -1 -> 1) at the +pixel level over a five month test set horizon, out-performing simple +phase-folded models based on historical averaging. We also demonstrate that +embeddings learnt by EarthPT hold semantically meaningful information and could +be exploited for downstream tasks such as highly granular, dynamic land use +classification. Excitingly, we note that the abundance of EO data provides us +with -- in theory -- quadrillions of training tokens. Therefore, if we assume +that EarthPT follows neural scaling laws akin to those derived for Large +Language Models (LLMs), there is currently no data-imposed limit to scaling +EarthPT and other similar `Large Observation Models.' + +
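+
+ For reference, the NDVI forecast above is the standard normalized-difference index computed from near-infrared and red surface reflectances (a generic definition, not code from the paper):
+
+     def ndvi(nir, red):
+         # Normalised Difference Vegetation Index; values lie in [-1, 1].
+         return (nir - red) / (nir + red)
+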
+
+ comment: 7 pages, 4 figures, submitted to NeurIPS CCAI workshop +
+
+
+
+
+ + ☆ Latent Representation and Simulation of Markov Processes via Time-Lagged + Information Bottleneck + + +
+ Markov processes are widely used mathematical models for describing dynamic +systems in various fields. However, accurately simulating large-scale systems +at long time scales is computationally expensive due to the short time steps +required for accurate integration. In this paper, we introduce an inference +process that maps complex systems into a simplified representational space and +models large jumps in time. To achieve this, we propose Time-lagged Information +Bottleneck (T-IB), a principled objective rooted in information theory, which +aims to capture relevant temporal features while discarding high-frequency +information to simplify the simulation task and minimize the inference error. +Our experiments demonstrate that T-IB learns information-optimal +representations for accurately modeling the statistical properties and dynamics +of the original process at a selected time lag, outperforming existing +time-lagged dimensionality reduction methods. + +
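+
+ As a rough orientation, a time-lagged variant of the classical information-bottleneck objective with lag $\tau$ and trade-off $\beta$ would read as follows; the paper's exact T-IB formulation may differ:
+
+     \max_{p(z_t \mid x_t)} \; I(Z_t; X_{t+\tau}) \;-\; \beta \, I(Z_t; X_t)
+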
+
+ comment: 10 pages, 14 figures +
+
+
+
+
+ + ♻ ☆ Optimal transport for automatic alignment of untargeted metabolomic data + + +
+ Untargeted metabolomic profiling through liquid chromatography-mass +spectrometry (LC-MS) measures a vast array of metabolites within biospecimens, +advancing drug development, disease diagnosis, and risk prediction. However, +the low throughput of LC-MS poses a major challenge for biomarker discovery, +annotation, and experimental comparison, necessitating the merging of multiple +datasets. Current data pooling methods encounter practical limitations due to +their vulnerability to data variations and hyperparameter dependence. Here we +introduce GromovMatcher, a flexible and user-friendly algorithm that +automatically combines LC-MS datasets using optimal transport. By capitalizing +on feature intensity correlation structures, GromovMatcher delivers superior +alignment accuracy and robustness compared to existing approaches. This +algorithm scales to thousands of features requiring minimal hyperparameter +tuning. Applying our method to experimental patient studies of liver and +pancreatic cancer, we discover shared metabolic features related to patient +alcohol intake, demonstrating how GromovMatcher facilitates the search for +biomarkers associated with lifestyle risk factors linked to several cancer +types. + +
+
+ comment: 43 pages, 11 figures +
+
+
+
+
+ + ♻ ☆ Perseus: A Simple and Optimal High-Order Method for Variational + Inequalities + + +
+ This paper settles an open and challenging question pertaining to the design +of simple and optimal high-order methods for solving smooth and monotone +variational inequalities (VIs). A VI involves finding $x^\star \in \mathcal{X}$ +such that $\langle F(x), x - x^\star\rangle \geq 0$ for all $x \in +\mathcal{X}$. We consider the setting in which $F$ is smooth with up to +$(p-1)^{th}$-order derivatives. For $p = 2$, the cubic regularized Newton +method was extended to VIs with a global rate of $O(\epsilon^{-1})$. An +improved rate of $O(\epsilon^{-2/3}\log\log(1/\epsilon))$ can be obtained via +an alternative second-order method, but this method requires a nontrivial +line-search procedure as an inner loop. Similarly, high-order methods based on +line-search procedures have been shown to achieve a rate of +$O(\epsilon^{-2/(p+1)}\log\log(1/\epsilon))$. As emphasized by Nesterov, +however, such procedures do not necessarily imply practical applicability in +large-scale applications, and it would be desirable to complement these results +with a simple high-order VI method that retains the optimality of the more +complex methods. We propose a $p^{th}$-order method that does \textit{not} +require any line search procedure and provably converges to a weak solution at +a rate of $O(\epsilon^{-2/(p+1)})$. We prove that our $p^{th}$-order method is +optimal in the monotone setting by establishing a matching lower bound under a +generalized linear span assumption. Our method with restarting attains a linear +rate for smooth and strictly monotone VIs and a local superlinear rate for +smooth and strongly monotone VIs. Our method also achieves a global rate of +$O(\epsilon^{-2/p})$ for solving smooth and nonmonotone VIs satisfying the +Minty condition and when augmented with restarting it attains a global linear +and local superlinear rate for smooth and nonmonotone VIs satisfying the +strictly/strong Minty condition. + +
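+
+ For reference, the standard notions invoked above can be written explicitly (textbook definitions, not specific to this paper): $F$ is monotone on $\mathcal{X}$ and $x^\star$ is a weak (Minty) solution when, respectively,
+
+     \langle F(x) - F(y),\, x - y \rangle \ge 0 \quad \forall\, x, y \in \mathcal{X},
+     \qquad
+     \langle F(x),\, x - x^\star \rangle \ge 0 \quad \forall\, x \in \mathcal{X}.
+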
+
+ comment: Improve the paper significantly; 40 pages +
+
+
+
+
+ + ♻ ☆ NExT-GPT: Any-to-Any Multimodal LLM + + +
+ While Multimodal Large Language Models (MM-LLMs) have recently made exciting strides, they mostly fall prey to the limitation of input-side-only multimodal understanding, without the ability to produce content in multiple modalities. As we humans always perceive the world and communicate with people through various modalities, developing any-to-any MM-LLMs capable of accepting and delivering content in any modality becomes essential to human-level AI. To fill the gap, we present an end-to-end general-purpose any-to-any MM-LLM system, NExT-GPT. We connect an LLM with multimodal adaptors and different diffusion decoders, enabling NExT-GPT to perceive inputs and generate outputs in arbitrary combinations of text, images, videos, and audio. By leveraging existing well-trained, highly performing encoders and decoders, NExT-GPT is tuned with only a small number of parameters (1%) in certain projection layers, which not only benefits low-cost training but also facilitates convenient expansion to more potential modalities. Moreover, we introduce a modality-switching instruction tuning (MosIT) and manually curate a high-quality dataset for MosIT, based on which NExT-GPT is empowered with complex cross-modal semantic understanding and content generation. Overall, our research showcases the promising possibility of building an AI agent capable of modeling universal modalities, paving the way for more human-like AI research in the community. Project page: https://next-gpt.github.io/ + +
+
+ comment: work in progress +
+
+
+
+
+ + ♻ ☆ Imprecise Bayesian Neural Networks + + +
+ Uncertainty quantification and robustness to distribution shifts are important goals in machine learning and artificial intelligence. Although Bayesian Neural Networks (BNNs) allow the uncertainty in the predictions to be assessed, different sources of uncertainty are indistinguishable. We present Imprecise Bayesian Neural Networks (IBNNs); they generalize and overcome some of the drawbacks of standard BNNs. The latter are trained using single prior and likelihood distributions, whereas IBNNs are trained using credal prior and likelihood sets. IBNNs allow one to distinguish between aleatoric and epistemic uncertainties, and to quantify them. In addition, IBNNs are more robust than BNNs to prior and likelihood misspecification, and to distribution shift. They can also be used to compute sets of outcomes that enjoy probabilistic guarantees. We apply IBNNs to two case studies: one on motion prediction in autonomous driving scenarios, and one on modeling blood glucose and insulin dynamics for artificial pancreas control. We show that IBNNs perform better than an ensemble-of-BNNs benchmark. + +
+
+
+
+
+ + ♻ ☆ Optimizing Offensive Gameplan in the National Basketball Association + with Machine Learning + + +
+ Throughout the analytical revolution that has occurred in the NBA, the development of specific metrics and formulas has given teams, coaches, and players a new way to see the game. The question arises, however: how can we verify any of these metrics? One method would simply be eyeball approximation (trying out many different gameplans) and/or trial and error, an estimation-based and costly approach. Another approach is to try to model already existing metrics with a unique set of features using machine learning techniques. The key to this approach is that with the selected features we can try to gauge their combined effectiveness, rather than relying on individual analysis in simple metric evaluation. If we have an accurate model, it can particularly help us determine the specifics of gameplan execution. In this paper, the statistic ORTG (Offensive Rating, developed by Dean Oliver) was found to correlate with different NBA playtypes using both a linear regression model and a neural network regression model, with the neural network ultimately working slightly better than linear regression. Using the accuracy of the models as justification, the next step was to optimize the output of the model with test examples, which demonstrates the combination of features that best achieves a highly functioning offense.
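As a rough illustration of the modeling step described above (not the paper's data or code), an offensive rating could be regressed on playtype features with scikit-learn; the feature matrix, coefficients, and noise below are synthetic stand-ins.

import numpy as np
from sklearn.linear_model import LinearRegression
from sklearn.neural_network import MLPRegressor
from sklearn.model_selection import train_test_split

# Hypothetical playtype features (e.g., frequency of pick-and-roll, isolation, ...).
rng = np.random.default_rng(0)
X = rng.random((300, 6))                      # 300 team-seasons, 6 playtype features
y = 100 + X @ np.array([5, -3, 2, 4, -1, 3]) + rng.normal(0, 1, 300)  # stand-in for ORTG

X_tr, X_te, y_tr, y_te = train_test_split(X, y, random_state=0)

linreg = LinearRegression().fit(X_tr, y_tr)
mlp = MLPRegressor(hidden_layer_sizes=(32, 32), max_iter=2000, random_state=0).fit(X_tr, y_tr)

print("linear R^2:", linreg.score(X_te, y_te))
print("MLP R^2:   ", mlp.score(X_te, y_te))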
+
+ comment: 6 pages, 4 figures. Revision: Corrected text and citation formatting + issues +
+
+
+
+
+ + ♻ ☆ Efficient Sensor Placement from Regression with Sparse Gaussian + Processes in Continuous and Discrete Spaces + + +
+ The sensor placement problem is a common problem that arises when monitoring correlated phenomena, such as temperature and precipitation. Existing approaches to this problem typically use discrete optimization methods, which are computationally expensive and cannot scale to large problems. We address the sensor placement problem in correlated environments by reducing it to a regression problem that can be efficiently solved using sparse Gaussian processes (SGPs). Our approach can handle both discrete sensor placement problems, where sensors are limited to a subset of a given set of locations, and continuous sensor placement problems, where sensors can be placed anywhere in a bounded continuous region. Our experimental results on three real-world datasets show that our approach generates sensor placements that result in reconstruction quality that is consistently on par with or better than the prior state-of-the-art approach while being significantly faster. Our computationally efficient approach enables both large-scale sensor placement and fast robotic sensor placement for informative path planning algorithms.
+
+ comment: preprint +
+
+
+
+
+ + ♻ ☆ Spaiche: Extending State-of-the-Art ASR Models to Swiss German Dialects + + +
+ Recent breakthroughs in NLP have greatly increased the presence of ASR systems in our daily lives. However, for many low-resource languages, ASR models still need to be improved, due in part to the difficulty of acquiring pertinent data. This project aims to help advance research on ASR models for Swiss German dialects by providing insights into the performance of state-of-the-art ASR models on recently published Swiss German speech datasets. We propose a novel loss that takes into account the semantic distance between the predicted and the ground-truth labels. We outperform current state-of-the-art results by fine-tuning OpenAI's Whisper model on Swiss German datasets.
+
+ comment: 8 pages, SwissText conference +
+
+
+
+
+ + ♻ ☆ Switch and Conquer: Efficient Algorithms By Switching Stochastic + Gradient Oracles For Decentralized Saddle Point Problems + + +
+ We consider a class of non-smooth strongly convex-strongly concave saddle point problems in a decentralized setting without a central server. To solve a consensus formulation of problems in this class, we develop an inexact primal dual hybrid gradient (inexact PDHG) procedure that allows generic gradient computation oracles to update the primal and dual variables. We first investigate the performance of inexact PDHG with the stochastic variance reduced gradient (SVRG) oracle. Our numerical study uncovers a significant phenomenon of initially conservative progress of the iterates of inexact PDHG with the SVRG oracle. To tackle this, we develop a simple and effective switching idea, where a generalized stochastic gradient (GSG) computation oracle is employed to hasten the iterates' progress to a saddle point solution during the initial phase of updates, followed by a switch to the SVRG oracle at an appropriate juncture. The proposed algorithm is named Decentralized Proximal Switching Stochastic Gradient method with Compression (C-DPSSG), and is proven to converge to an $\epsilon$-accurate saddle point solution at a linear rate. Apart from delivering highly accurate solutions, our study reveals that utilizing the best convergence phases of the GSG and SVRG oracles makes C-DPSSG well suited for obtaining solutions of low/medium accuracy faster, which is useful for certain applications. Numerical experiments on two benchmark machine learning applications show C-DPSSG's competitive performance, which validates our theoretical findings. The codes used in the experiments can be found \href{https://github.com/chhavisharma123/C-DPSSG-CDC2023}{here}.
+
+ comment: arXiv admin note: substantial text overlap with arXiv:2205.14452 +
+
+
+
+
+ + ♻ ☆ Maximum Mean Discrepancy Meets Neural Networks: The + Radon-Kolmogorov-Smirnov Test + + +
+ Maximum mean discrepancy (MMD) refers to a general class of nonparametric two-sample tests that are based on maximizing the mean difference over samples from one distribution $P$ versus another $Q$, over all choices of data transformations $f$ living in some function space $\mathcal{F}$. Inspired by recent work that connects what are known as functions of $\textit{Radon bounded variation}$ (RBV) and neural networks (Parhi and Nowak, 2021, 2023), we study the MMD defined by taking $\mathcal{F}$ to be the unit ball in the RBV space of a given smoothness order $k \geq 0$. This test, which we refer to as the $\textit{Radon-Kolmogorov-Smirnov}$ (RKS) test, can be viewed as a generalization of the well-known and classical Kolmogorov-Smirnov (KS) test to multiple dimensions and higher orders of smoothness. It is also intimately connected to neural networks: we prove that the witness in the RKS test -- the function $f$ achieving the maximum mean difference -- is always a ridge spline of degree $k$, i.e., a single neuron in a neural network. This allows us to leverage the power of modern deep learning toolkits to (approximately) optimize the criterion that underlies the RKS test. We prove that the RKS test has asymptotically full power at distinguishing any distinct pair $P \not= Q$ of distributions, derive its asymptotic null distribution, and carry out extensive experiments to elucidate the strengths and weaknesses of the RKS test versus the more traditional kernel MMD test.
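Spelled out in the notation above (our restatement, not a quotation from the paper), the statistic being maximized is $\mathrm{MMD}_{\mathcal{F}}(P,Q) = \sup_{f \in \mathcal{F}} \big\{ \mathbb{E}_{X \sim P}[f(X)] - \mathbb{E}_{Y \sim Q}[f(Y)] \big\}$, with $\mathcal{F}$ the unit ball of the RBV space of order $k$ and expectations replaced by sample means in the empirical version. Per the abstract, the maximizing witness can be taken to be a single ridge spline, schematically $f^\star(x) = c\,(w^\top x - b)_+^{k}$ with $\|w\|_2 = 1$, up to sign, normalization, and low-degree polynomial terms.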
+
+
+
+
+ + ♻ ☆ ColD Fusion: Collaborative Descent for Distributed Multitask Finetuning ACL 23 + + +
+ We propose a new paradigm to continually evolve pretrained models, denoted ColD Fusion. It provides the benefits of multitask learning but leverages distributed computation with limited communication and eliminates the need for shared data. Consequently, ColD Fusion can give rise to a synergistic loop, where finetuned models can be recycled to continually improve the pretrained model they are based upon. We show that ColD Fusion yields comparable benefits to multitask training by producing a model that (a) attains strong performance on all of the datasets it was trained on; and (b) is a better starting point for finetuning on unseen datasets. We show that ColD Fusion outperforms RoBERTa and even previous multitask models. Specifically, when training and testing on 35 diverse datasets, the ColD Fusion-based model outperforms RoBERTa by 2.33 points on average without any changes to the architecture.
+
+ comment: ACL 23 +
+
+
+
+
+ + ♻ ☆ Deep Visual-Genetic Biometrics for Taxonomic Classification of Rare + Species + + +
+ Visual as well as genetic biometrics are routinely employed to identify +species and individuals in biological applications. However, no attempts have +been made in this domain to computationally enhance visual classification of +rare classes with little image data via genetics. In this paper, we thus +propose aligned visual-genetic inference spaces with the aim to implicitly +encode cross-domain associations for improved performance. We demonstrate for +the first time that such alignment can be achieved via deep embedding models +and that the approach is directly applicable to boosting long-tailed +recognition (LTR) particularly for rare species. We experimentally demonstrate +the efficacy of the concept via application to microscopic imagery of 30k+ +planktic foraminifer shells across 32 species when used together with +independent genetic data samples. Most importantly for practitioners, we show +that visual-genetic alignment can significantly benefit visual-only recognition +of the rarest species. Technically, we pre-train a visual ResNet50 deep +learning model using triplet loss formulations to create an initial embedding +space. We re-structure this space based on genetic anchors embedded via a +Sequence Graph Transform (SGT) and linked to visual data by cross-domain cosine +alignment. We show that an LTR approach improves the state-of-the-art across +all benchmarks and that adding our visual-genetic alignment improves per-class +and particularly rare tail class benchmarks significantly further. We conclude +that visual-genetic alignment can be a highly effective tool for complementing +visual biological data containing rare classes. The concept proposed may serve +as an important future tool for integrating genetics and imageomics towards a +more complete scientific representation of taxonomic spaces and life itself. +Code, weights, and data splits are published for full reproducibility. + +
+
+
+
+
+ + ♻ ☆ A Spectral Analysis of Graph Neural Networks on Dense and Sparse Graphs ICASSP 2024 + + +
+ In this work we propose a random graph model that can produce graphs at +different levels of sparsity. We analyze how sparsity affects the graph +spectra, and thus the performance of graph neural networks (GNNs) in node +classification on dense and sparse graphs. We compare GNNs with spectral +methods known to provide consistent estimators for community detection on dense +graphs, a closely related task. We show that GNNs can outperform spectral +methods on sparse graphs, and illustrate these results with numerical examples +on both synthetic and real graphs. + +
+
+ comment: Extended version of ICASSP 2024 submission +
+
+
+
+
+ + ♻ ☆ Online Submodular Maximization via Online Convex Optimization + + +
+ We study monotone submodular maximization under general matroid constraints +in the online setting. We prove that online optimization of a large class of +submodular functions, namely, weighted threshold potential functions, reduces +to online convex optimization (OCO). This is precisely because functions in +this class admit a concave relaxation; as a result, OCO policies, coupled with +an appropriate rounding scheme, can be used to achieve sublinear regret in the +combinatorial setting. We show that our reduction extends to many different +versions of the online learning problem, including the dynamic regret, bandit, +and optimistic-learning settings. + +
+
+ comment: Under review +
+
+
+
+
+ + ♻ ☆ Fixed points of nonnegative neural networks + + +
+ We use fixed point theory to analyze nonnegative neural networks, which we +define as neural networks that map nonnegative vectors to nonnegative vectors. +We first show that nonnegative neural networks with nonnegative weights and +biases can be recognized as monotonic and (weakly) scalable functions within +the framework of nonlinear Perron-Frobenius theory. This fact enables us to +provide conditions for the existence of fixed points of nonnegative neural +networks having inputs and outputs of the same dimension, and these conditions +are weaker than those recently obtained using arguments in convex analysis. +Furthermore, we prove that the shape of the fixed point set of nonnegative +neural networks with nonnegative weights and biases is an interval, which under +mild conditions degenerates to a point. These results are then used to obtain +the existence of fixed points of more general nonnegative neural networks. From +a practical perspective, our results contribute to the understanding of the +behavior of autoencoders, and the main theoretical results are verified in +numerical simulations using the Modified National Institute of Standards and +Technology (MNIST) dataset. + +
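A toy numerical illustration of the setting (our own construction, not the paper's experiments): with nonnegative weights and biases and a ReLU activation, the network maps the nonnegative orthant into itself, and when the map happens to be a contraction there, plain fixed-point iteration converges.

import numpy as np

rng = np.random.default_rng(0)
n = 5
W = 0.1 * rng.random((n, n))      # nonnegative weights, scaled so the map is contractive
b = rng.random(n)                 # nonnegative bias

def f(x):
    # Nonnegative network: maps nonnegative vectors to nonnegative vectors.
    return np.maximum(W @ x + b, 0.0)

x = np.zeros(n)
for _ in range(200):
    x = f(x)

print("fixed point:", x)
print("residual:", np.linalg.norm(f(x) - x))   # ~0 at a fixed point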
+
+ comment: 36 pages +
+
+
+
+
+ + ♻ ☆ Generalization error bounds for iterative learning algorithms with + bounded updates + + +
+ This paper explores the generalization characteristics of iterative learning +algorithms with bounded updates for non-convex loss functions, employing +information-theoretic techniques. Our key contribution is a novel bound for the +generalization error of these algorithms with bounded updates, extending beyond +the scope of previous works that only focused on Stochastic Gradient Descent +(SGD). Our approach introduces two main novelties: 1) we reformulate the mutual +information as the uncertainty of updates, providing a new perspective, and 2) +instead of using the chaining rule of mutual information, we employ a variance +decomposition technique to decompose information across iterations, allowing +for a simpler surrogate process. We analyze our generalization bound under +various settings and demonstrate improved bounds when the model dimension +increases at the same rate as the number of training data samples. To bridge +the gap between theory and practice, we also examine the previously observed +scaling behavior in large language models. Ultimately, our work takes a further +step for developing practical generalization theories. + +
+
+
+
+
+ + ♻ ☆ An Empirical Evaluation of Temporal Graph Benchmark + + +
+ In this paper, we conduct an empirical evaluation of Temporal Graph Benchmark +(TGB) by extending our Dynamic Graph Library (DyGLib) to TGB. Compared with +TGB, we include eleven popular dynamic graph learning methods for more +exhaustive comparisons. Through the experiments, we find that (1) different +models depict varying performance across various datasets, which is in line +with previous observations; (2) the performance of some baselines can be +significantly improved over the reported results in TGB when using DyGLib. This +work aims to ease the researchers' efforts in evaluating various dynamic graph +learning methods on TGB and attempts to offer results that can be directly +referenced in the follow-up research. All the used resources in this project +are publicly available at https://github.com/yule-BUAA/DyGLib_TGB. This work is +in progress, and feedback from the community is welcomed for improvements. + +
+
+ comment: in progress, more results are added +
+
+
+
+
+ + ♻ ☆ Learning Horn Envelopes via Queries from Large Language Models + + +
+ We investigate an approach for extracting knowledge from trained neural +networks based on Angluin's exact learning model with membership and +equivalence queries to an oracle. In this approach, the oracle is a trained +neural network. We consider Angluin's classical algorithm for learning Horn +theories and study the necessary changes to make it applicable to learn from +neural networks. In particular, we have to consider that trained neural +networks may not behave as Horn oracles, meaning that their underlying target +theory may not be Horn. We propose a new algorithm that aims at extracting the +"tightest Horn approximation" of the target theory and that is guaranteed to +terminate in exponential time (in the worst case) and in polynomial time if the +target has polynomially many non-Horn examples. To showcase the applicability +of the approach, we perform experiments on pre-trained language models and +extract rules that expose occupation-based gender biases. + +
+
+ comment: 35 pages, 2 figures; manuscript accepted for publication in the + International Journal of Approximate Reasoning (IJAR) +
+
+
+
+
+ + ♻ ☆ CoMoSpeech: One-Step Speech and Singing Voice Synthesis via Consistency + Model ACM MM 2023 + + +
+ Denoising diffusion probabilistic models (DDPMs) have shown promising performance for speech synthesis. However, a large number of iterative steps are required to achieve high sample quality, which restricts the inference speed. Maintaining sample quality while increasing sampling speed has become a challenging task. In this paper, we propose a "Co"nsistency "Mo"del-based "Speech" synthesis method, CoMoSpeech, which achieves speech synthesis through a single diffusion sampling step while maintaining high audio quality. The consistency constraint is applied to distill a consistency model from a well-designed diffusion-based teacher model, which ultimately yields superior performance in the distilled CoMoSpeech. Our experiments show that, by generating audio with a single sampling step, CoMoSpeech achieves an inference speed more than 150 times faster than real time on a single NVIDIA A100 GPU, which is comparable to FastSpeech2, making diffusion-sampling-based speech synthesis truly practical. Meanwhile, objective and subjective evaluations on text-to-speech and singing voice synthesis show that the proposed teacher models yield the best audio quality, and the one-step-sampling-based CoMoSpeech achieves the best inference speed with better or comparable audio quality to other conventional multi-step diffusion model baselines. Audio samples are available at https://comospeech.github.io/.
+
+ comment: Accepted to ACM MM 2023 +
+
+
+
+
+ + ♻ ☆ Interactive Hyperparameter Optimization in Multi-Objective Problems via + Preference Learning + + +
+ Hyperparameter optimization (HPO) is important to leverage the full potential of machine learning (ML). In practice, users are often interested in multi-objective (MO) problems, i.e., optimizing potentially conflicting objectives, like accuracy and energy consumption. To tackle this, the vast majority of MO-ML algorithms return a Pareto front of non-dominated machine learning models to the user. Optimizing the hyperparameters of such algorithms is non-trivial, as evaluating a hyperparameter configuration entails evaluating the quality of the resulting Pareto front. In the literature, there are known indicators that assess the quality of a Pareto front (e.g., hypervolume, R2) by quantifying different properties (e.g., volume, proximity to a reference point). However, choosing the indicator that leads to the desired Pareto front might be a hard task for a user. In this paper, we propose a human-centered interactive HPO approach tailored towards multi-objective ML that leverages preference learning to extract desiderata from users to guide the optimization. Instead of relying on the user guessing the most suitable indicator for their needs, our approach automatically learns an appropriate indicator. Concretely, we leverage pairwise comparisons of distinct Pareto fronts to learn such an appropriate quality indicator. Then, we optimize the hyperparameters of the underlying MO-ML algorithm towards this learned indicator using a state-of-the-art HPO approach. In an experimental study targeting the environmental impact of ML, we demonstrate that our approach leads to substantially better Pareto fronts compared to optimizing based on a wrong indicator pre-selected by the user, and performs comparably in the case of an advanced user who knows which indicator to pick.
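For concreteness, the hypervolume indicator mentioned above can be computed for a two-objective minimization front as follows; this is a generic textbook computation rather than anything specific to the proposed approach.

def hypervolume_2d(front, ref):
    """Hypervolume dominated by a 2-D Pareto front (both objectives minimized)
    with respect to a reference point `ref` that is worse in both objectives."""
    pts = sorted(front)                      # sort by first objective, ascending
    hv, prev_f2 = 0.0, ref[1]
    for f1, f2 in pts:
        if f2 < prev_f2:                     # only non-dominated slabs contribute
            hv += (ref[0] - f1) * (prev_f2 - f2)
            prev_f2 = f2
    return hv

front = [(0.1, 0.9), (0.3, 0.5), (0.7, 0.2)]  # e.g., (error, energy) per model
print(hypervolume_2d(front, ref=(1.0, 1.0)))  # 0.46 for this toy front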
+
+
+
+
+ + ♻ ☆ Can you text what is happening? Integrating pre-trained language + encoders into trajectory prediction models for autonomous driving + + +
+ In autonomous driving tasks, scene understanding is the first step towards +predicting the future behavior of the surrounding traffic participants. Yet, +how to represent a given scene and extract its features are still open research +questions. In this study, we propose a novel text-based representation of +traffic scenes and process it with a pre-trained language encoder. + First, we show that text-based representations, combined with classical +rasterized image representations, lead to descriptive scene embeddings. Second, +we benchmark our predictions on the nuScenes dataset and show significant +improvements compared to baselines. Third, we show in an ablation study that a +joint encoder of text and rasterized images outperforms the individual encoders +confirming that both representations have their complementary strengths. + +
+
+
+
+
+ + ♻ ☆ RL4CO: an Extensive Reinforcement Learning for Combinatorial + Optimization Benchmark + + +
+ We introduce RL4CO, an extensive reinforcement learning (RL) for +combinatorial optimization (CO) benchmark. RL4CO employs state-of-the-art +software libraries as well as best practices in implementation, such as +modularity and configuration management, to be efficient and easily modifiable +by researchers for adaptations of neural network architecture, environments, +and RL algorithms. Contrary to the existing focus on specific tasks like the +traveling salesman problem (TSP) for performance assessment, we underline the +importance of scalability and generalization capabilities for diverse CO tasks. +We also systematically benchmark zero-shot generalization, sample efficiency, +and adaptability to changes in data distributions of various models. Our +experiments show that some recent SOTA methods fall behind their predecessors +when evaluated using these metrics, suggesting the necessity for a more +balanced view of the performance of neural CO (NCO) solvers. We hope RL4CO will +encourage the exploration of novel solutions to complex real-world tasks, +allowing the NCO community to compare with existing methods through a +standardized interface that decouples the science from software engineering. We +make our library publicly available at https://github.com/kaist-silab/rl4co. + +
+
+ comment: Added several improvements to the writing; added search methods; new results
+
+
+
+
+ + ♻ ☆ Neural Vortex Method: from Finite Lagrangian Particles to Infinite + Dimensional Eulerian Dynamics + + +
+ In the field of fluid numerical analysis, there has been a long-standing problem: the lack of a rigorous mathematical tool to map from a continuous flow field to discrete vortex particles, which prevents the Lagrangian particles from inheriting the high resolution of a large-scale Eulerian solver. To tackle this challenge, we propose a novel learning-based framework, the Neural Vortex Method (NVM), which builds a neural-network description of the Lagrangian vortex structures and their interaction dynamics to reconstruct the high-resolution Eulerian flow field in a physically precise manner. The key components of our infrastructure consist of two networks: a vortex representation network to identify the Lagrangian vortices from a grid-based velocity field and a vortex interaction network to learn the underlying governing dynamics of these finite structures. By embedding these two networks with a vorticity-to-velocity Poisson solver and training their parameters using the high-fidelity data obtained from high-resolution direct numerical simulation, we can predict the accurate fluid dynamics at a precision level that was infeasible for all previous conventional vortex methods (CVMs). To the best of our knowledge, our method is the first approach that can utilize motions of finite particles to learn infinite-dimensional dynamic systems. We demonstrate the efficacy of our method in generating highly accurate predictions, with low computational cost, for the leapfrogging vortex rings system, the turbulence system, and systems governed by Euler equations with different external forces.
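The vorticity-to-velocity Poisson solve referred to above is a standard building block; on a periodic 2-D grid it can be done with FFTs as in the generic sketch below (independent of the paper's networks; grid size and domain length are arbitrary).

import numpy as np

def velocity_from_vorticity(omega, L=2 * np.pi):
    """Periodic 2-D domain: solve grad^2 psi = -omega, then u = d(psi)/dy, v = -d(psi)/dx."""
    n = omega.shape[0]
    k = np.fft.fftfreq(n, d=L / n) * 2 * np.pi
    kx, ky = np.meshgrid(k, k, indexing="ij")
    k2 = kx**2 + ky**2
    k2[0, 0] = 1.0                            # avoid division by zero; mean mode fixed below
    omega_hat = np.fft.fft2(omega)
    psi_hat = omega_hat / k2
    psi_hat[0, 0] = 0.0                       # zero-mean stream function
    u = np.real(np.fft.ifft2(1j * ky * psi_hat))    # u =  d(psi)/dy
    v = np.real(np.fft.ifft2(-1j * kx * psi_hat))   # v = -d(psi)/dx
    return u, v

x = np.linspace(0, 2 * np.pi, 64, endpoint=False)
X, Y = np.meshgrid(x, x, indexing="ij")
u, v = velocity_from_vorticity(np.sin(X) * np.sin(Y))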
+
+
+
+
+ + ♻ ☆ Improved Prognostic Prediction of Pancreatic Cancer Using Multi-Phase CT + by Integrating Neural Distance and Texture-Aware Transformer MICCAI 2023 + + +
+ Pancreatic ductal adenocarcinoma (PDAC) is a highly lethal cancer in which +the tumor-vascular involvement greatly affects the resectability and, thus, +overall survival of patients. However, current prognostic prediction methods +fail to explicitly and accurately investigate relationships between the tumor +and nearby important vessels. This paper proposes a novel learnable neural +distance that describes the precise relationship between the tumor and vessels +in CT images of different patients, adopting it as a major feature for +prognosis prediction. Besides, different from existing models that used CNNs or +LSTMs to exploit tumor enhancement patterns on dynamic contrast-enhanced CT +imaging, we improved the extraction of dynamic tumor-related texture features +in multi-phase contrast-enhanced CT by fusing local and global features using +CNN and transformer modules, further enhancing the features extracted across +multi-phase CT images. We extensively evaluated and compared the proposed +method with existing methods in the multi-center (n=4) dataset with 1,070 +patients with PDAC, and statistical analysis confirmed its clinical +effectiveness in the external test set consisting of three centers. The +developed risk marker was the strongest predictor of overall survival among +preoperative factors and it has the potential to be combined with established +clinical factors to select patients at higher risk who might benefit from +neoadjuvant therapy. + +
+
+ comment: MICCAI 2023 +
+
+
+
+
+ + ♻ ☆ Knockoffs-SPR: Clean Sample Selection in Learning with Noisy Labels + + +
+ A noisy training set usually leads to the degradation of the generalization and robustness of neural networks. In this paper, we propose a novel theoretically guaranteed clean sample selection framework for learning with noisy labels. Specifically, we first present a Scalable Penalized Regression (SPR) method to model the linear relation between network features and one-hot labels. In SPR, the clean data are identified by the zero mean-shift parameters solved in the regression model. We theoretically show that SPR can recover clean data under some conditions. In general scenarios, the conditions may no longer be satisfied, and some noisy data are falsely selected as clean data. To solve this problem, we propose a data-adaptive method for Scalable Penalized Regression with Knockoff filters (Knockoffs-SPR), which provably controls the False-Selection-Rate (FSR) of the selected clean data. To improve efficiency, we further present a split algorithm that divides the whole training set into small pieces that can be solved in parallel, making the framework scalable to large datasets. While Knockoffs-SPR can be regarded as a sample selection module for a standard supervised training pipeline, we further combine it with a semi-supervised algorithm to exploit the support of noisy data as unlabeled data. Experimental results on several benchmark datasets and real-world noisy datasets show the effectiveness of our framework and validate the theoretical results of Knockoffs-SPR. Our code and pre-trained models are available at https://github.com/Yikai-Wang/Knockoffs-SPR.
+
+ comment: update: refined theory and analysis, release code +
+
+
+
+
+ + ♻ ☆ Decentralized Federated Learning: Fundamentals, State of the Art, + Frameworks, Trends, and Challenges + + +
+ In recent years, Federated Learning (FL) has gained relevance in training +collaborative models without sharing sensitive data. Since its birth, +Centralized FL (CFL) has been the most common approach in the literature, where +a central entity creates a global model. However, a centralized approach leads +to increased latency due to bottlenecks, heightened vulnerability to system +failures, and trustworthiness concerns affecting the entity responsible for the +global model creation. Decentralized Federated Learning (DFL) emerged to +address these concerns by promoting decentralized model aggregation and +minimizing reliance on centralized architectures. However, despite the work +done in DFL, the literature has not (i) studied the main aspects +differentiating DFL and CFL; (ii) analyzed DFL frameworks to create and +evaluate new solutions; and (iii) reviewed application scenarios using DFL. +Thus, this article identifies and analyzes the main fundamentals of DFL in +terms of federation architectures, topologies, communication mechanisms, +security approaches, and key performance indicators. Additionally, the paper at +hand explores existing mechanisms to optimize critical DFL fundamentals. Then, +the most relevant features of the current DFL frameworks are reviewed and +compared. After that, it analyzes the most used DFL application scenarios, +identifying solutions based on the fundamentals and frameworks previously +defined. Finally, the evolution of existing DFL solutions is studied to provide +a list of trends, lessons learned, and open challenges. + +
+
+
+
+
+ + ♻ ☆ Physics-Informed Neural Networks for an optimal counterdiabatic quantum + computation + + +
+ We introduce a novel methodology that leverages the strength of Physics-Informed Neural Networks (PINNs) to address the counterdiabatic (CD) protocol in the optimization of quantum circuits comprised of systems with $N_{Q}$ qubits. The primary objective is to utilize physics-inspired deep learning techniques to accurately solve the time evolution of the different physical observables within the quantum system. To accomplish this objective, we embed the necessary physical information into an underlying neural network to effectively tackle the problem. In particular, we impose the hermiticity condition on all physical observables and make use of the principle of least action, guaranteeing the acquisition of the most appropriate counterdiabatic terms based on the underlying physics. The proposed approach offers a dependable alternative to address the CD driving problem, free from the constraints typically encountered in previous methodologies relying on classical numerical approximations. Our method provides a general framework to obtain optimal results from the physical observables relevant to the problem, including the external parameterization in time known as the scheduling function, the gauge potential or operator involving the non-adiabatic terms, as well as the temporal evolution of the energy levels of the system, among others. The main applications of this methodology have been the $\mathrm{H_{2}}$ and $\mathrm{LiH}$ molecules, represented by 2-qubit and 4-qubit systems, respectively, employing the STO-3G basis. The presented results demonstrate the successful derivation of a desirable decomposition for the non-adiabatic terms, achieved through a linear combination utilizing Pauli operators. This attribute confers significant advantages to its practical implementation within quantum computing algorithms.
+
+ comment: 28 pages, 10 figures, 1 algorithm, 1 table +
+
+
+
+
+ + ♻ ☆ Anisotropic Diffusion Stencils: From Simple Derivations over Stability + Estimates to ResNet Implementations + + +
+ Anisotropic diffusion processes with a diffusion tensor are important in +image analysis, physics, and engineering. However, their numerical +approximation has a strong impact on dissipative artefacts and deviations from +rotation invariance. In this work, we study a large family of finite difference +discretisations on a 3 x 3 stencil. We derive it by splitting 2-D anisotropic +diffusion into four 1-D diffusions. The resulting stencil class involves one +free parameter and covers a wide range of existing discretisations. It +comprises the full stencil family of Weickert et al. (2013) and shows that +their two parameters contain redundancy. Furthermore, we establish a bound on +the spectral norm of the matrix corresponding to the stencil. This gives time +step size limits that guarantee stability of an explicit scheme in the +Euclidean norm. Our directional splitting also allows a very natural +translation of the explicit scheme into ResNet blocks. Employing neural network +libraries enables simple and highly efficient parallel implementations on GPUs. + +
+
+
+
+
+ + ♻ ☆ Learning a Universal Human Prior for Dexterous Manipulation from Human + Preference + + +
+ Generating human-like behavior on robots is a great challenge, especially in dexterous manipulation tasks with robotic hands. Scripting policies from scratch is intractable due to the high-dimensional control space, and training policies with reinforcement learning (RL) and manual reward engineering can also be hard and lead to unnatural motions. Leveraging the recent progress on RL from Human Feedback, we propose a framework that learns a universal human prior using direct human preference feedback over videos, for efficiently tuning the RL policies on 20 dual-hand robot manipulation tasks in simulation, without a single human demonstration. A task-agnostic reward model is trained through iteratively generating diverse policies and collecting human preferences over the trajectories; it is then applied to regularize the behavior of policies in the fine-tuning stage. Our method empirically demonstrates more human-like behaviors on robot hands in diverse tasks, including even unseen tasks, indicating its generalization capability.
+
+
+
+
+ + ♻ ☆ Distilling Cognitive Backdoor Patterns within an Image ICLR2023 + + +
+ This paper proposes a simple method to distill and detect backdoor patterns +within an image: \emph{Cognitive Distillation} (CD). The idea is to extract the +"minimal essence" from an input image responsible for the model's prediction. +CD optimizes an input mask to extract a small pattern from the input image that +can lead to the same model output (i.e., logits or deep features). The +extracted pattern can help understand the cognitive mechanism of a model on +clean vs. backdoor images and is thus called a \emph{Cognitive Pattern} (CP). +Using CD and the distilled CPs, we uncover an interesting phenomenon of +backdoor attacks: despite the various forms and sizes of trigger patterns used +by different attacks, the CPs of backdoor samples are all surprisingly and +suspiciously small. One thus can leverage the learned mask to detect and remove +backdoor examples from poisoned training datasets. We conduct extensive +experiments to show that CD can robustly detect a wide range of advanced +backdoor attacks. We also show that CD can potentially be applied to help +detect potential biases from face datasets. Code is available at +\url{https://github.com/HanxunH/CognitiveDistillation}. + +
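A minimal sketch of the mask-optimization idea as we read it from the abstract (the authors' released implementation at the linked repository may differ): optimize a per-pixel mask so that the masked input reproduces the model's original logits while the mask stays small.

import torch
import torch.nn.functional as F

def distill_pattern(model, x, steps=200, lam=1e-2, lr=0.1):
    """x: (1, C, H, W) input; returns a mask in [0, 1] highlighting the decisive pattern."""
    model.eval()
    with torch.no_grad():
        target_logits = model(x)
    mask_param = torch.zeros(1, 1, *x.shape[2:], requires_grad=True)
    opt = torch.optim.Adam([mask_param], lr=lr)
    for _ in range(steps):
        mask = torch.sigmoid(mask_param)
        out = model(x * mask)                                            # keep only the masked content
        loss = F.mse_loss(out, target_logits) + lam * mask.abs().mean()  # match logits, keep mask small
        opt.zero_grad()
        loss.backward()
        opt.step()
    return torch.sigmoid(mask_param).detach()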
+
+ comment: ICLR2023 +
+
+
+
+
+ + ♻ ☆ A Worker-Task Specialization Model for Crowdsourcing: Efficient + Inference and Fundamental Limits + + +
+ Crowdsourcing systems have emerged as an effective platform for labeling data with relatively low cost by using non-expert workers. Inferring correct labels from multiple noisy answers on data, however, has been a challenging problem, since the quality of the answers varies widely across tasks and workers. Many existing works have assumed that there is a fixed ordering of workers in terms of their skill levels, and focused on estimating worker skills to aggregate the answers from workers with different weights. In practice, however, worker skill changes widely across tasks, especially when the tasks are heterogeneous. In this paper, we consider a new model, called the $d$-type specialization model, in which each task and worker has its own (unknown) type and the reliability of each worker can vary with the type of the given task and that of the worker. We allow the number $d$ of types to scale with the number of tasks. In this model, we characterize the optimal sample complexity to correctly infer the labels within any given accuracy, and propose label inference algorithms achieving the order-wise optimal limit even when the types of tasks or those of workers are unknown. We conduct experiments on both synthetic and real datasets, and show that our algorithm outperforms the existing algorithms developed based on stricter model assumptions.
+
+ comment: To appear at IEEE Transactions on Information Theory +
+
+
+
+
+ + ♻ ☆ Few-shot Personalized Saliency Prediction Based on Inter-personnel Gaze + Patterns + + +
+ This paper presents few-shot personalized saliency prediction based on inter-personnel gaze patterns. In contrast to a general saliency map, a personalized saliency map (PSM) has great potential since it indicates person-specific visual attention, which is useful for obtaining individual visual preferences from the heterogeneity of gazed areas. PSM prediction is needed to acquire the PSM for an unseen image, but it remains a challenging task due to the complexity of individual gaze patterns. Although eye-tracking data obtained from each person are necessary to construct PSMs that model individual gaze patterns over various images, it is difficult to acquire such data in massive amounts. Here, one solution for efficient PSM prediction from a limited amount of data is the effective use of eye-tracking data obtained from other persons. In this paper, to effectively exploit the PSMs of other persons, we focus on the selection of images for acquiring eye-tracking data and on preserving the structural information of other persons' PSMs. Experimental results confirm that these two focuses are effective for PSM prediction with a limited amount of eye-tracking data.
+
+ comment: 5pages, 3 figures +
+
+
+
+
+ + ♻ ☆ Nearest Neighbor Sampling of Point Sets using Rays + + +
+ We propose a new framework for the sampling, compression, and analysis of +distributions of point sets and other geometric objects embedded in Euclidean +spaces. Our approach involves constructing a tensor called the RaySense sketch, +which captures nearest neighbors from the underlying geometry of points along a +set of rays. We explore various operations that can be performed on the +RaySense sketch, leading to different properties and potential applications. +Statistical information about the data set can be extracted from the sketch, +independent of the ray set. Line integrals on point sets can be efficiently +computed using the sketch. We also present several examples illustrating +applications of the proposed strategy in practical scenarios. + +
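A rough sketch of the ray-based nearest-neighbor sampling described above (our simplification; the ray generation and sample spacing here are arbitrary choices):

import numpy as np
from scipy.spatial import cKDTree

rng = np.random.default_rng(0)
points = rng.random((1000, 3))                 # the point set to be sketched
tree = cKDTree(points)

def raysense_sketch(origins, directions, n_samples=16, t_max=1.0):
    """For each ray, record the nearest point-set neighbor of equispaced samples along it."""
    t = np.linspace(0.0, t_max, n_samples)
    samples = origins[:, None, :] + t[None, :, None] * directions[:, None, :]
    _, idx = tree.query(samples.reshape(-1, 3))
    return points[idx].reshape(len(origins), n_samples, 3)   # (rays, samples, dim) tensor

origins = rng.random((32, 3))
directions = rng.standard_normal((32, 3))
directions /= np.linalg.norm(directions, axis=1, keepdims=True)
sketch = raysense_sketch(origins, directions)
print(sketch.shape)   # (32, 16, 3)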
+
+ comment: 48 pages, 14 figures, accepted to Communication on Applied + Mathematics and Computation (CAMC), Focused Issue in Honor of Prof. Stanley + Osher on the Occasion of His 80th Birthday. Fixed typos and improved + notations +
+
+
+
+
+ + ♻ ☆ Your Diffusion Model is Secretly a Zero-Shot Classifier ICCV 2023 + + +
+ The recent wave of large-scale text-to-image diffusion models has +dramatically increased our text-based image generation abilities. These models +can generate realistic images for a staggering variety of prompts and exhibit +impressive compositional generalization abilities. Almost all use cases thus +far have solely focused on sampling; however, diffusion models can also provide +conditional density estimates, which are useful for tasks beyond image +generation. In this paper, we show that the density estimates from large-scale +text-to-image diffusion models like Stable Diffusion can be leveraged to +perform zero-shot classification without any additional training. Our +generative approach to classification, which we call Diffusion Classifier, +attains strong results on a variety of benchmarks and outperforms alternative +methods of extracting knowledge from diffusion models. Although a gap remains +between generative and discriminative approaches on zero-shot recognition +tasks, our diffusion-based approach has significantly stronger multimodal +compositional reasoning ability than competing discriminative approaches. +Finally, we use Diffusion Classifier to extract standard classifiers from +class-conditional diffusion models trained on ImageNet. Our models achieve +strong classification performance using only weak augmentations and exhibit +qualitatively better "effective robustness" to distribution shift. Overall, our +results are a step toward using generative over discriminative models for +downstream tasks. Results and visualizations at +https://diffusion-classifier.github.io/ + +
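Schematically, the zero-shot rule described above scores each candidate label by how well the conditional diffusion model denoises the image under that label; in the sketch below, eps_model is a placeholder for a class-conditional noise predictor and the noise-schedule handling is simplified.

import torch

@torch.no_grad()
def diffusion_classify(eps_model, x0, class_ids, alphas_bar, n_trials=32):
    """Pick the label whose conditioning gives the lowest average noise-prediction error."""
    errs = []
    for c in class_ids:
        total = 0.0
        for _ in range(n_trials):
            t = torch.randint(0, len(alphas_bar), (1,))
            a = alphas_bar[t]
            noise = torch.randn_like(x0)
            x_t = a.sqrt() * x0 + (1 - a).sqrt() * noise        # forward diffusion sample
            total += ((eps_model(x_t, t, c) - noise) ** 2).mean().item()
        errs.append(total / n_trials)
    return class_ids[int(torch.tensor(errs).argmin())]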
+
+ comment: In ICCV 2023. Website at https://diffusion-classifier.github.io/ +
+
+
+
+
+ + ♻ ☆ Selection of contributing factors for predicting landslide + susceptibility using machine learning and deep learning models + + +
+ Landslides are a common natural disaster that can cause casualties, property safety threats and economic losses. Therefore, it is important to understand or predict the probability of landslide occurrence at potentially risky sites. A commonly used means is to carry out a landslide susceptibility assessment based on a landslide inventory and a set of landslide contributing factors. This can be readily achieved using machine learning (ML) models such as logistic regression (LR), support vector machine (SVM), random forest (RF), extreme gradient boosting (Xgboost), or deep learning (DL) models such as convolutional neural network (CNN) and long short-term memory (LSTM). As the input data for these models, landslide contributing factors have varying influences on landslide occurrence. Therefore, it is logically feasible to select more important contributing factors and eliminate less relevant ones, with the aim of increasing the prediction accuracy of these models. However, selecting more important factors is still a challenging task and there is no generally accepted method. Furthermore, the effects of factor selection using various methods on the prediction accuracy of ML and DL models are unclear. In this study, the impact of the selection of contributing factors on the accuracy of landslide susceptibility predictions using ML and DL models was investigated. Five methods for selecting contributing factors were considered for all the aforementioned ML and DL models: Information Gain Ratio (IGR), Recursive Feature Elimination (RFE), Particle Swarm Optimization (PSO), Least Absolute Shrinkage and Selection Operator (LASSO), and Harris Hawk Optimization (HHO). In addition, autoencoder-based factor selection methods for DL models were also investigated. To assess their performances, an exhaustive approach was adopted,...
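As one concrete example of the factor-selection step discussed above, recursive feature elimination is available off the shelf in scikit-learn; the synthetic data below stands in for a table of contributing factors (slope, rainfall, lithology, etc.) with a binary landslide label.

import numpy as np
from sklearn.datasets import make_classification
from sklearn.feature_selection import RFE
from sklearn.linear_model import LogisticRegression

# Stand-in dataset: rows are sites, columns are contributing factors.
X, y = make_classification(n_samples=500, n_features=12, n_informative=5, random_state=0)

selector = RFE(LogisticRegression(max_iter=1000), n_features_to_select=5)
selector.fit(X, y)
print("selected factor indices:", np.where(selector.support_)[0])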
+
+ comment: Stochastic Environmental Research and Risk Assessment +
+
+
+
+
+ + ♻ ☆ RIFLE: Imputation and Robust Inference from Low Order Marginals + + +
+ The ubiquity of missing values in real-world datasets poses a challenge for +statistical inference and can prevent similar datasets from being analyzed in +the same study, precluding many existing datasets from being used for new +analyses. While an extensive collection of packages and algorithms have been +developed for data imputation, the overwhelming majority perform poorly if +there are many missing values and low sample sizes, which are unfortunately +common characteristics in empirical data. Such low-accuracy estimations +adversely affect the performance of downstream statistical models. We develop a +statistical inference framework for regression and classification in the +presence of missing data without imputation. Our framework, RIFLE (Robust +InFerence via Low-order moment Estimations), estimates low-order moments of the +underlying data distribution with corresponding confidence intervals to learn a +distributionally robust model. We specialize our framework to linear regression +and normal discriminant analysis, and we provide convergence and performance +guarantees. This framework can also be adapted to impute missing data. In +numerical experiments, we compare RIFLE to several state-of-the-art approaches +(including MICE, Amelia, MissForest, KNN-imputer, MIDA, and Mean Imputer) for +imputation and inference in the presence of missing values. Our experiments +demonstrate that RIFLE outperforms other benchmark algorithms when the +percentage of missing values is high and/or when the number of data points is +relatively small. RIFLE is publicly available at +https://github.com/optimization-for-data-driven-science/RIFLE. + +
+
+ comment: 36 pages, 11 figures +
+
+
+
+
+ + ♻ ☆ Unifying over-smoothing and over-squashing in graph neural networks: A + physics informed approach and beyond + + +
+ Graph Neural Networks (GNNs) have emerged as one of the leading approaches for machine learning on graph-structured data. Despite their great success, critical computational challenges such as over-smoothing, over-squashing, and limited expressive power continue to impact the performance of GNNs. In this study, inspired by the time-reversal principle commonly utilized in classical and quantum physics, we reverse the time direction of the graph heat equation. The resulting reversal process yields a class of high-pass filtering functions that enhance the sharpness of graph node features. Leveraging this concept, we introduce the Multi-Scaled Heat Kernel based GNN (MHKG) by amalgamating diverse filtering functions' effects on node features. To explore more flexible filtering conditions, we further generalize MHKG into a model termed G-MHKG and thoroughly show the roles of each element in controlling over-smoothing, over-squashing and expressive power. Notably, we illustrate that all the aforementioned issues can be characterized and analyzed via the properties of the filtering functions, and uncover a trade-off between over-smoothing and over-squashing: enhancing node feature sharpness makes the model suffer more from over-squashing, and vice versa. Furthermore, we manipulate the time again to show how G-MHKG can handle both issues under mild conditions. Our conclusive experiments highlight the effectiveness of the proposed models: they surpass several GNN baseline models in performance across graph datasets characterized by both homophily and heterophily.
+
+
+
+
+ + ♻ ☆ EmbodiedGPT: Vision-Language Pre-Training via Embodied Chain of Thought + + +
+ Embodied AI is a crucial frontier in robotics, capable of planning and +executing action sequences for robots to accomplish long-horizon tasks in +physical environments. In this work, we introduce EmbodiedGPT, an end-to-end +multi-modal foundation model for embodied AI, empowering embodied agents with +multi-modal understanding and execution capabilities. To achieve this, we have +made the following efforts: (i) We craft a large-scale embodied planning +dataset, termed EgoCOT. The dataset consists of carefully selected videos from +the Ego4D dataset, along with corresponding high-quality language instructions. +Specifically, we generate a sequence of sub-goals with the "Chain of Thoughts" +mode for effective embodied planning. (ii) We introduce an efficient training +approach to EmbodiedGPT for high-quality plan generation, by adapting a 7B +large language model (LLM) to the EgoCOT dataset via prefix tuning. (iii) We +introduce a paradigm for extracting task-related features from LLM-generated +planning queries to form a closed loop between high-level planning and +low-level control. Extensive experiments show the effectiveness of EmbodiedGPT +on embodied tasks, including embodied planning, embodied control, visual +captioning, and visual question answering. Notably, EmbodiedGPT significantly +enhances the success rate of the embodied control task by extracting more +effective features. It has achieved a remarkable 1.6 times increase in success +rate on the Franka Kitchen benchmark and a 1.3 times increase on the Meta-World +benchmark, compared to the BLIP-2 baseline fine-tuned with the Ego4D dataset. + +
+
+
+
+
+ + ♻ ☆ Nowhere coexpanding functions + + +
+ We define a family of $C^1$ functions which we call "nowhere coexpanding +functions" that is closed under composition and includes all $C^3$ functions +with non-positive Schwarzian derivative. We establish results on the number and +nature of the fixed points of these functions, including a generalisation of a +classic result of Singer. + +
+
+ comment: 9 pages, 3 figures +
+
+
+
+
+ + ♻ ☆ Preserved Edge Convolutional Neural Network for Sensitivity Enhancement + of Deuterium Metabolic Imaging (DMI) + + +
+ Purpose: Common to most MRSI techniques, the spatial resolution and the +minimal scan duration of Deuterium Metabolic Imaging (DMI) are limited by the +achievable SNR. This work presents a deep learning method for sensitivity +enhancement of DMI. + Methods: A convolutional neural network (CNN) was designed to estimate the +2H-labeled metabolite concentrations from low SNR and distorted DMI FIDs. The +CNN was trained with synthetic data that represent a range of SNR levels +typically encountered in vivo. The estimation precision was further improved by +fine-tuning the CNN with MRI-based edge-preserving regularization for each DMI +dataset. The proposed processing method, PReserved Edge ConvolutIonal neural +network for Sensitivity Enhanced DMI (PRECISE-DMI), was applied to simulation +studies and in vivo experiments to evaluate the anticipated improvements in SNR +and investigate the potential for inaccuracies. + Results: PRECISE-DMI visually improved the metabolic maps of low SNR +datasets, and quantitatively provided higher precision than the standard +Fourier reconstruction. Processing of DMI data acquired in rat brain tumor +models resulted in more precise determination of 2H-labeled lactate and +glutamate + glutamine levels, at increased spatial resolution (from >8 to 2 +$\mu$L) or shortened scan time (from 32 to 4 min) compared to standard +acquisitions. However, rigorous SD-bias analyses showed that overuse of the +edge-preserving regularization can compromise the accuracy of the results. + Conclusion: PRECISE-DMI allows a flexible trade-off between enhancing the +sensitivity of DMI and minimizing the inaccuracies. With typical settings, the +DMI sensitivity can be improved by 3-fold while retaining the capability to +detect local signal variations. + +
+
+
+
+
+ + ♻ ☆ Efficient Spatially Sparse Inference for Conditional GANs and Diffusion + Models NeurIPS 2022 + + +
+ During image editing, existing deep generative models tend to re-synthesize the entire output from scratch, including the unedited regions. This leads to a significant waste of computation, especially for minor editing operations. In this work, we present Spatially Sparse Inference (SSI), a general-purpose technique that selectively performs computation for edited regions and accelerates various generative models, including both conditional GANs and diffusion models. Our key observation is that users tend to edit the input image gradually. This motivates us to cache and reuse the feature maps of the original image. Given an edited image, we sparsely apply the convolutional filters to the edited regions while reusing the cached features for the unedited areas. Based on our algorithm, we further propose Sparse Incremental Generative Engine (SIGE) to convert the computation reduction to latency reduction on off-the-shelf hardware. With about $1\%$-area edits, SIGE accelerates DDPM by $3.0\times$ on NVIDIA RTX 3090 and $4.6\times$ on Apple M1 Pro GPU, Stable Diffusion by $7.2\times$ on 3090, and GauGAN by $5.6\times$ on 3090 and $5.2\times$ on M1 Pro GPU. Compared to our conference version, we extend SIGE to accommodate attention layers and apply it to Stable Diffusion. Additionally, we offer support for Apple M1 Pro GPU and include more results with large and sequential edits.
+
+ comment: NeurIPS 2022 T-PAMI 2023 Website: https://www.cs.cmu.edu/~sige/ Code: + https://github.com/lmxyy/sige +
+
+
+
+
+ + ♻ ☆ Model Reprogramming: Resource-Efficient Cross-Domain Machine Learning + + +
+ In data-rich domains such as vision, language, and speech, deep learning +prevails to deliver high-performance task-specific models and can even learn +general task-agnostic representations for efficient finetuning to downstream +tasks. However, deep learning in resource-limited domains still faces multiple +challenges including (i) limited data, (ii) constrained model development cost, +and (iii) lack of adequate pre-trained models for effective finetuning. This +paper provides an overview of model reprogramming to bridge this gap. Model +reprogramming enables resource-efficient cross-domain machine learning by +repurposing and reusing a well-developed pre-trained model from a source domain +to solve tasks in a target domain without model finetuning, where the source +and target domains can be vastly different. In many applications, model +reprogramming outperforms transfer learning and training from scratch. This +paper elucidates the methodology of model reprogramming, summarizes existing +use cases, provides a theoretical explanation of the success of model +reprogramming, and concludes with a discussion on open-ended research questions +and opportunities. A list of model reprogramming studies is actively maintained +and updated at https://github.com/IBM/model-reprogramming. + +
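A minimal sketch of the reprogramming recipe summarized above, under the common formulation with a trainable additive input transformation and a fixed many-to-one output label mapping; the frozen model, input shape, and class counts are placeholders.

import torch
import torch.nn as nn

class Reprogrammer(nn.Module):
    def __init__(self, frozen_model, input_shape, n_source_classes, n_target_classes):
        super().__init__()
        self.frozen = frozen_model
        for p in self.frozen.parameters():
            p.requires_grad_(False)                              # source model stays untouched
        self.delta = nn.Parameter(torch.zeros(1, *input_shape))  # trainable input "program"
        # Fixed many-to-one label mapping: source class i -> target class i % n_target_classes.
        mapping = torch.zeros(n_source_classes, n_target_classes)
        mapping[torch.arange(n_source_classes), torch.arange(n_source_classes) % n_target_classes] = 1.0
        self.register_buffer("label_map", mapping)

    def forward(self, x):
        source_logits = self.frozen(x + self.delta)              # only delta is trained
        source_probs = source_logits.softmax(dim=-1)
        return (source_probs @ self.label_map).log()             # aggregated target-class log-probs

# Training would then optimize only `delta` with a standard NLL loss on target-domain data.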
+
+ comment: Survey paper on model reprogramming; Project repository: + https://github.com/IBM/model-reprogramming +
+
+
+
+
+ + ♻ ☆ BAARD: Blocking Adversarial Examples by Testing for Applicability, + Reliability and Decidability + + +
+ Adversarial defenses protect machine learning models from adversarial +attacks, but are often tailored to one type of model or attack. The lack of +information on unknown potential attacks makes detecting adversarial examples +challenging. Additionally, attackers do not need to follow the rules made by +the defender. To address this problem, we take inspiration from the concept of +Applicability Domain in cheminformatics. Cheminformatics models struggle to +make accurate predictions because only a limited number of compounds are known +and available for training. Applicability Domain defines a domain based on the +known compounds and rejects any unknown compound that falls outside the domain. +Similarly, adversarial examples start as harmless inputs, but can be +manipulated to evade reliable classification by moving outside the domain of +the classifier. We are the first to identify the similarity between +Applicability Domain and adversarial detection. Instead of focusing on unknown +attacks, we focus on what is known, the training data. We propose a simple yet +robust triple-stage data-driven framework that checks the input globally and +locally, and confirms that they are coherent with the model's output. This +framework can be applied to any classification model and is not limited to +specific attacks. We demonstrate these three stages work as one unit, +effectively detecting various attacks, even for a white-box scenario. + +
+
+
+
+
+ + ♻ ☆ On a continuous time model of gradient descent dynamics and instability + in deep learning + + +
+ The recipe behind the success of deep learning has been the combination of +neural networks and gradient-based optimization. Understanding the behavior of +gradient descent however, and particularly its instability, has lagged behind +its empirical success. To add to the theoretical tools available to study +gradient descent we propose the principal flow (PF), a continuous time flow +that approximates gradient descent dynamics. To our knowledge, the PF is the +only continuous flow that captures the divergent and oscillatory behaviors of +gradient descent, including escaping local minima and saddle points. Through +its dependence on the eigendecomposition of the Hessian the PF sheds light on +the recently observed edge of stability phenomena in deep learning. Using our +new understanding of instability we propose a learning rate adaptation method +which enables us to control the trade-off between training stability and test +set evaluation performance. + +
+
+ comment: Transactions of Machine Learning Research, 2023 +
+
+
+
+
+ + ♻ ☆ eDKM: An Efficient and Accurate Train-time Weight Clustering for Large + Language Models + + +
+ Since Large Language Models or LLMs have demonstrated high-quality performance on many complex language tasks, there is great interest in bringing these LLMs to mobile devices for faster responses and better privacy protection. However, the size of LLMs (i.e., billions of parameters) requires highly effective compression to fit into storage-limited devices. Among many compression techniques, weight clustering, a form of non-linear quantization, is one of the leading candidates for LLM compression and is supported by modern smartphones. Yet, its training overhead is prohibitively significant for LLM fine-tuning. Especially, Differentiable KMeans Clustering, or DKM, has shown the state-of-the-art trade-off between compression ratio and accuracy regression, but its large memory complexity makes it nearly impossible to apply to train-time LLM compression. In this paper, we propose a memory-efficient DKM implementation, eDKM, powered by novel techniques to reduce the memory footprint of DKM by orders of magnitude. For a given tensor to be saved on the CPU for the backward pass of DKM, we compress the tensor by applying uniquification and sharding after checking whether a duplicate tensor has already been copied to the CPU. Our experimental results demonstrate that eDKM can fine-tune and compress a pretrained LLaMA 7B model from 12.6 GB to 2.5 GB (3 bit/weight) with the Alpaca dataset by reducing the train-time memory footprint of a decoder layer by 130$\times$, while delivering good accuracy on broader LLM benchmarks (i.e., 77.7% for PIQA, 66.1% for WinoGrande, and so on).
+
+ comment: preprint +
+
+
+
+
+ + ♻ ☆ Deep Spatiotemporal Clustering: A Temporal Clustering Approach for + Multi-dimensional Climate Data ECML + + +
+ Clustering high-dimensional spatiotemporal data using an unsupervised +approach is a challenging problem for many data-driven applications. Existing +state-of-the-art methods for unsupervised clustering use different similarity +and distance functions but focus on either spatial or temporal features of the +data. Concentrating on joint deep representation learning of spatial and +temporal features, we propose Deep Spatiotemporal Clustering (DSC), a novel +algorithm for the temporal clustering of high-dimensional spatiotemporal data +using an unsupervised deep learning method. Inspired by the U-net architecture, +DSC utilizes an autoencoder integrating CNN-RNN layers to learn latent +representations of the spatiotemporal data. DSC also includes a unique layer +for cluster assignment on latent representations that uses the Student's +t-distribution. By optimizing the clustering loss and data reconstruction loss +simultaneously, the algorithm gradually improves clustering assignments and the +nonlinear mapping between low-dimensional latent feature space and +high-dimensional original data space. A multivariate spatiotemporal climate +dataset is used to evaluate the efficacy of the proposed method. Our extensive +experiments show our approach outperforms both conventional and deep +learning-based unsupervised clustering algorithms. Additionally, we compared +the proposed model with its various variants (CNN encoder, CNN autoencoder, +CNN-RNN encoder, CNN-RNN autoencoder, etc.) to get insight into using both the +CNN and RNN layers in the autoencoder, and our proposed technique outperforms +these variants in terms of clustering results. + +
+
+ comment: Accepted by the European Conference on Machine Learning and + Principles and Practice of Knowledge Discovery in Databases (ECML PKDD 2023) +
+
+
+
+
+ + ♻ ☆ A Latent Space Theory for Emergent Abilities in Large Language Models + + +
+ Languages are not created randomly but rather to communicate information.
+There is a strong association between languages and their underlying meanings,
+resulting in a sparse joint distribution that is heavily peaked according to
+their correlations. Moreover, these peak values happen to match the marginal
+distribution of languages due to the sparsity. With the advent of LLMs trained
+on big data and large models, we can now precisely assess the marginal
+distribution of languages, providing a convenient means of exploring the sparse
+structures in the joint distribution for effective inferences. In this paper,
+we categorize languages as either unambiguous or ε-ambiguous and present
+quantitative results to demonstrate that the emergent abilities of LLMs, such
+as language understanding, in-context learning, chain-of-thought prompting, and
+effective instruction fine-tuning, can all be attributed to Bayesian inference
+on the sparse joint distribution of languages.
+
+
+ comment: 17 pages, 3 figures +
+
+
+
+
+ + ♻ ☆ When Are Two Lists Better than One?: Benefits and Harms in Joint + Decision-making + + +
+ Historically, much of machine learning research has focused on the
+performance of the algorithm alone, but recently more attention has been
+focused on optimizing joint human-algorithm performance. Here, we analyze a
+specific type of human-algorithm collaboration where the algorithm has access
+to a set of $n$ items, and presents a subset of size $k$ to the human, who
+selects a final item from among those $k$. This scenario could model content
+recommendation, route planning, or any type of labeling task. Because both the
+human and algorithm have imperfect, noisy information about the true ordering
+of items, the key question is: which value of $k$ maximizes the probability
+that the best item will be ultimately selected? For $k=1$, performance is
+optimized by the algorithm acting alone, and for $k=n$ it is optimized by the
+human acting alone. Surprisingly, we show that for multiple noise models, it is
+optimal to set $k \in [2, n-1]$ - that is, there are strict benefits to
+collaborating, even when the human and algorithm have equal accuracy
+separately. We demonstrate this theoretically for the Mallows model and
+experimentally for Random Utility models of noisy permutations. However, we
+show that this pattern is reversed when the human is anchored on the
+algorithm's presented ordering - the joint system always has strictly worse
+performance. We extend these results to the case where the human and algorithm
+differ in their accuracy levels, showing that there always exist regimes where
+a more accurate agent would strictly benefit from collaborating with a less
+accurate one, but these regimes are asymmetric between the human and the
+algorithm's accuracy.
+
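+ A small Monte Carlo illustration of the shortlist-size trade-off described
+above, under a simple additive-Gaussian, random-utility-style noise model. The
+noise model, parameters, and function name are illustrative assumptions, not the
+paper's exact setup.
+<pre><code class="language-python">
+import numpy as np
+
+def p_best_selected(n=10, k=3, sigma_alg=1.0, sigma_hum=1.0, trials=20000, seed=0):
+    """Estimate P(best item chosen) when the algorithm shortlists k of n items
+    and the human then picks from the shortlist, both under Gaussian noise."""
+    rng = np.random.default_rng(seed)
+    hits = 0
+    for _ in range(trials):
+        true = rng.normal(size=n)                     # latent item qualities
+        best = int(true.argmax())
+        alg = true + rng.normal(scale=sigma_alg, size=n)
+        shortlist = np.argsort(alg)[-k:]              # algorithm presents its top-k
+        hum = true[shortlist] + rng.normal(scale=sigma_hum, size=k)
+        hits += int(shortlist[hum.argmax()] == best)  # human picks from the k
+    return hits / trials
+
+# Comparing k=1 (algorithm alone), k=n (human alone), and intermediate values of k
+# can illustrate the qualitative effect that an intermediate shortlist does best.
+print({k: p_best_selected(k=k) for k in (1, 3, 5, 10)})
+</code></pre>
+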
+
+
+
+
+
+
+
+ + Multimedia 13 + +
+
+
+ + ☆ Weakly-Supervised Multi-Task Learning for Audio-Visual Speaker + Verification + + +
+ In this paper, we present a methodology for achieving robust multimodal
+person representations optimized for open-set audio-visual speaker
+verification. Distance Metric Learning (DML) approaches have typically
+dominated this problem space, owing to strong performance on new and unseen
+classes. In our work, we explore multitask learning techniques to further boost
+the performance of the DML approach and show that an auxiliary task with weak
+labels can increase the compactness of the learned speaker representation. We
+also extend the Generalized End-to-End loss (GE2E) to multimodal inputs and
+demonstrate that it can achieve competitive performance in an audio-visual
+space. Finally, we introduce a random non-synchronous audio-visual sampling
+strategy during training that has been shown to improve generalization. Our
+network achieves state-of-the-art performance for speaker verification,
+reporting 0.244%, 0.252%, 0.441% Equal Error Rate (EER) on the three official
+trial lists of VoxCeleb1-O/E/H, which are, to our knowledge, the best published
+results on VoxCeleb1-E and VoxCeleb1-H.
+
+
+
+
+
+ + ☆ UnifiedGesture: A Unified Gesture Synthesis Model for Multiple Skeletons ACM MM 2023 + + +
+ The automatic co-speech gesture generation draws much attention in computer +animation. Previous works designed network structures on individual datasets, +which resulted in a lack of data volume and generalizability across different +motion capture standards. In addition, it is a challenging task due to the weak +correlation between speech and gestures. To address these problems, we present +UnifiedGesture, a novel diffusion model-based speech-driven gesture synthesis +approach, trained on multiple gesture datasets with different skeletons. +Specifically, we first present a retargeting network to learn latent +homeomorphic graphs for different motion capture standards, unifying the +representations of various gestures while extending the dataset. We then +capture the correlation between speech and gestures based on a diffusion model +architecture using cross-local attention and self-attention to generate better +speech-matched and realistic gestures. To further align speech and gesture and +increase diversity, we incorporate reinforcement learning on the discrete +gesture units with a learned reward function. Extensive experiments show that +UnifiedGesture outperforms recent approaches on speech-driven gesture +generation in terms of CCA, FGD, and human-likeness. All code, pre-trained +models, databases, and demos are available to the public at +https://github.com/YoungSeng/UnifiedGesture. + +
+
+ comment: 16 pages, 11 figures, ACM MM 2023 +
+
+
+
+
+ + ☆ Differentiable JPEG: The Devil is in the Details WACV 2024 + + +
+ JPEG remains one of the most widespread lossy image coding methods. However, +the non-differentiable nature of JPEG restricts the application in deep +learning pipelines. Several differentiable approximations of JPEG have recently +been proposed to address this issue. This paper conducts a comprehensive review +of existing diff. JPEG approaches and identifies critical details that have +been missed by previous methods. To this end, we propose a novel diff. JPEG +approach, overcoming previous limitations. Our approach is differentiable +w.r.t. the input image, the JPEG quality, the quantization tables, and the +color conversion parameters. We evaluate the forward and backward performance +of our diff. JPEG approach against existing methods. Additionally, extensive +ablations are performed to evaluate crucial design choices. Our proposed diff. +JPEG resembles the (non-diff.) reference implementation best, significantly +surpassing the recent-best diff. approach by $3.47$dB (PSNR) on average. For +strong compression rates, we can even improve PSNR by $9.51$dB. Strong +adversarial attack results are yielded by our diff. JPEG, demonstrating the +effective gradient approximation. Our code is available at +https://github.com/necla-ml/Diff-JPEG. + +
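+ The abstract does not spell out the approximations used; as background, one
+common building block in differentiable JPEG work is a differentiable surrogate
+for the rounding step of quantization. The straight-through variant below is a
+generic illustration of that idea, not the specific approach proposed here.
+<pre><code class="language-python">
+import torch
+
+def ste_round(x: torch.Tensor) -> torch.Tensor:
+    # Straight-through rounding: the forward pass behaves like round(), while the
+    # backward pass treats rounding as the identity, letting gradients flow
+    # through the quantization step of a JPEG-like pipeline.
+    return x + (torch.round(x) - x).detach()
+
+x = torch.randn(4, requires_grad=True)
+y = ste_round(x * 10.0).sum()
+y.backward()
+print(x.grad)  # 10.0 per element, despite the non-differentiable round in the forward
+</code></pre>
+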
+
+ comment: Accepted at WACV 2024. Project page: + https://christophreich1996.github.io/differentiable_jpeg/ +
+
+
+
+
+ + ☆ Video Infringement Detection via Feature Disentanglement and Mutual + Information Maximization ACM MM 2023 + + +
+ The self-media era provides us with a tremendous number of high-quality
+videos. Unfortunately, frequent video copyright infringements are now seriously
+damaging the interests and enthusiasm of video creators. Identifying infringing
+videos is therefore a compelling task. Current state-of-the-art methods tend to
+simply feed high-dimensional mixed video features into deep neural networks and
+count on the networks to extract useful representations. Despite its
+simplicity, this paradigm heavily relies on the original entangled features and
+lacks constraints guaranteeing that useful task-relevant semantics are
+extracted from the features.
+ In this paper, we seek to tackle the above challenges from two aspects: (1)
+We propose to disentangle an original high-dimensional feature into multiple
+sub-features, explicitly disentangling the feature into exclusive
+lower-dimensional components. We expect the sub-features to encode
+non-overlapping semantics of the original feature and remove redundant
+information.
+ (2) On top of the disentangled sub-features, we further learn an auxiliary
+feature to enhance the sub-features. We theoretically analyze the mutual
+information between the label and the disentangled features, arriving at a loss
+that maximizes the extraction of task-relevant information from the original
+feature.
+ Extensive experiments on two large-scale benchmark datasets (i.e., SVD and
+VCSL) demonstrate that our method achieves 90.1% TOP-100 mAP on the large-scale
+SVD dataset and also sets the new state-of-the-art on the VCSL benchmark
+dataset. Our code and model have been released at
+https://github.com/yyyooooo/DMI/, hoping to contribute to the community.
+
+
+ comment: This paper is accepted by ACM MM 2023 +
+
+
+
+
+ + ☆ Gpachov at CheckThat! 2023: A Diverse Multi-Approach Ensemble for + Subjectivity Detection in News Articles + + +
+ The widespread use of social networks has given rise to subjective,
+misleading, and even false information on the Internet. Thus, subjectivity
+detection can play an important role in ensuring the objectivity and quality of
+a piece of information. This paper presents the solution built by the Gpachov
+team for the CLEF-2023 CheckThat! lab Task 2 on subjectivity detection. Three
+different research directions are explored. The first one is based on
+fine-tuning a sentence embedding encoder model and dimensionality reduction.
+The second one explores a sample-efficient few-shot learning model. The third
+one evaluates fine-tuning a multilingual transformer on an altered dataset,
+using data from multiple languages. Finally, the three approaches are combined
+in a simple majority voting ensemble, resulting in 0.77 macro F1 on the test
+set and achieving 2nd place on the English subtask.
+
+
+
+
+
+ + ☆ VEATIC: Video-based Emotion and Affect Tracking in Context Dataset + + +
+ Human affect recognition has been a significant topic in psychophysics and
+computer vision. However, the currently published datasets have many
+limitations. For example, most datasets contain only frames with information
+about facial expressions. Due to the limitations of previous datasets, it is
+very hard either to understand the mechanisms of human affect recognition or
+for computer vision models trained on those datasets to generalize well to
+common cases. In this work, we introduce a brand new large dataset, the
+Video-based Emotion and Affect Tracking in Context Dataset (VEATIC), that
+overcomes the limitations of the previous datasets. VEATIC has 124 video clips
+from Hollywood movies, documentaries, and home videos with continuous valence
+and arousal ratings of each frame via real-time annotation. Along with the
+dataset, we propose a new computer vision task to infer the affect of the
+selected character via both context and character information in each video
+frame. Additionally, we propose a simple model to benchmark this new computer
+vision task. We also compare the performance of the pretrained model using our
+dataset with other similar datasets. Experiments show competitive results for
+the model pretrained on VEATIC, indicating the generalizability of VEATIC. Our
+dataset is available at https://veatic.github.io.
+
+
+
+
+
+ + ☆ Leveraging Foundation models for Unsupervised Audio-Visual Segmentation + + +
+ Audio-Visual Segmentation (AVS) aims to precisely outline audible objects in +a visual scene at the pixel level. Existing AVS methods require fine-grained +annotations of audio-mask pairs in supervised learning fashion. This limits +their scalability since it is time consuming and tedious to acquire such +cross-modality pixel level labels. To overcome this obstacle, in this work we +introduce unsupervised audio-visual segmentation with no need for task-specific +data annotations and model training. For tackling this newly proposed problem, +we formulate a novel Cross-Modality Semantic Filtering (CMSF) approach to +accurately associate the underlying audio-mask pairs by leveraging the +off-the-shelf multi-modal foundation models (e.g., detection [1], open-world +segmentation [2] and multi-modal alignment [3]). Guiding the proposal +generation by either audio or visual cues, we design two training-free +variants: AT-GDINO-SAM and OWOD-BIND. Extensive experiments on the AVS-Bench +dataset show that our unsupervised approach can perform well in comparison to +prior art supervised counterparts across complex scenarios with multiple +auditory objects. Particularly, in situations where existing supervised AVS +methods struggle with overlapping foreground objects, our models still excel in +accurately segmenting overlapped auditory objects. Our code will be publicly +released. + +
+
+
+
+
+ + ☆ PIAVE: A Pose-Invariant Audio-Visual Speaker Extraction Network + + +
+ It is common in everyday spoken communication that we look at the turning +head of a talker to listen to his/her voice. Humans see the talker to listen +better, so do machines. However, previous studies on audio-visual speaker +extraction have not effectively handled the varying talking face. This paper +studies how to take full advantage of the varying talking face. We propose a +Pose-Invariant Audio-Visual Speaker Extraction Network (PIAVE) that +incorporates an additional pose-invariant view to improve audio-visual speaker +extraction. Specifically, we generate the pose-invariant view from each +original pose orientation, which enables the model to receive a consistent +frontal view of the talker regardless of his/her head pose, therefore, forming +a multi-view visual input for the speaker. Experiments on the multi-view MEAD +and in-the-wild LRS3 dataset demonstrate that PIAVE outperforms the +state-of-the-art and is more robust to pose variations. + +
+
+ comment: Interspeech 2023 +
+
+
+
+
+ + ☆ AudioSR: Versatile Audio Super-resolution at Scale + + +
+ Audio super-resolution is a fundamental task that predicts high-frequency
+components for low-resolution audio, enhancing audio quality in digital
+applications. Previous methods have limitations such as the limited scope of
+audio types (e.g., music, speech) and specific bandwidth settings they can
+handle (e.g., 4kHz to 8kHz). In this paper, we introduce a diffusion-based
+generative model, AudioSR, that is capable of performing robust audio
+super-resolution on versatile audio types, including sound effects, music, and
+speech. Specifically, AudioSR can upsample any input audio signal within the
+bandwidth range of 2kHz to 16kHz to a high-resolution audio signal at 24kHz
+bandwidth with a sampling rate of 48kHz. Extensive objective evaluation on
+various audio super-resolution benchmarks demonstrates the strong results
+achieved by the proposed model. In addition, our subjective evaluation shows
+that AudioSR can act as a plug-and-play module to enhance the generation
+quality of a wide range of audio generative models, including AudioLDM,
+Fastspeech2, and MusicGen. Our code and demo are available at
+https://audioldm.github.io/audiosr.
+
+
+ comment: Under review. Demo and code: https://audioldm.github.io/audiosr +
+
+
+
+
+ + ♻ ☆ CoMoSpeech: One-Step Speech and Singing Voice Synthesis via Consistency + Model ACM MM 2023 + + +
+ Denoising diffusion probabilistic models (DDPMs) have shown promising
+performance for speech synthesis. However, a large number of iterative steps
+are required to achieve high sample quality, which restricts the inference
+speed. Maintaining sample quality while increasing sampling speed has become a
+challenging task. In this paper, we propose a "Co"nsistency "Mo"del-based
+"Speech" synthesis method, CoMoSpeech, which achieves speech synthesis in a
+single diffusion sampling step while maintaining high audio quality. The
+consistency constraint is applied to distill a consistency model from a
+well-designed diffusion-based teacher model, which ultimately yields superior
+performance in the distilled CoMoSpeech. Our experiments show that, by
+generating audio recordings with a single sampling step, CoMoSpeech achieves
+an inference speed more than 150 times faster than real-time on a single NVIDIA
+A100 GPU, which is comparable to FastSpeech2, making diffusion-sampling based
+speech synthesis truly practical. Meanwhile, objective and subjective
+evaluations on text-to-speech and singing voice synthesis show that the
+proposed teacher models yield the best audio quality, and the one-step sampling
+based CoMoSpeech achieves the best inference speed with better or comparable
+audio quality to other conventional multi-step diffusion model baselines. Audio
+samples are available at https://comospeech.github.io/.
+
+
+ comment: Accepted to ACM MM 2023 +
+
+
+
+
+ + ♻ ☆ MSAC: Multiple Speech Attribute Control Method for Reliable Speech + Emotion Recognition + + +
+ Despite significant progress, speech emotion recognition (SER) remains
+challenging due to the inherent complexity and ambiguity of the emotion
+attribute, particularly in the wild. Whereas current studies primarily focus on
+recognition and generalization abilities, this work pioneers an investigation
+into the reliability of SER methods and explores the modeling of speech emotion
+based on data distribution across various speech attributes. Specifically, a
+novel CNN-based SER model that adopts additive margin softmax loss is first
+designed. Second, a novel multiple speech attribute control method, MSAC, is
+proposed to explicitly control speech attributes, enabling the model to be less
+affected by emotion-agnostic features and extract fine-grained emotion-related
+representations. Third, we make a first attempt to examine the reliability of
+our proposed unified SER workflow using the out-of-distribution detection
+method. Experiments on both single and cross-corpus SER scenarios show that our
+proposed unified SER workflow consistently outperforms the baseline in all
+aspects. Remarkably, in single-corpus SER, the proposed SER workflow achieves
+superior recognition results with a WAR of 72.97% and a UAR of 71.76% on the
+IEMOCAP corpus.
+
+
+ comment: 5 pages +
+
+
+
+
+ + ♻ ☆ GEmo-CLAP: Gender-Attribute-Enhanced Contrastive Language-Audio + Pretraining for Accurate Speech Emotion Recognition + + +
+ Contrastive cross-modality pretraining has recently exhibited impressive +success in diverse fields, whereas there is limited research on their merits in +speech emotion recognition (SER). In this paper, we propose GEmo-CLAP, a kind +of gender-attribute-enhanced contrastive language-audio pretraining (CLAP) +method for SER. Specifically, we first construct an effective emotion CLAP +(Emo-CLAP) for SER, using pre-trained text and audio encoders. Second, given +the significance of gender information in SER, two novel multi-task learning +based GEmo-CLAP (ML-GEmo-CLAP) and soft label based GEmo-CLAP (SL-GEmo-CLAP) +models are further proposed to incorporate gender information of speech +signals, forming more reasonable objectives. Experiments on IEMOCAP indicate +that our proposed two GEmo-CLAPs consistently outperform Emo-CLAP with +different pre-trained models. Remarkably, the proposed WavLM-based SL-GEmo-CLAP +obtains the best UAR of 81.43% and WAR of 83.16%, which performs better than +state-of-the-art SER methods by at least 3%. Our system is open-sourced on +Github. + +
+
+ comment: 5 pages +
+
+
+
+
+ + ♻ ☆ PointPCA: Point Cloud Objective Quality Assessment Using PCA-Based + Descriptors + + +
+ Point clouds denote a prominent solution for the representation of 3D +photo-realistic content in immersive applications. Similarly to other imaging +modalities, quality predictions for point cloud contents are vital for a wide +range of applications, enabling trade-off optimizations between data quality +and data size in every processing step from acquisition to rendering. In this +work, we focus on use cases that consider human end-users consuming point cloud +contents and, hence, we concentrate on visual quality metrics. In particular, +we propose a set of perceptually relevant descriptors based on Principal +Component Analysis (PCA) decomposition, which is applied to both geometry and +texture data for full-reference point cloud quality assessment. Statistical +features are derived from these descriptors to characterize local shape and +appearance properties for both a reference and a distorted point cloud. The +extracted statistical features are subsequently compared to provide +corresponding predictions of visual quality for the distorted point cloud. As +part of our method, a learning-based approach is proposed to fuse these +individual predictors to a unified perceptual score. We validate the accuracy +of the individual predictors, as well as the unified quality scores obtained +after regression against subjectively annotated datasets, showing that our +metric outperforms state-of-the-art solutions. Insights regarding design +decisions are provided through exploratory studies, evaluating the performance +of our metric under different parameter configurations, attribute domains, +color spaces, and regression models. A software implementation of the proposed +metric is made available at the following link: +https://github.com/cwi-dis/pointpca_suite. + +
+
+ comment: 14 pages, 7 figures, 6 tables +
+
+
+
+
+
+
+
+
+ +
+
+
+ + Computation and Language 63 + +
+
+
+ + ☆ Cited Text Spans for Citation Text Generation + + +
+ Automatic related work generation must ground its outputs in the content of
+the cited papers to avoid non-factual hallucinations, but due to the length of
+scientific documents, existing abstractive approaches have conditioned only on
+the cited paper abstracts. We demonstrate that the abstract is not always the
+most appropriate input for citation generation and that models trained in this
+way learn to hallucinate. We propose to condition instead on the cited text
+span (CTS) as an alternative to the abstract. Because manual CTS annotation is
+extremely time- and labor-intensive, we experiment with automatic, ROUGE-based
+labeling of candidate CTS sentences, achieving sufficiently strong performance
+to substitute for expensive human annotations, and we propose a
+human-in-the-loop, keyword-based CTS retrieval approach that makes generating
+citation texts grounded in the full text of cited papers both promising and
+practical.
+
+
+
+
+
+ + ☆ Learning to Predict Concept Ordering for Common Sense Generation + + +
+ Prior work has shown that the ordering in which concepts are shown to a
+commonsense generator plays an important role, affecting the quality of the
+generated sentence. However, it remains a challenge to determine the optimal
+ordering of a given set of concepts such that a natural sentence covering all
+the concepts could be generated from a pretrained generator. To understand the
+relationship between the ordering of the input concepts and the quality of the
+generated sentences, we conduct a systematic study considering multiple
+language models (LMs) and concept ordering strategies. We find that the
+BART-large model consistently outperforms all other LMs considered in this
+study when fine-tuned using the ordering of concepts as they appear in
+CommonGen training data, as measured using multiple evaluation metrics.
+Moreover, the larger GPT3-based large language model (LLM) variants do not
+necessarily outperform much smaller LMs on this task, even when fine-tuned on
+task-specific training data. Interestingly, human annotators significantly
+reorder input concept sets when manually writing sentences covering those
+concepts, and this ordering provides the best sentence generations
+independently of the LM used for the generation, outperforming a probabilistic
+concept ordering baseline.
+
+
+ comment: 10 pages +
+
+
+
+
+ + ☆ Re-Reading Improves Reasoning in Language Models + + +
+ Reasoning presents a significant and challenging issue for Large Language
+Models (LLMs). The predominant focus of research has revolved around developing
+diverse prompting strategies to guide and structure the reasoning processes of
+LLMs. However, these approaches based on decoder-only causal language models
+often process the input question in a single forward pass, potentially missing
+the rich, back-and-forth interactions inherent in human reasoning. Scant
+attention has been paid to a critical dimension, i.e., the input question
+itself embedded within the prompts. In response, we introduce a deceptively
+simple yet highly effective prompting strategy, termed question "re-reading".
+Drawing inspiration from human learning and problem-solving, re-reading entails
+revisiting the question information embedded within input prompts. This
+approach aligns seamlessly with the cognitive principle of reinforcement,
+enabling LLMs to extract deeper insights, identify intricate patterns,
+establish more nuanced connections, and ultimately enhance their reasoning
+capabilities across various tasks. Experiments conducted on a series of
+reasoning benchmarks serve to underscore the effectiveness and generality of
+our method. Moreover, our findings demonstrate that our approach seamlessly
+integrates with various language models, thought-eliciting prompting methods,
+and ensemble techniques, further underscoring its versatility and compatibility
+in the realm of LLMs.
+
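+ The exact prompt wording is not given in the abstract; the sketch below is a
+hypothetical template showing the general "re-reading" idea of repeating the
+question before asking the model to reason.
+<pre><code class="language-python">
+def re_reading_prompt(question: str) -> str:
+    # Illustrative template: state the question, then explicitly repeat it.
+    return (
+        f"Question: {question}\n"
+        f"Read the question again: {question}\n"
+        "Answer: Let's think step by step."
+    )
+
+print(re_reading_prompt("A train travels 60 km in 45 minutes. What is its average speed?"))
+</code></pre>
+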
+
+ comment: 25 pages +
+
+
+
+
+ + ☆ The first step is the hardest: Pitfalls of Representing and Tokenizing + Temporal Data for Large Language Models + + +
+ Large Language Models (LLMs) have demonstrated remarkable generalization +across diverse tasks, leading individuals to increasingly use them as personal +assistants and universal computing engines. Nevertheless, a notable obstacle +emerges when feeding numerical/temporal data into these models, such as data +sourced from wearables or electronic health records. LLMs employ tokenizers in +their input that break down text into smaller units. However, tokenizers are +not designed to represent numerical values and might struggle to understand +repetitive patterns and context, treating consecutive values as separate tokens +and disregarding their temporal relationships. Here, we discuss recent works +that employ LLMs for human-centric tasks such as in mobile health sensing and +present a case study showing that popular LLMs tokenize temporal data +incorrectly. To address that, we highlight potential solutions such as prompt +tuning with lightweight embedding layers as well as multimodal adapters, that +can help bridge this "modality gap". While the capability of language models to +generalize to other modalities with minimal or no finetuning is exciting, this +paper underscores the fact that their outputs cannot be meaningful if they +stumble over input nuances. + +
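+ A tiny demonstration of the tokenization issue discussed above; the model
+choice (gpt2) and the example series are arbitrary assumptions for illustration.
+<pre><code class="language-python">
+from transformers import AutoTokenizer
+
+tok = AutoTokenizer.from_pretrained("gpt2")
+series = "72, 72, 73, 71, 120, 72"  # e.g. consecutive heart-rate readings
+print(tok.tokenize(series))
+# A BPE tokenizer may split the values unevenly (one token for some readings,
+# several sub-tokens for others), so consecutive measurements are not
+# represented uniformly and their temporal relationship is easy to lose.
+</code></pre>
+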
+
+ comment: Accepted at the Generative AI for Pervasive Computing Symposium + (GenAI4PC) at UbiComp 2023 +
+
+
+
+
+ + ☆ Human Action Co-occurrence in Lifestyle Vlogs using Graph Link + Prediction + + +
+ We introduce the task of automatic human action co-occurrence identification, +i.e., determine whether two human actions can co-occur in the same interval of +time. We create and make publicly available the ACE (Action Co-occurrencE) +dataset, consisting of a large graph of ~12k co-occurring pairs of visual +actions and their corresponding video clips. We describe graph link prediction +models that leverage visual and textual information to automatically infer if +two actions are co-occurring. We show that graphs are particularly well suited +to capture relations between human actions, and the learned graph +representations are effective for our task and capture novel and relevant +information across different data domains. The ACE dataset and the code +introduced in this paper are publicly available at +https://github.com/MichiganNLP/vlog_action_co-occurrence. + +
+
+
+
+
+ + ☆ Improving and Evaluating the Detection of Fragmentation in News + Recommendations with the Clustering of News Story Chains RecSys 2023 + + +
+ News recommender systems play an increasingly influential role in shaping +information access within democratic societies. However, tailoring +recommendations to users' specific interests can result in the divergence of +information streams. Fragmented access to information poses challenges to the +integrity of the public sphere, thereby influencing democracy and public +discourse. The Fragmentation metric quantifies the degree of fragmentation of +information streams in news recommendations. Accurate measurement of this +metric requires the application of Natural Language Processing (NLP) to +identify distinct news events, stories, or timelines. This paper presents an +extensive investigation of various approaches for quantifying Fragmentation in +news recommendations. These approaches are evaluated both intrinsically, by +measuring performance on news story clustering, and extrinsically, by assessing +the Fragmentation scores of different simulated news recommender scenarios. Our +findings demonstrate that agglomerative hierarchical clustering coupled with +SentenceBERT text representation is substantially better at detecting +Fragmentation than earlier implementations. Additionally, the analysis of +simulated scenarios yields valuable insights and recommendations for +stakeholders concerning the measurement and interpretation of Fragmentation. + +
+
+ comment: Cite published version: Polimeno et. al., Improving and Evaluating + the Detection of Fragmentation in News Recommendations with the Clustering of + News Story Chains, NORMalize 2023: The First Workshop on the Normative Design + and Evaluation of Recommender Systems, September 19, 2023, co-located with + the ACM Conference on Recommender Systems 2023 (RecSys 2023), Singapore +
+
+
+
+
+ + ☆ Glancing Future for Simultaneous Machine Translation ICASSP 2024 + + +
+ Simultaneous machine translation (SiMT) outputs translation while reading the
+source sentence. Unlike conventional sequence-to-sequence (seq2seq) training,
+existing SiMT methods adopt the prefix-to-prefix (prefix2prefix) training,
+where the model predicts target tokens based on partial source tokens. However,
+the prefix2prefix training diminishes the ability of the model to capture
+global information and introduces forced predictions due to the absence of
+essential source information. Consequently, it is crucial to bridge the gap
+between the prefix2prefix training and seq2seq training to enhance the
+translation capability of the SiMT model. In this paper, we propose a novel
+method that glances at future source information in a curriculum learning
+manner to achieve the transition from seq2seq training to prefix2prefix
+training. Specifically, we gradually reduce the available source information
+from the whole sentence to the prefix corresponding to the required latency.
+Our method is applicable to a wide range of SiMT methods, and experiments
+demonstrate that our method outperforms strong baselines.
+
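+ A minimal sketch of the curriculum idea: the amount of visible source shrinks
+from the full sentence to a latency-determined prefix as training progresses.
+The linear schedule and all names here are assumptions; the paper's actual
+schedule may differ.
+<pre><code class="language-python">
+def visible_source_len(src_len: int, latency_prefix: int,
+                       step: int, total_steps: int) -> int:
+    # Linearly anneal the visible source from the whole sentence (seq2seq-style
+    # training) down to the latency-determined prefix (prefix2prefix training).
+    progress = min(step / max(total_steps, 1), 1.0)
+    extra = int(round((src_len - latency_prefix) * (1.0 - progress)))
+    return min(src_len, latency_prefix + max(extra, 0))
+
+# Example: a 20-token source with a 5-token latency prefix over 10,000 updates.
+print([visible_source_len(20, 5, s, 10000) for s in (0, 2500, 5000, 10000)])
+</code></pre>
+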
+
+ comment: 5 pages, 4 figures, Submitted to ICASSP 2024
+
+
+
+
+ + ☆ AKEM: Aligning Knowledge Base to Queries with Ensemble Model for Entity + Recognition and Linking + + +
+ This paper presents a novel approach to address the Entity Recognition and +Linking Challenge at NLPCC 2015. The task involves extracting named entity +mentions from short search queries and linking them to entities within a +reference Chinese knowledge base. To tackle this problem, we first expand the +existing knowledge base and utilize external knowledge to identify candidate +entities, thereby improving the recall rate. Next, we extract features from the +candidate entities and utilize Support Vector Regression and Multiple Additive +Regression Tree as scoring functions to filter the results. Additionally, we +apply rules to further refine the results and enhance precision. Our method is +computationally efficient and achieves an F1 score of 0.535. + +
+
+
+
+
+ + ☆ Overview of GUA-SPA at IberLEF 2023: Guarani-Spanish Code Switching + Analysis + + +
+ We present the first shared task for detecting and analyzing code-switching +in Guarani and Spanish, GUA-SPA at IberLEF 2023. The challenge consisted of +three tasks: identifying the language of a token, NER, and a novel task of +classifying the way a Spanish span is used in the code-switched context. We +annotated a corpus of 1500 texts extracted from news articles and tweets, +around 25 thousand tokens, with the information for the tasks. Three teams took +part in the evaluation phase, obtaining in general good results for Task 1, and +more mixed results for Tasks 2 and 3. + +
+
+
+
+
+ + ☆ Prompting4Debugging: Red-Teaming Text-to-Image Diffusion Models by + Finding Problematic Prompts + + +
+ Text-to-image diffusion models, e.g. Stable Diffusion (SD), lately have shown +remarkable ability in high-quality content generation, and become one of the +representatives for the recent wave of transformative AI. Nevertheless, such +advance comes with an intensifying concern about the misuse of this generative +technology, especially for producing copyrighted or NSFW (i.e. not safe for +work) images. Although efforts have been made to filter inappropriate +images/prompts or remove undesirable concepts/styles via model fine-tuning, the +reliability of these safety mechanisms against diversified problematic prompts +remains largely unexplored. In this work, we propose Prompting4Debugging (P4D) +as a debugging and red-teaming tool that automatically finds problematic +prompts for diffusion models to test the reliability of a deployed safety +mechanism. We demonstrate the efficacy of our P4D tool in uncovering new +vulnerabilities of SD models with safety mechanisms. Particularly, our result +shows that around half of prompts in existing safe prompting benchmarks which +were originally considered "safe" can actually be manipulated to bypass many +deployed safety mechanisms, including concept removal, negative prompt, and +safety guidance. Our findings suggest that, without comprehensive testing, the +evaluations on limited safe prompting benchmarks can lead to a false sense of +safety for text-to-image models. + +
+
+
+
+
+ + ☆ Measuring vagueness and subjectivity in texts: from symbolic to neural + VAGO + + +
+ We present a hybrid approach to the automated measurement of vagueness and +subjectivity in texts. We first introduce the expert system VAGO, we illustrate +it on a small benchmark of fact vs. opinion sentences, and then test it on the +larger French press corpus FreSaDa to confirm the higher prevalence of +subjective markers in satirical vs. regular texts. We then build a neural clone +of VAGO, based on a BERT-like architecture, trained on the symbolic VAGO scores +obtained on FreSaDa. Using explainability tools (LIME), we show the interest of +this neural version for the enrichment of the lexicons of the symbolic version, +and for the production of versions in other languages. + +
+
+ comment: Paper to appear in the Proceedings of the 2023 IEEE International + Conference on Web Intelligence and Intelligent Agent Technology (WI-IAT) +
+
+
+
+
+ + ☆ Annotating Data for Fine-Tuning a Neural Ranker? Current Active Learning + Strategies are not Better than Random Selection SIGIR + + +
+ Search methods based on Pretrained Language Models (PLM) have demonstrated +great effectiveness gains compared to statistical and early neural ranking +models. However, fine-tuning PLM-based rankers requires a great amount of +annotated training data. Annotating data involves a large manual effort and +thus is expensive, especially in domain specific tasks. In this paper we +investigate fine-tuning PLM-based rankers under limited training data and +budget. We investigate two scenarios: fine-tuning a ranker from scratch, and +domain adaptation starting with a ranker already fine-tuned on general data, +and continuing fine-tuning on a target dataset. We observe a great variability +in effectiveness when fine-tuning on different randomly selected subsets of +training data. This suggests that it is possible to achieve effectiveness gains +by actively selecting a subset of the training data that has the most positive +effect on the rankers. This way, it would be possible to fine-tune effective +PLM rankers at a reduced annotation budget. To investigate this, we adapt +existing Active Learning (AL) strategies to the task of fine-tuning PLM rankers +and investigate their effectiveness, also considering annotation and +computational costs. Our extensive analysis shows that AL strategies do not +significantly outperform random selection of training subsets in terms of +effectiveness. We further find that gains provided by AL strategies come at the +expense of more assessments (thus higher annotation costs) and AL strategies +underperform random selection when comparing effectiveness given a fixed +annotation cost. Our results highlight that ``optimal'' subsets of training +data that provide high effectiveness at low annotation cost do exist, but +current mainstream AL strategies applied to PLM rankers are not capable of +identifying them. + +
+
+ comment: Accepted at SIGIR-AP 2023 +
+
+
+
+
+ + ☆ AstroLLaMA: Towards Specialized Foundation Models in Astronomy AACL 2023 + + +
+ Large language models excel in many human-language tasks but often falter in
+highly specialized domains like scholarly astronomy. To bridge this gap, we
+introduce AstroLLaMA, a 7-billion-parameter model fine-tuned from LLaMA-2 using
+over 300,000 astronomy abstracts from arXiv. Optimized for traditional causal
+language modeling, AstroLLaMA achieves a 30% lower perplexity than Llama-2,
+showing marked domain adaptation. Our model generates more insightful and
+scientifically relevant text completions and embedding extractions than
+state-of-the-art foundation models despite having significantly fewer
+parameters. AstroLLaMA serves as a robust, domain-specific model with broad
+fine-tuning potential. Its public release aims to spur astronomy-focused
+research, including automatic paper summarization and conversational agent
+development.
+
+
+ comment: 6 pages, 3 figures, submitted to IJCNLP-AACL 2023. Comments are + welcome. The model can be found on Hugging Face - + https://huggingface.co/universeTBD/astrollama +
+
+
+
+
+ + ☆ Characterizing Latent Perspectives of Media Houses Towards Public + Figures + + +
+ Media houses reporting on public figures, often come with their own biases +stemming from their respective worldviews. A characterization of these +underlying patterns helps us in better understanding and interpreting news +stories. For this, we need diverse or subjective summarizations, which may not +be amenable for classifying into predefined class labels. This work proposes a +zero-shot approach for non-extractive or generative characterizations of person +entities from a corpus using GPT-2. We use well-articulated articles from +several well-known news media houses as a corpus to build a sound argument for +this approach. First, we fine-tune a GPT-2 pre-trained language model with a +corpus where specific person entities are characterized. Second, we further +fine-tune this with demonstrations of person entity characterizations, created +from a corpus of programmatically constructed characterizations. This twice +fine-tuned model is primed with manual prompts consisting of entity names that +were not previously encountered in the second fine-tuning, to generate a simple +sentence about the entity. The results were encouraging, when compared against +actual characterizations from the corpus. + +
+
+
+
+
+ + ☆ Towards Visual Taxonomy Expansion + + +
+ Taxonomy expansion task is essential in organizing the ever-increasing volume +of new concepts into existing taxonomies. Most existing methods focus +exclusively on using textual semantics, leading to an inability to generalize +to unseen terms and the "Prototypical Hypernym Problem." In this paper, we +propose Visual Taxonomy Expansion (VTE), introducing visual features into the +taxonomy expansion task. We propose a textual hypernymy learning task and a +visual prototype learning task to cluster textual and visual semantics. In +addition to the tasks on respective modalities, we introduce a hyper-proto +constraint that integrates textual and visual semantics to produce fine-grained +visual semantics. Our method is evaluated on two datasets, where we obtain +compelling results. Specifically, on the Chinese taxonomy dataset, our method +significantly improves accuracy by 8.75 %. Additionally, our approach performs +better than ChatGPT on the Chinese taxonomy dataset. + +
+
+ comment: ACMMM accepted paper +
+
+
+
+
+ + ☆ Measuring Catastrophic Forgetting in Cross-Lingual Transfer Paradigms: + Exploring Tuning Strategies + + +
+ Cross-lingual transfer is a promising technique to solve tasks in
+less-resourced languages. In this empirical study, we compare two fine-tuning
+approaches combined with zero-shot and full-shot learning approaches for large
+language models in a cross-lingual setting. As fine-tuning strategies, we
+compare parameter-efficient adapter methods with fine-tuning of all parameters.
+As cross-lingual transfer strategies, we compare intermediate training (IT),
+which uses each language sequentially, and cross-lingual validation (CLV),
+which uses the target language already in the validation phase of fine-tuning.
+We assess the success of transfer and the extent of catastrophic forgetting in
+a source language due to cross-lingual transfer, i.e., how much previously
+acquired knowledge is lost when we learn new information in a different
+language. The results on two different classification problems, hate speech
+detection and product reviews, each containing datasets in several languages,
+show that the IT cross-lingual strategy outperforms CLV for the target
+language. Our findings indicate that, in the majority of cases, the CLV
+strategy demonstrates superior retention of knowledge in the base language
+(English) compared to the IT strategy, when evaluating catastrophic forgetting
+in multiple cross-lingual transfers.
+
+
+
+
+
+ + ☆ BHASA: A Holistic Southeast Asian Linguistic and Cultural Evaluation + Suite for Large Language Models + + +
+ The rapid development of Large Language Models (LLMs) and the emergence of +novel abilities with scale have necessitated the construction of holistic, +diverse and challenging benchmarks such as HELM and BIG-bench. However, at the +moment, most of these benchmarks focus only on performance in English and +evaluations that include Southeast Asian (SEA) languages are few in number. We +therefore propose BHASA, a holistic linguistic and cultural evaluation suite +for LLMs in SEA languages. It comprises three components: (1) a NLP benchmark +covering eight tasks across Natural Language Understanding (NLU), Generation +(NLG) and Reasoning (NLR) tasks, (2) LINDSEA, a linguistic diagnostic toolkit +that spans the gamut of linguistic phenomena including syntax, semantics and +pragmatics, and (3) a cultural diagnostics dataset that probes for both +cultural representation and sensitivity. For this preliminary effort, we +implement the NLP benchmark only for Indonesian, Vietnamese, Thai and Tamil, +and we only include Indonesian and Tamil for LINDSEA and the cultural +diagnostics dataset. As GPT-4 is purportedly one of the best-performing +multilingual LLMs at the moment, we use it as a yardstick to gauge the +capabilities of LLMs in the context of SEA languages. Our initial experiments +on GPT-4 with BHASA find it lacking in various aspects of linguistic +capabilities, cultural representation and sensitivity in the targeted SEA +languages. BHASA is a work in progress and will continue to be improved and +expanded in the future. + +
+
+ comment: 86 pages, 7 figures +
+
+
+
+
+ + ☆ RAP-Gen: Retrieval-Augmented Patch Generation with CodeT5 for Automatic + Program Repair + + +
+ Automatic program repair (APR) is crucial to reduce manual debugging efforts +for developers and improve software reliability. While conventional +search-based techniques typically rely on heuristic rules or a redundancy +assumption to mine fix patterns, recent years have witnessed the surge of deep +learning (DL) based approaches to automate the program repair process in a +data-driven manner. However, their performance is often limited by a fixed set +of parameters to model the highly complex search space of APR. To ease such +burden on the parametric models, in this work, we propose a novel +Retrieval-Augmented Patch Generation framework (RAP-Gen) by explicitly +leveraging relevant fix patterns retrieved from a codebase of previous bug-fix +pairs. Specifically, we build a hybrid patch retriever to account for both +lexical and semantic matching based on the raw source code in a +language-agnostic manner, which does not rely on any code-specific features. In +addition, we adapt a code-aware language model CodeT5 as our foundation model +to facilitate both patch retrieval and generation tasks in a unified manner. We +adopt a stage-wise approach where the patch retriever first retrieves a +relevant external bug-fix pair to augment the buggy input for the CodeT5 patch +generator, which synthesizes a ranked list of repair patch candidates. Notably, +RAP-Gen is a generic APR framework that can flexibly integrate different patch +retrievers and generators to repair various types of bugs. We thoroughly +evaluate RAP-Gen on three benchmarks in two programming languages, including +the TFix benchmark in JavaScript, and Code Refinement and Defects4J benchmarks +in Java, where the bug localization information may or may not be provided. +Experimental results show that RAP-Gen significantly outperforms previous +state-of-the-art approaches on all benchmarks, e.g., repairing 15 more bugs on +818 Defects4J bugs. + +
+
+ comment: FSE 2023, Long paper +
+
+
+
+
+
+   ☆ How does representation impact in-context learning: An exploration on a
+  synthetic task
+
+
+ In-context learning, i.e., learning from in-context samples, is an impressive
+ability of Transformers. However, the mechanism driving in-context learning is
+not yet fully understood. In this study, we investigate it from the
+underexplored perspective of representation learning. The representation is
+more complex in the in-context learning scenario, where it can be impacted by
+both model weights and in-context samples. We refer to these two conceptual
+aspects of representation as the in-weights component and the in-context
+component, respectively. To study how the two components affect in-context
+learning capabilities, we construct a novel synthetic task, making it possible
+to devise two probes, an in-weights probe and an in-context probe, to evaluate
+the two components, respectively. We demonstrate that the quality of the
+in-context component is highly related to the in-context learning performance,
+which indicates the entanglement between in-context learning and representation
+learning. Furthermore, we find that a good in-weights component can actually
+benefit the learning of the in-context component, indicating that in-weights
+learning should be the foundation of in-context learning. To further understand
+the in-context learning mechanism and the importance of the in-weights
+component, we prove by construction that a simple Transformer, which uses a
+pattern-matching and copy-paste mechanism to perform in-context learning, can
+match the in-context learning performance of a more complex, best-tuned
+Transformer under the perfect in-weights component assumption. In short, these
+findings from the representation learning perspective shed light on new
+approaches to improve the in-context capacity.
+
+
+
+
+
+ + ☆ Content Reduction, Surprisal and Information Density Estimation for Long + Documents + + +
+ Many computational linguistic methods have been proposed to study the +information content of languages. We consider two interesting research +questions: 1) how is information distributed over long documents, and 2) how +does content reduction, such as token selection and text summarization, affect +the information density in long documents. We present four criteria for +information density estimation for long documents, including surprisal, +entropy, uniform information density, and lexical density. Among those +criteria, the first three adopt the measures from information theory. We +propose an attention-based word selection method for clinical notes and study +machine summarization for multiple-domain documents. Our findings reveal the +systematic difference in information density of long text in various domains. +Empirical results on automated medical coding from long clinical notes show the +effectiveness of the attention-based word selection method. + +
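+ As background for the surprisal criterion above: the surprisal of a token is
+the negative log-probability of that token given its preceding context under a
+language model. The sketch below estimates per-token surprisal (in nats) with a
+causal LM; the choice of gpt2 and the example sentence are arbitrary
+assumptions.
+<pre><code class="language-python">
+import torch
+from transformers import AutoModelForCausalLM, AutoTokenizer
+
+def token_surprisal(text: str, model_name: str = "gpt2"):
+    # Surprisal of token w_t is -log p(w_t | preceding tokens), in nats here.
+    tok = AutoTokenizer.from_pretrained(model_name)
+    model = AutoModelForCausalLM.from_pretrained(model_name).eval()
+    ids = tok(text, return_tensors="pt").input_ids
+    with torch.no_grad():
+        logits = model(ids).logits
+    log_probs = torch.log_softmax(logits[:, :-1], dim=-1)
+    targets = ids[:, 1:]
+    s = -log_probs.gather(-1, targets.unsqueeze(-1)).squeeze(-1)
+    return list(zip(tok.convert_ids_to_tokens(targets[0].tolist()), s[0].tolist()))
+
+print(token_surprisal("The patient was discharged in stable condition."))
+</code></pre>
+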
+
+
+
+
+ + ☆ Circuit Breaking: Removing Model Behaviors with Targeted Ablation + + +
+ Language models often exhibit behaviors that improve performance on a +pre-training objective but harm performance on downstream tasks. We propose a +novel approach to removing undesirable behaviors by ablating a small number of +causal pathways between model components, with the intention of disabling the +computational circuit responsible for the bad behavior. Given a small dataset +of inputs where the model behaves poorly, we learn to ablate a small number of +important causal pathways. In the setting of reducing GPT-2 toxic language +generation, we find ablating just 12 of the 11.6K causal edges mitigates toxic +generation with minimal degradation of performance on other inputs. + +
+
+
+
+
+ + ☆ Evaluating the Ebb and Flow: An In-depth Analysis of Question-Answering + Trends across Diverse Platforms + + +
+ Community Question Answering (CQA) platforms steadily gain popularity as they +provide users with fast responses to their queries. The swiftness of these +responses is contingent on a mixture of query-specific and user-related +elements. This paper scrutinizes these contributing factors within the context +of six highly popular CQA platforms, identified through their standout +answering speed. Our investigation reveals a correlation between the time taken +to yield the first response to a question and several variables: the metadata, +the formulation of the questions, and the level of interaction among users. +Additionally, by employing conventional machine learning models to analyze +these metadata and patterns of user interaction, we endeavor to predict which +queries will receive their initial responses promptly. + +
+
+ comment: Under review +
+
+
+
+
+ + ☆ The Moral Machine Experiment on Large Language Models + + +
+ As large language models (LLMs) become more deeply integrated into various +sectors, understanding how they make moral judgments has become crucial, +particularly in the realm of autonomous driving. This study utilized the Moral +Machine framework to investigate the ethical decision-making tendencies of +prominent LLMs, including GPT-3.5, GPT-4, PaLM 2, and Llama 2, comparing their +responses to human preferences. While LLMs' and humans' preferences such as +prioritizing humans over pets and favoring saving more lives are broadly +aligned, PaLM 2 and Llama 2, especially, evidence distinct deviations. +Additionally, despite the qualitative similarities between the LLM and human +preferences, there are significant quantitative disparities, suggesting that +LLMs might lean toward more uncompromising decisions, compared to the milder +inclinations of humans. These insights elucidate the ethical frameworks of LLMs +and their potential implications for autonomous driving. + +
+
+ comment: 12 pages, 2 Figures +
+
+
+
+
+ + ☆ Balanced and Explainable Social Media Analysis for Public Health with + Large Language Models + + +
+ As social media becomes increasingly popular, more and more public health +activities emerge, which is worth noting for pandemic monitoring and government +decision-making. Current techniques for public health analysis involve popular +models such as BERT and large language models (LLMs). Although recent progress +in LLMs has shown a strong ability to comprehend knowledge by being fine-tuned +on specific domain datasets, the costs of training an in-domain LLM for every +specific public health task are especially expensive. Furthermore, such kinds +of in-domain datasets from social media are generally highly imbalanced, which +will hinder the efficiency of LLMs tuning. To tackle these challenges, the data +imbalance issue can be overcome by sophisticated data augmentation methods for +social media datasets. In addition, the ability of the LLMs can be effectively +utilised by prompting the model properly. In light of the above discussion, in +this paper, a novel ALEX framework is proposed for social media analysis on +public health. Specifically, an augmentation pipeline is developed to resolve +the data imbalance issue. Furthermore, an LLMs explanation mechanism is +proposed by prompting an LLM with the predicted results from BERT models. +Extensive experiments conducted on three tasks at the Social Media Mining for +Health 2023 (SMM4H) competition with the first ranking in two tasks demonstrate +the superior performance of the proposed ALEX method. Our code has been +released in https://github.com/YanJiangJerry/ALEX. + +
+
+ comment: arXiv admin note: text overlap with arXiv:2309.04213 +
+
+
+
+
+ + ☆ Language Models as Black-Box Optimizers for Vision-Language Models + + +
+ Vision-language models (VLMs) pre-trained on web-scale datasets have +demonstrated remarkable capabilities across a variety of vision and multimodal +tasks. Currently, fine-tuning methods for VLMs mainly operate in a white-box +setting, requiring access to model parameters for backpropagation. However, +many VLMs rely on proprietary data and are not open-source, which restricts the +use of white-box approaches for fine-tuning. Given that popular private large +language models (LLMs) like ChatGPT still offer a language-based user +interface, we aim to develop a novel fine-tuning approach for VLMs through +natural language prompts, thereby avoiding the need to access model parameters, +feature embeddings, or output logits. In this setup, we propose employing +chat-based LLMs as black-box optimizers to search for the best text prompt on +the illustrative task of few-shot image classification using CLIP. +Specifically, we adopt an automatic "hill-climbing" procedure that converges on +an effective prompt by evaluating the accuracy of current prompts and asking +LLMs to refine them based on textual feedback, all within a conversational +process without human-in-the-loop. In a challenging 1-shot learning setup, our +simple approach surpasses the white-box continuous prompting method CoOp by an +average of 1.5% across 11 datasets including ImageNet. Our approach also +outperforms OpenAI's manually crafted prompts and is more efficient than other +black-box methods like iterative APE. Additionally, we highlight the advantage +of conversational feedback incorporating both positive and negative prompts, +suggesting that LLMs can utilize the implicit "gradient" direction in textual +feedback for a more efficient search. Lastly, we find that the text prompts +generated through our strategy are not only more interpretable but also +transfer well across different CLIP architectures in a black-box manner. + +
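+ A skeleton of the conversational hill-climbing loop described above. Because
+the abstract does not specify the exact protocol, the LLM call and the CLIP
+few-shot evaluation are left as user-supplied callables, and all names are
+assumptions.
+<pre><code class="language-python">
+def hill_climb_prompt(eval_accuracy, propose_prompts,
+                      init_prompt="a photo of a {}", num_rounds=10):
+    """Generic black-box prompt search in the spirit of the approach above.
+
+    eval_accuracy(prompt) -> float: few-shot accuracy of a CLIP-style classifier
+        using the prompt template (user-supplied).
+    propose_prompts(feedback) -> list of candidate rewrites returned by a chat
+        LLM given textual feedback on the current prompt (also user-supplied).
+    """
+    best, best_acc = init_prompt, eval_accuracy(init_prompt)
+    for _ in range(num_rounds):
+        feedback = (f"The prompt '{best}' reached accuracy {best_acc:.3f}. "
+                    "Propose improved prompt templates.")
+        for cand in propose_prompts(feedback):
+            acc = eval_accuracy(cand)
+            if acc > best_acc:
+                best, best_acc = cand, acc
+    return best, best_acc
+</code></pre>
+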
+
+
+
+
+ + ☆ Answering Subjective Induction Questions on Products by Summarizing + Multi-sources Multi-viewpoints Knowledge + + +
+ This paper proposes a new task in the field of Answering Subjective Induction Questions on Products (SUBJPQA). The answer to this kind of question is non-unique, but can be interpreted from many perspectives. For example, the answer to 'whether the phone is heavy' has a variety of different viewpoints. A satisfactory answer should be able to summarize these subjective opinions from multiple sources and provide objective knowledge, such as the weight of a phone. That is quite different from the traditional QA task, in which the answer to a factoid question is unique and can be found from a single data source. To address this new task, we propose a three-step method. We first retrieve all answer-related clues from multiple knowledge sources on facts and opinions. The implicit commonsense facts are also collected to supplement the necessary but missing contexts. We then capture their relevance to the question via interactive attention. Next, we design a reinforcement-based summarizer to aggregate all these knowledgeable clues. Based on a template-controlled decoder, we can output a comprehensive and multi-perspective answer. Due to the lack of a relevant evaluation benchmark for the new task, we construct a large-scale dataset, named SupQA, consisting of 48,352 samples across 15 product domains. Evaluation results show the effectiveness of our approach.
+
+
+
+
+ + ☆ Do PLMs Know and Understand Ontological Knowledge? ACL 2023 + + +
+ Ontological knowledge, which comprises classes and properties and their +relationships, is integral to world knowledge. It is significant to explore +whether Pretrained Language Models (PLMs) know and understand such knowledge. +However, existing PLM-probing studies focus mainly on factual knowledge, +lacking a systematic probing of ontological knowledge. In this paper, we focus +on probing whether PLMs store ontological knowledge and have a semantic +understanding of the knowledge rather than rote memorization of the surface +form. To probe whether PLMs know ontological knowledge, we investigate how well +PLMs memorize: (1) types of entities; (2) hierarchical relationships among +classes and properties, e.g., Person is a subclass of Animal and Member of +Sports Team is a subproperty of Member of ; (3) domain and range constraints of +properties, e.g., the subject of Member of Sports Team should be a Person and +the object should be a Sports Team. To further probe whether PLMs truly +understand ontological knowledge beyond memorization, we comprehensively study +whether they can reliably perform logical reasoning with given knowledge +according to ontological entailment rules. Our probing results show that PLMs +can memorize certain ontological knowledge and utilize implicit knowledge in +reasoning. However, both the memorizing and reasoning performances are less +than perfect, indicating incomplete knowledge and understanding. + +
+
+ comment: Accepted by ACL 2023 (Outstanding Paper Award) +
+
+
+
+
+ + ☆ A Survey of Hallucination in Large Foundation Models + + +
+ Hallucination in a foundation model (FM) refers to the generation of content +that strays from factual reality or includes fabricated information. This +survey paper provides an extensive overview of recent efforts that aim to +identify, elucidate, and tackle the problem of hallucination, with a particular +focus on ``Large'' Foundation Models (LFMs). The paper classifies various types +of hallucination phenomena that are specific to LFMs and establishes evaluation +criteria for assessing the extent of hallucination. It also examines existing +strategies for mitigating hallucination in LFMs and discusses potential +directions for future research in this area. Essentially, the paper offers a +comprehensive examination of the challenges and solutions related to +hallucination in LFMs. + +
+
+
+
+
+ + ☆ SAGE: Structured Attribute Value Generation for Billion-Scale Product + Catalogs + + +
+ We introduce SAGE, a generative LLM for inferring attribute values for products across world-wide e-Commerce catalogs. We introduce a novel formulation of the attribute-value prediction problem as a Seq2Seq summarization task, across languages, product types and target attributes. Our novel modeling approach lifts the restriction of predicting attribute values within a pre-specified set of choices, as well as the requirement that the sought attribute values need to be explicitly mentioned in the text. SAGE can infer attribute values even when such values are mentioned implicitly using periphrastic language, or not at all, as is the case for common-sense defaults. Additionally, SAGE is capable of predicting whether an attribute is inapplicable for the product at hand, or non-obtainable from the available information. SAGE is the first method able to tackle all aspects of the attribute-value-prediction task as they arise in practical settings in e-Commerce catalogs. A comprehensive set of experiments demonstrates the effectiveness of the proposed approach, as well as its superiority against state-of-the-art competing alternatives. Moreover, our experiments highlight SAGE's ability to tackle the task of predicting attribute values in a zero-shot setting, thereby opening up opportunities for significantly reducing the overall number of labeled examples required for training.
+
+ comment: (17 pages) +
+
+
+
+
+ + ☆ Stochastic LLMs do not Understand Language: Towards Symbolic, + Explainable and Ontologically Based LLMs + + +
+ In our opinion the exuberance surrounding the relative success of data-driven large language models (LLMs) is slightly misguided, for several reasons: (i) LLMs cannot be relied upon for factual information since for LLMs all ingested text (factual or non-factual) was created equal; (ii) due to their subsymbolic nature, whatever 'knowledge' these models acquire about language will always be buried in billions of microfeatures (weights), none of which is meaningful on its own; and (iii) LLMs will often fail to make the correct inferences in several linguistic contexts (e.g., nominal compounds, copredication, quantifier scope ambiguities, intensional contexts). Since we believe the relative success of data-driven large language models (LLMs) is not a reflection on the symbolic vs. subsymbolic debate but a reflection on applying the successful strategy of a bottom-up reverse engineering of language at scale, we suggest in this paper applying the effective bottom-up strategy in a symbolic setting, resulting in symbolic, explainable, and ontologically grounded language models.
+
+ comment: 17 pages +
+
+
+
+
+ + ☆ RT-LM: Uncertainty-Aware Resource Management for Real-Time Inference of + Language Models + + +
+ Recent advancements in language models (LMs) have gained substantial attention for their capability to generate human-like responses. Though exhibiting a promising future for various applications such as conversational AI, these LMs face deployment challenges on various devices due to their extreme computational cost and unpredictable inference latency. Such varied inference latency, identified as a consequence of uncertainty intrinsic to the nature of language, can lead to computational inefficiency and degrade the overall performance of LMs, especially under high-traffic workloads. Unfortunately, the bandwidth of these uncertainty sources is extensive, complicating the prediction of latency and the effects emanating from such uncertainties. To understand and mitigate the impact of uncertainty on real-time response-demanding systems, we take the first step to comprehend, quantify and optimize these uncertainty-induced latency performance variations in LMs. Specifically, we present RT-LM, an uncertainty-aware resource management ecosystem for real-time inference of LMs. RT-LM innovatively quantifies how specific input uncertainties adversely affect latency, often leading to an increased output length. Exploiting these insights, we devise a lightweight yet effective method to dynamically correlate input text uncertainties with output length at runtime. Utilizing this quantification as a latency heuristic, we integrate the uncertainty information into a system-level scheduler which explores several uncertainty-induced optimization opportunities, including uncertainty-aware prioritization, dynamic consolidation, and strategic CPU offloading. Quantitative experiments across five state-of-the-art LMs on two hardware platforms demonstrate that RT-LM can significantly reduce the average response time and improve throughput while incurring a rather small runtime overhead.
+
+ comment: Accepted by RTSS 2023 +
+
+
+
+
+ + ☆ Do Generative Large Language Models need billions of parameters? + + +
+ This paper presents novel systems and methodologies for the development of +efficient large language models (LLMs). It explores the trade-offs between +model size, performance, and computational resources, with the aim of +maximizing the efficiency of these AI systems. The research explores novel +methods that allow different parts of the model to share parameters, reducing +the total number of unique parameters required. This approach ensures that the +model remains compact without sacrificing its ability to learn and represent +complex language structures. This study provides valuable insights and tools +for creating more efficient and effective LLMs, contributing to a more +sustainable and accessible future for AI language modeling. + +
+
+
+
+
+ + ☆ Text Encoders Lack Knowledge: Leveraging Generative LLMs for + Domain-Specific Semantic Textual Similarity EMNLP-2023 + + +
+ Amidst the sharp rise in the evaluation of large language models (LLMs) on various tasks, we find that semantic textual similarity (STS) has been under-explored. In this study, we show that STS can be cast as a text generation problem while maintaining strong performance on multiple STS benchmarks. Additionally, we show generative LLMs significantly outperform existing encoder-based STS models when characterizing the semantic similarity between two texts with complex semantic relationships dependent on world knowledge. We validate this claim by evaluating both generative LLMs and existing encoder-based STS models on three newly collected STS challenge sets which require world knowledge in the domains of Health, Politics, and Sports. All newly collected data is sourced from social media content posted after May 2023 to ensure the performance of closed-source models like ChatGPT cannot be credited to memorization. Our results show that generative LLMs outperform the best encoder-only baselines by an average of 22.3% on STS tasks requiring world knowledge. Our results suggest generative language models with STS-specific prompting strategies achieve state-of-the-art performance in complex, domain-specific STS tasks.
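Casting STS as text generation, as described above, essentially means asking a generative model for a similarity rating and parsing the answer. The snippet below is a minimal, hypothetical illustration of that idea; the 0-5 rating scale, the prompt wording, and the `query_llm` callable are assumptions rather than the paper's exact prompting strategy.

```python
# Hypothetical sketch: semantic textual similarity via a generative LLM rating.
import re
from typing import Callable

def generative_sts(sent_a: str, sent_b: str, query_llm: Callable[[str], str]) -> float:
    prompt = (
        "On a scale from 0 (completely unrelated) to 5 (semantically equivalent), "
        "how similar are these two sentences? Answer with a single number.\n"
        f"Sentence 1: {sent_a}\nSentence 2: {sent_b}"
    )
    reply = query_llm(prompt)
    match = re.search(r"\d+(?:\.\d+)?", reply)   # tolerate extra words around the score
    return float(match.group()) if match else 0.0

if __name__ == "__main__":
    score = generative_sts(
        "The team clinched the title last night.",
        "They won the championship yesterday evening.",
        query_llm=lambda p: "4.5",               # stand-in for a real LLM call
    )
    print(score)
```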
+
+ comment: Under review GEM@EMNLP-2023, 12 pages +
+
+
+
+
+ + ☆ Minimum Bayes' Risk Decoding for System Combination of Grammatical Error + Correction Systems + + +
+ For sequence-to-sequence tasks it is challenging to combine individual system +outputs. Further, there is also often a mismatch between the decoding criterion +and the one used for assessment. Minimum Bayes' Risk (MBR) decoding can be used +to combine system outputs in a manner that encourages better alignment with the +final assessment criterion. This paper examines MBR decoding for Grammatical +Error Correction (GEC) systems, where performance is usually evaluated in terms +of edits and an associated F-score. Hence, we propose a novel MBR loss function +directly linked to this form of criterion. Furthermore, an approach to expand +the possible set of candidate sentences is described. This builds on a current +max-voting combination scheme, as well as individual edit-level selection. +Experiments on three popular GEC datasets and with state-of-the-art GEC systems +demonstrate the efficacy of the proposed MBR approach. Additionally, the paper +highlights how varying reward metrics within the MBR decoding framework can +provide control over precision, recall, and the F-score in combined GEC +systems. + +
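The MBR combination idea above can be summarised as: score each candidate correction by its expected utility against the other candidates and return the highest-scoring one. The sketch below is a generic, hypothetical version of that selection step; the edit-based utility (e.g. an edit F-score) is left as a callable because the paper's exact loss function is not reproduced here, and the word-overlap F1 used in the demo is only a crude stand-in.

```python
# Hypothetical sketch of Minimum Bayes' Risk selection over GEC system outputs.
from typing import Callable, List

def mbr_select(candidates: List[str],
               utility: Callable[[str, str], float]) -> str:
    """Pick the candidate with the highest average utility against the other candidates."""
    best, best_score = candidates[0], float("-inf")
    for i, hyp in enumerate(candidates):
        others = [ref for j, ref in enumerate(candidates) if j != i]
        score = sum(utility(hyp, ref) for ref in others) / max(len(others), 1)
        if score > best_score:
            best, best_score = hyp, score
    return best

if __name__ == "__main__":
    # Toy utility: word-overlap F1 as a crude stand-in for an edit-based F-score.
    def overlap_f1(hyp: str, ref: str) -> float:
        h, r = set(hyp.split()), set(ref.split())
        if not h or not r:
            return 0.0
        p, rec = len(h & r) / len(h), len(h & r) / len(r)
        return 2 * p * rec / (p + rec) if p + rec else 0.0

    outputs = ["He go to school .", "He goes to school .", "He goes to the school ."]
    print(mbr_select(outputs, overlap_f1))
```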
+
+
+
+
+ + ☆ Overview of Memotion 3: Sentiment and Emotion Analysis of Codemixed + Hinglish Memes AAAI 2023 + + +
+ Analyzing memes on the internet has emerged as a crucial endeavor due to the +impact this multi-modal form of content wields in shaping online discourse. +Memes have become a powerful tool for expressing emotions and sentiments, +possibly even spreading hate and misinformation, through humor and sarcasm. In +this paper, we present the overview of the Memotion 3 shared task, as part of +the DeFactify 2 workshop at AAAI-23. The task released an annotated dataset of +Hindi-English code-mixed memes based on their Sentiment (Task A), Emotion (Task +B), and Emotion intensity (Task C). Each of these is defined as an individual +task and the participants are ranked separately for each task. Over 50 teams +registered for the shared task and 5 made final submissions to the test set of +the Memotion 3 dataset. CLIP, BERT modifications, ViT etc. were the most +popular models among the participants along with approaches such as +Student-Teacher model, Fusion, and Ensembling. The best final F1 score for Task +A is 34.41, Task B is 79.77 and Task C is 59.82. + +
+
+ comment: Defactify2 @AAAI 2023 +
+
+
+
+
+ + ☆ Leveraging Large Language Models and Weak Supervision for Social Media + data annotation: an evaluation using COVID-19 self-reported vaccination + tweets + + +
+ The COVID-19 pandemic has presented significant challenges to the healthcare industry and society as a whole. With the rapid development of COVID-19 vaccines, social media platforms have become a popular medium for discussions on vaccine-related topics. Identifying vaccine-related tweets and analyzing them can provide valuable insights for public health researchers and policymakers. However, manual annotation of a large number of tweets is time-consuming and expensive. In this study, we evaluate the usage of Large Language Models, in this case GPT-4 (March 23 version), and weak supervision, to identify COVID-19 vaccine-related tweets, with the purpose of comparing performance against human annotators. We leveraged a manually curated gold-standard dataset and used GPT-4 to provide labels without any additional fine-tuning or instructing, in a single-shot mode (no additional prompting).
+
+
+
+
+ + ☆ Leveraging Large Language Models for Automated Dialogue Analysis SIGDIAL 2023 + + +
+ Developing high-performing dialogue systems benefits from the automatic +identification of undesirable behaviors in system responses. However, detecting +such behaviors remains challenging, as it draws on a breadth of general +knowledge and understanding of conversational practices. Although recent +research has focused on building specialized classifiers for detecting specific +dialogue behaviors, the behavior coverage is still incomplete and there is a +lack of testing on real-world human-bot interactions. This paper investigates +the ability of a state-of-the-art large language model (LLM), ChatGPT-3.5, to +perform dialogue behavior detection for nine categories in real human-bot +dialogues. We aim to assess whether ChatGPT can match specialized models and +approximate human performance, thereby reducing the cost of behavior detection +tasks. Our findings reveal that neither specialized models nor ChatGPT have yet +achieved satisfactory results for this task, falling short of human +performance. Nevertheless, ChatGPT shows promising potential and often +outperforms specialized detection models. We conclude with an in-depth +examination of the prevalent shortcomings of ChatGPT, offering guidance for +future research to enhance LLM capabilities. + +
+
+ comment: Accepted to SIGDIAL 2023 +
+
+
+
+
+ + ☆ Widely Interpretable Semantic Representation: Frameless Meaning + Representation for Broader Applicability + + +
+ This paper presents a novel semantic representation, WISeR, that overcomes +challenges for Abstract Meaning Representation (AMR). Despite its strengths, +AMR is not easily applied to languages or domains without predefined semantic +frames, and its use of numbered arguments results in semantic role labels, +which are not directly interpretable and are semantically overloaded for +parsers. We examine the numbered arguments of predicates in AMR and convert +them to thematic roles that do not require reference to semantic frames. We +create a new corpus of 1K English dialogue sentences annotated in both WISeR +and AMR. WISeR shows stronger inter-annotator agreement for beginner and +experienced annotators, with beginners becoming proficient in WISeR annotation +more quickly. Finally, we train a state-of-the-art parser on the AMR 3.0 corpus +and a WISeR corpus converted from AMR 3.0. The parser is evaluated on these +corpora and our dialogue corpus. The WISeR model exhibits higher accuracy than +its AMR counterpart across the board, demonstrating that WISeR is easier for +parsers to learn. + +
+
+
+
+
+ + ☆ Narrowing the Gap between Supervised and Unsupervised Sentence + Representation Learning with Large Language Model + + +
+ Sentence Representation Learning (SRL) is a fundamental task in Natural Language Processing (NLP), with Contrastive learning of Sentence Embeddings (CSE) as the mainstream technique due to its superior performance. An intriguing phenomenon in CSE is the significant performance gap between supervised and unsupervised methods, even when their sentence encoder and loss function are the same. Previous works attribute this performance gap to differences in two representation properties (alignment and uniformity). However, alignment and uniformity only measure the results, which means they cannot answer "What happens during the training process that leads to the performance gap?" and "How can the performance gap be narrowed?". In this paper, we conduct empirical experiments to answer these "What" and "How" questions. We first answer the "What" question by thoroughly comparing the behavior of supervised and unsupervised CSE during their respective training processes. From the comparison, we observe a significant difference in fitting difficulty. Thus, we introduce a metric, called Fitting Difficulty Increment (FDI), to measure the fitting difficulty gap between the evaluation dataset and the held-out training dataset, and use the metric to answer the "What" question. Then, based on the insights gained from the "What" question, we tackle the "How" question by increasing the fitting difficulty of the training dataset. We achieve this by leveraging the In-Context Learning (ICL) capability of the Large Language Model (LLM) to generate data that simulates complex patterns. By utilizing the hierarchical patterns in the LLM-generated data, we effectively narrow the gap between supervised and unsupervised CSE.
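The abstract defines FDI only loosely as a gap in fitting difficulty between the evaluation set and a held-out training slice. The sketch below is one very rough reading of such a quantity, crudely proxying fitting difficulty by an average loss; the use of average loss, the callable `loss_fn`, and the toy data are all assumptions for illustration and may differ from the paper's exact definition.

```python
# Hypothetical sketch of a Fitting-Difficulty-Increment-style gap measure.
from typing import Callable, List, Tuple

def fitting_difficulty(loss_fn: Callable[[Tuple[str, str]], float],
                       dataset: List[Tuple[str, str]]) -> float:
    """Average per-example loss as a crude stand-in for fitting difficulty."""
    return sum(loss_fn(example) for example in dataset) / len(dataset)

def fdi(loss_fn: Callable[[Tuple[str, str]], float],
        eval_set: List[Tuple[str, str]],
        heldout_train_set: List[Tuple[str, str]]) -> float:
    """Difficulty on the evaluation set minus difficulty on held-out training data."""
    return (fitting_difficulty(loss_fn, eval_set)
            - fitting_difficulty(loss_fn, heldout_train_set))

if __name__ == "__main__":
    # Toy loss: pretend pairs with longer second elements are harder to fit.
    toy_loss = lambda pair: 0.01 * len(pair[1])
    eval_pairs = [("a photo", "a long and complicated paraphrase of the sentence")]
    train_pairs = [("a photo", "a short paraphrase")]
    print(fdi(toy_loss, eval_pairs, train_pairs))
```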
+
+ comment: work in progress +
+
+
+
+
+ + ☆ Exploring Large Language Models for Ontology Alignment ISWC 2023 + + +
+ This work investigates the applicability of recent generative Large Language +Models (LLMs), such as the GPT series and Flan-T5, to ontology alignment for +identifying concept equivalence mappings across ontologies. To test the +zero-shot performance of Flan-T5-XXL and GPT-3.5-turbo, we leverage challenging +subsets from two equivalence matching datasets of the OAEI Bio-ML track, taking +into account concept labels and structural contexts. Preliminary findings +suggest that LLMs have the potential to outperform existing ontology alignment +systems like BERTMap, given careful framework and prompt design. + +
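Zero-shot equivalence matching of the kind described above can be pictured as a yes/no prompt built from concept labels plus a little structural context. The following is a hypothetical sketch only; the prompt wording, the use of parent classes as context, and the `query_llm` callable are assumptions, not the authors' framework.

```python
# Hypothetical sketch: zero-shot concept-equivalence checking with a chat LLM.
from typing import Callable, List

def concepts_equivalent(label_a: str, parents_a: List[str],
                        label_b: str, parents_b: List[str],
                        query_llm: Callable[[str], str]) -> bool:
    prompt = (
        "Source concept: "
        f"'{label_a}' (subclass of: {', '.join(parents_a) or 'none'})\n"
        "Target concept: "
        f"'{label_b}' (subclass of: {', '.join(parents_b) or 'none'})\n"
        "Do these two ontology concepts refer to the same thing? Answer Yes or No."
    )
    return query_llm(prompt).strip().lower().startswith("yes")

if __name__ == "__main__":
    same = concepts_equivalent(
        "myocardial infarction", ["ischemic heart disease"],
        "heart attack", ["cardiovascular disorder"],
        query_llm=lambda p: "Yes",    # stand-in for a GPT-3.5 / Flan-T5-XXL call
    )
    print(same)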
+
+ comment: Accepted at ISWC 2023 (Posters and Demos) +
+
+
+
+
+ + ♻ ☆ Open-world Story Generation with Structured Knowledge Enhancement: A + Comprehensive Survey + + +
+ Storytelling and narrative are fundamental to human experience, intertwined +with our social and cultural engagement. As such, researchers have long +attempted to create systems that can generate stories automatically. In recent +years, powered by deep learning and massive data resources, automatic story +generation has shown significant advances. However, considerable challenges, +like the need for global coherence in generated stories, still hamper +generative models from reaching the same storytelling ability as human +narrators. To tackle these challenges, many studies seek to inject structured +knowledge into the generation process, which is referred to as structured +knowledge-enhanced story generation. Incorporating external knowledge can +enhance the logical coherence among story events, achieve better knowledge +grounding, and alleviate over-generalization and repetition problems in +stories. This survey provides the latest and comprehensive review of this +research field: (i) we present a systematic taxonomy regarding how existing +methods integrate structured knowledge into story generation; (ii) we summarize +involved story corpora, structured knowledge datasets, and evaluation metrics; +(iii) we give multidimensional insights into the challenges of +knowledge-enhanced story generation and cast light on promising directions for +future study. + +
+
+ comment: Accepted in Neurocomputing +
+
+
+
+
+ + ♻ ☆ Leveraging Large Language Models for Exploiting ASR Uncertainty + + +
+ While large language models excel in a variety of natural language processing +(NLP) tasks, to perform well on spoken language understanding (SLU) tasks, they +must either rely on off-the-shelf automatic speech recognition (ASR) systems +for transcription, or be equipped with an in-built speech modality. This work +focuses on the former scenario, where LLM's accuracy on SLU tasks is +constrained by the accuracy of a fixed ASR system on the spoken input. +Specifically, we tackle speech-intent classification task, where a high +word-error-rate can limit the LLM's ability to understand the spoken intent. +Instead of chasing a high accuracy by designing complex or specialized +architectures regardless of deployment costs, we seek to answer how far we can +go without substantially changing the underlying ASR and LLM, which can +potentially be shared by multiple unrelated tasks. To this end, we propose +prompting the LLM with an n-best list of ASR hypotheses instead of only the +error-prone 1-best hypothesis. We explore prompt-engineering to explain the +concept of n-best lists to the LLM; followed by the finetuning of Low-Rank +Adapters on the downstream tasks. Our approach using n-best lists proves to be +effective on a device-directed speech detection task as well as on a keyword +spotting task, where systems using n-best list prompts outperform those using +1-best ASR hypothesis; thus paving the way for an efficient method to exploit +ASR uncertainty via LLMs for speech-based applications. + +
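The key move above is to show the LLM several ASR hypotheses rather than a single error-prone transcript. Below is a minimal, hypothetical sketch of how such an n-best prompt might be assembled for intent classification; the prompt template, the intent labels, and the `query_llm` callable are assumptions, not the paper's exact recipe (which also involves LoRA fine-tuning not shown here).

```python
# Hypothetical sketch: prompting an LLM with an n-best list of ASR hypotheses.
from typing import Callable, List, Tuple

def classify_intent_from_nbest(nbest: List[Tuple[str, float]],
                               intents: List[str],
                               query_llm: Callable[[str], str]) -> str:
    hyps = "\n".join(f"{i + 1}. ({score:.2f}) {text}"
                     for i, (text, score) in enumerate(nbest))
    prompt = (
        "The lines below are alternative speech-recognition transcripts of one utterance,\n"
        "ordered from most to least likely, with their scores. They may contain errors.\n"
        f"{hyps}\n"
        f"Which intent does the speaker most likely have? Choose one of: {', '.join(intents)}."
    )
    return query_llm(prompt).strip()

if __name__ == "__main__":
    nbest = [("play the massage", 0.61), ("play the message", 0.30), ("pay the message", 0.09)]
    print(classify_intent_from_nbest(
        nbest, ["play_media", "read_messages", "make_payment"],
        query_llm=lambda p: "read_messages"))   # stand-in for a real LLM call
```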
+
+ comment: Added references +
+
+
+
+
+ + ♻ ☆ Prompting Multilingual Large Language Models to Generate Code-Mixed + Texts: The Case of South East Asian Languages + + +
+ While code-mixing is a common linguistic practice in many parts of the world, +collecting high-quality and low-cost code-mixed data remains a challenge for +natural language processing (NLP) research. The recent proliferation of Large +Language Models (LLMs) compels one to ask: how capable are these systems in +generating code-mixed data? In this paper, we explore prompting multilingual +LLMs in a zero-shot manner to generate code-mixed data for seven languages in +South East Asia (SEA), namely Indonesian, Malay, Chinese, Tagalog, Vietnamese, +Tamil, and Singlish. We find that publicly available multilingual +instruction-tuned models such as BLOOMZ and Flan-T5-XXL are incapable of +producing texts with phrases or clauses from different languages. ChatGPT +exhibits inconsistent capabilities in generating code-mixed texts, wherein its +performance varies depending on the prompt template and language pairing. For +instance, ChatGPT generates fluent and natural Singlish texts (an English-based +creole spoken in Singapore), but for English-Tamil language pair, the system +mostly produces grammatically incorrect or semantically meaningless utterances. +Furthermore, it may erroneously introduce languages not specified in the +prompt. Based on our investigation, existing multilingual LLMs exhibit a wide +range of proficiency in code-mixed data generation for SEA languages. As such, +we advise against using LLMs in this context without extensive human checks. + +
+
+ comment: Updating Authors +
+
+
+
+
+ + ♻ ☆ Why Do We Need Neuro-symbolic AI to Model Pragmatic Analogies? + + +
+ A hallmark of intelligence is the ability to use a familiar domain to make +inferences about a less familiar domain, known as analogical reasoning. In this +article, we delve into the performance of Large Language Models (LLMs) in +dealing with progressively complex analogies expressed in unstructured text. We +discuss analogies at four distinct levels of complexity: lexical analogies, +syntactic analogies, semantic analogies, and pragmatic analogies. As the +analogies become more complex, they require increasingly extensive, diverse +knowledge beyond the textual content, unlikely to be found in the lexical +co-occurrence statistics that power LLMs. To address this, we discuss the +necessity of employing Neuro-symbolic AI techniques that combine statistical +and symbolic AI, informing the representation of unstructured text to highlight +and augment relevant content, provide abstraction and guide the mapping +process. Our knowledge-informed approach maintains the efficiency of LLMs while +preserving the ability to explain analogies for pedagogical applications. + +
+
+ comment: 12 pages 3 figures +
+
+
+
+
+ + ♻ ☆ ROSCOE: A Suite of Metrics for Scoring Step-by-Step Reasoning + + +
+ Large language models show improved downstream task performance when prompted +to generate step-by-step reasoning to justify their final answers. These +reasoning steps greatly improve model interpretability and verification, but +objectively studying their correctness (independent of the final answer) is +difficult without reliable methods for automatic evaluation. We simply do not +know how often the stated reasoning steps actually support the final end task +predictions. In this work, we present ROSCOE, a suite of interpretable, +unsupervised automatic scores that improve and extend previous text generation +evaluation metrics. To evaluate ROSCOE against baseline metrics, we design a +typology of reasoning errors and collect synthetic and human evaluation scores +on commonly used reasoning datasets. In contrast with existing metrics, ROSCOE +can measure semantic consistency, logicality, informativeness, fluency, and +factuality - among other traits - by leveraging properties of step-by-step +rationales. We empirically verify the strength of our metrics on five human +annotated and six programmatically perturbed diagnostics datasets - covering a +diverse set of tasks that require reasoning skills and show that ROSCOE can +consistently outperform baseline metrics. + +
+
+
+
+
+ + ♻ ☆ Testing the limits of natural language models for predicting human + language judgments + + +
+ Neural network language models can serve as computational hypotheses about +how humans process language. We compared the model-human consistency of diverse +language models using a novel experimental approach: controversial sentence +pairs. For each controversial sentence pair, two language models disagree about +which sentence is more likely to occur in natural text. Considering nine +language models (including n-gram, recurrent neural networks, and transformer +models), we created hundreds of such controversial sentence pairs by either +selecting sentences from a corpus or synthetically optimizing sentence pairs to +be highly controversial. Human subjects then provided judgments indicating for +each pair which of the two sentences is more likely. Controversial sentence +pairs proved highly effective at revealing model failures and identifying +models that aligned most closely with human judgments. The most +human-consistent model tested was GPT-2, although experiments also revealed +significant shortcomings of its alignment with human perception. + +
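The selection criterion described above, restricted to the corpus-selection case, reduces to: keep sentence pairs on which two models order the sentences oppositely by likelihood. The sketch below illustrates that filter; the two scoring callables stand in for per-sentence log-probabilities under two different language models and are assumptions here, as are the toy scorers in the demo.

```python
# Hypothetical sketch of mining "controversial" sentence pairs from a pool of sentences.
from itertools import combinations
from typing import Callable, List, Tuple

def controversial_pairs(sentences: List[str],
                        logprob_model_a: Callable[[str], float],
                        logprob_model_b: Callable[[str], float]) -> List[Tuple[str, str]]:
    pairs = []
    for s1, s2 in combinations(sentences, 2):
        prefers_s1_a = logprob_model_a(s1) > logprob_model_a(s2)
        prefers_s1_b = logprob_model_b(s1) > logprob_model_b(s2)
        if prefers_s1_a != prefers_s1_b:   # the two models rank the pair in opposite orders
            pairs.append((s1, s2))
    return pairs

if __name__ == "__main__":
    sents = ["the cat sat on the mat", "colorless green ideas sleep furiously", "dog the ran"]
    # Toy scorers: "model A" likes short sentences, "model B" likes alphabetically early ones.
    pairs = controversial_pairs(sents,
                                logprob_model_a=lambda s: -len(s),
                                logprob_model_b=lambda s: -ord(s[0]))
    print(pairs)
```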
+
+
+
+
+ + ♻ ☆ FIAT: Fusing learning paradigms with Instruction-Accelerated Tuning + + +
+ Learning paradigms for large language models (LLMs) currently tend to fall +within either in-context learning (ICL) or full fine-tuning. Each of these +comes with their own trade-offs based on available data, model size, compute +cost, ease-of-use, and final quality with neither solution performing well +across-the-board. In this article, we first describe ICL and fine-tuning +paradigms in a way that highlights their natural connections. Based on these +connections, we propose a new learning paradigm called FIAT that fuses the best +of these paradigms together, enabling prompt-engineered instructions and +chain-of-thought reasoning with the very largest models while also using +similar methods to perform parameter updates on a modestly-sized LLM with +parameter-efficient tuning. We evaluate FIAT's effectiveness on a variety of +multilingual tasks and observe that FIAT performs better than both ICL and +fine-tuning at scales ranging from 100-10,000 training examples. We hope that +FIAT provides a practical way of harnessing the full potential of LLMs without +needing to make a hard choice between learning paradigms. + +
+
+
+
+
+ + ♻ ☆ RoDia: A New Dataset for Romanian Dialect Identification from Speech + + +
+ Dialect identification is a critical task in speech processing and language +technology, enhancing various applications such as speech recognition, speaker +verification, and many others. While most research studies have been dedicated +to dialect identification in widely spoken languages, limited attention has +been given to dialect identification in low-resource languages, such as +Romanian. To address this research gap, we introduce RoDia, the first dataset +for Romanian dialect identification from speech. The RoDia dataset includes a +varied compilation of speech samples from five distinct regions of Romania, +covering both urban and rural environments, totaling 2 hours of manually +annotated speech data. Along with our dataset, we introduce a set of +competitive models to be used as baselines for future research. The top scoring +model achieves a macro F1 score of 59.83% and a micro F1 score of 62.08%, +indicating that the task is challenging. We thus believe that RoDia is a +valuable resource that will stimulate research aiming to address the challenges +of Romanian dialect identification. We publicly release our dataset and code at +https://github.com/codrut2/RoDia. + +
+
+
+
+
+ + ♻ ☆ The CALLA Dataset: Probing LLMs' Interactive Knowledge Acquisition from + Chinese Medical Literature + + +
+ The application of Large Language Models (LLMs) to the medical domain has stimulated the interest of researchers. Recent studies have focused on constructing Instruction Fine-Tuning (IFT) data through medical knowledge graphs to enrich the interactive medical knowledge of LLMs. However, the medical literature serving as a rich source of medical knowledge remains unexplored. Our work introduces the CALLA dataset to probe LLMs' interactive knowledge acquisition from Chinese medical literature. It assesses the proficiency of LLMs in mastering medical knowledge through a free-dialogue fact-checking task. We identify a phenomenon called the ``fact-following response'', where LLMs tend to affirm facts mentioned in questions and display a reluctance to challenge them. To eliminate the inaccurate evaluation caused by this phenomenon, for the golden fact, we artificially construct test data from two perspectives: one consistent with the fact and one inconsistent with the fact. Drawing from the probing experiment on the CALLA dataset, we conclude that IFT data highly correlated with the medical literature corpus serves as a potent catalyst for LLMs, enabling them to skillfully employ the medical knowledge acquired during the pre-training phase within interactive scenarios, enhancing accuracy. Furthermore, we design a framework for automatically constructing IFT data based on medical literature and discuss some real-world applications.
+
+
+
+
+ + ♻ ☆ Zero-Resource Hallucination Prevention for Large Language Models + + +
+ The prevalent use of large language models (LLMs) in various domains has +drawn attention to the issue of "hallucination," which refers to instances +where LLMs generate factually inaccurate or ungrounded information. Existing +techniques for hallucination detection in language assistants rely on intricate +fuzzy, specific free-language-based chain of thought (CoT) techniques or +parameter-based methods that suffer from interpretability issues. Additionally, +the methods that identify hallucinations post-generation could not prevent +their occurrence and suffer from inconsistent performance due to the influence +of the instruction format and model style. In this paper, we introduce a novel +pre-detection self-evaluation technique, referred to as SELF-FAMILIARITY, which +focuses on evaluating the model's familiarity with the concepts present in the +input instruction and withholding the generation of response in case of +unfamiliar concepts. This approach emulates the human ability to refrain from +responding to unfamiliar topics, thus reducing hallucinations. We validate +SELF-FAMILIARITY across four different large language models, demonstrating +consistently superior performance compared to existing techniques. Our findings +propose a significant shift towards preemptive strategies for hallucination +mitigation in LLM assistants, promising improvements in reliability, +applicability, and interpretability. + +
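The pre-detection idea above amounts to a gate in front of generation: extract the salient concepts from an instruction, score how familiar the model is with each, and withhold the answer if any concept is unfamiliar. The sketch below illustrates that control flow only; the concept extractor, the familiarity scorer, the threshold, and the made-up drug name in the demo are all illustrative assumptions, not the authors' SELF-FAMILIARITY procedure.

```python
# Hypothetical sketch of a pre-generation familiarity gate.
from typing import Callable, List

def guarded_generate(instruction: str,
                     extract_concepts: Callable[[str], List[str]],
                     familiarity: Callable[[str], float],
                     generate: Callable[[str], str],
                     threshold: float = 0.5) -> str:
    concepts = extract_concepts(instruction)
    unfamiliar = [c for c in concepts if familiarity(c) < threshold]
    if unfamiliar:
        # Refuse instead of risking a hallucinated answer about unfamiliar concepts.
        return ("I am not confident I know enough about "
                + ", ".join(unfamiliar) + " to answer reliably.")
    return generate(instruction)

if __name__ == "__main__":
    reply = guarded_generate(
        "Summarize the clinical evidence for drug Zorbexanol.",   # fictional drug name
        extract_concepts=lambda text: ["Zorbexanol"],              # toy extractor
        familiarity=lambda concept: 0.1,                           # toy score: unfamiliar
        generate=lambda text: "(model answer)",
    )
    print(reply)
```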
+
+
+
+
+ + ♻ ☆ An Empirical Study of NetOps Capability of Pre-Trained Large Language + Models + + +
+ Large language models (LLMs) can respond to human language queries and have shown powerful potential applications in network operations (NetOps). Thanks to the large amount of commonsense knowledge they contain, LLMs achieve much better inference accuracy than traditional models and emerge with strong abilities in generalization, reasoning, and code generation. These abilities may provide a crucial boost to automated and intelligent NetOps. However, it remains under-explored how well LLMs perform in various NetOps tasks. In this work, we make a systematic assessment of the capabilities, strengths, and limitations of selected LLMs in the field of NetOps. The evaluation is conducted on a collection of 5,732 questions about NetOps, encompassing 26 publicly available general-domain LLMs, including ChatGPT, LLaMA, Falcon, etc. We also finetune some of these LLMs with our collected NetOps corpus and evaluate the resulting models. The evaluation method follows the widely adopted benchmarks for general-domain LLMs, combined with Chain-of-Thought Prompts and Retrieval-Augmented Generation. The results show that only GPT-4 achieves high accuracy equivalent to passing the NetOps certification exam for humans, while all the other LLMs have much lower accuracy. However, some open models like LLaMA 2 still demonstrate significant potential. Furthermore, we evaluate the impact of factors such as model parameters, prompt engineering, and instruction fine-tuning. This work should be treated as an initial effort toward the systematic evaluation of LLMs in NetOps, and a more rigorous study is required for production use. The evaluation code and dataset will be released to benefit future research.
+
+
+
+
+ + ♻ ☆ GPT Can Solve Mathematical Problems Without a Calculator + + +
+ Previous studies have typically assumed that large language models are unable to accurately perform arithmetic operations, particularly multiplication of >8 digits, and operations involving decimals and fractions, without the use of calculator tools. This paper aims to challenge this misconception. With sufficient training data, a 2 billion-parameter language model can accurately perform multi-digit arithmetic operations with almost 100% accuracy without data leakage, significantly surpassing GPT-4 (whose multi-digit multiplication accuracy is only 4.3%). We also demonstrate that our MathGLM, fine-tuned from GLM-10B on a dataset with additional multi-step arithmetic operations and math problems described in text, achieves similar performance to GPT-4 on a 5,000-sample Chinese math problem test set. Our code and data are public at https://github.com/THUDM/MathGLM.
+
+ comment: 26 pages, 14 figures
+
+
+
+
+ + ♻ ☆ UQ at #SMM4H 2023: ALEX for Public Health Analysis with Social Media + + +
+ As social media becomes increasingly popular, more and more activities related to public health emerge. Current techniques for public health analysis involve popular models such as BERT and large language models (LLMs). However, training in-domain LLMs for public health is especially expensive. Furthermore, such in-domain datasets from social media are generally imbalanced. To tackle these challenges, the data imbalance issue can be overcome by data augmentation and balanced training. Moreover, the ability of the LLMs can be effectively utilized by prompting the model properly. In this paper, a novel ALEX framework is proposed to improve the performance of public health analysis on social media by adopting an LLM explanation mechanism. Results show that our ALEX model achieved the best performance among all submissions in both Task 2 and Task 4, with a high score in Task 1, at Social Media Mining for Health 2023 (SMM4H) [1]. Our code has been released at https://github.com/YanJiangJerry/ALEX.
+
+
+
+
+ + ♻ ☆ Speech Separation based on Contrastive Learning and Deep Modularization + + +
+ Current state-of-the-art tools for monaural speech separation rely on supervised learning. This means that they must deal with the permutation problem and are impacted by a mismatch between the number of speakers used in training and inference. Moreover, their performance heavily relies on the presence of high-quality labelled data. These problems can be effectively addressed by employing a fully unsupervised technique for speech separation. In this paper, we use contrastive learning to establish the representations of frames and then use the learned representations in the downstream deep modularization task. Concretely, we demonstrate experimentally that in speech separation, different frames of a speaker can be viewed as augmentations of a given hidden standard frame of that speaker. The frames of a speaker contain enough prosodic information overlap which is key in speech separation. Based on this, we implement self-supervised learning to minimize the distance between frames belonging to a given speaker. The learned representations are used in a downstream deep modularization task to cluster frames based on speaker identity. Evaluation of the developed technique on WSJ0-2mix and WSJ0-3mix shows that the technique attains SI-SNRi and SDRi of 20.8 and 21.0 respectively on WSJ0-2mix, and SI-SNRi and SDRi of 20.7 and 20.7 respectively on WSJ0-3mix. Its greatest strength is that as the number of speakers increases, its performance does not degrade significantly.
+
+ comment: arXiv admin note: substantial text overlap with arXiv:2212.00369 +
+
+
+
+
+ + ♻ ☆ Affective Visual Dialog: A Large-Scale Benchmark for Emotional Reasoning + Based on Visually Grounded Conversations + + +
+ We introduce Affective Visual Dialog, an emotion explanation and reasoning +task as a testbed for research on understanding the formation of emotions in +visually grounded conversations. The task involves three skills: (1) +Dialog-based Question Answering (2) Dialog-based Emotion Prediction and (3) +Affective emotion explanation generation based on the dialog. Our key +contribution is the collection of a large-scale dataset, dubbed AffectVisDial, +consisting of 50K 10-turn visually grounded dialogs as well as concluding +emotion attributions and dialog-informed textual emotion explanations, +resulting in a total of 27,180 working hours. We explain our design decisions +in collecting the dataset and introduce the questioner and answerer tasks that +are associated with the participants in the conversation. We train and +demonstrate solid Affective Visual Dialog baselines adapted from +state-of-the-art models. Remarkably, the responses generated by our models show +promising emotional reasoning abilities in response to visually grounded +conversations. Our project page is available at +https://affective-visual-dialog.github.io. + +
+
+
+
+
+ + ♻ ☆ Multi-document Summarization: A Comparative Evaluation + + +
+ This paper is aimed at evaluating state-of-the-art models for Multi-document +Summarization (MDS) on different types of datasets in various domains and +investigating the limitations of existing models to determine future research +directions. To address this gap, we conducted an extensive literature review to +identify state-of-the-art models and datasets. We analyzed the performance of +PRIMERA and PEGASUS models on BigSurvey-MDS and MS$^2$ datasets, which posed +unique challenges due to their varied domains. Our findings show that the +General-Purpose Pre-trained Model LED outperforms PRIMERA and PEGASUS on the +MS$^2$ dataset. We used the ROUGE score as a performance metric to evaluate the +identified models on different datasets. Our study provides valuable insights +into the models' strengths and weaknesses, as well as their applicability in +different domains. This work serves as a reference for future MDS research and +contributes to the development of accurate and robust models which can be +utilized on demanding datasets with academically and/or scientifically complex +data as well as generalized, relatively simple datasets. + +
+
+
+
+
+ + ♻ ☆ LLaSM: Large Language and Speech Model + + +
+ Multi-modal large language models have garnered significant interest recently. However, most existing work focuses on vision-language multi-modal models that provide strong capabilities in following vision-and-language instructions. We claim that speech is also an important modality through which humans interact with the world. Hence, it is crucial for a general-purpose assistant to be able to follow multi-modal speech-and-language instructions. In this work, we propose the Large Language and Speech Model (LLaSM). LLaSM is an end-to-end trained large multi-modal speech-language model with cross-modal conversational abilities, capable of following speech-and-language instructions. Our early experiments show that LLaSM demonstrates a more convenient and natural way for humans to interact with artificial intelligence. We also release a large Speech Instruction Following dataset, LLaSM-Audio-Instructions. Code and demo are available at https://github.com/LinkSoul-AI/LLaSM and https://huggingface.co/spaces/LinkSoul/LLaSM. The LLaSM-Audio-Instructions dataset is available at https://huggingface.co/datasets/LinkSoul/LLaSM-Audio-Instructions.
+
+
+
+
+ + ♻ ☆ Multi-Modality Multi-Loss Fusion Network + + +
+ In this work we investigate the optimal selection and fusion of features +across multiple modalities and combine these in a neural network to improve +emotion detection. We compare different fusion methods and examine the impact +of multi-loss training within the multi-modality fusion network, identifying +useful findings relating to subnet performance. Our best model achieves +state-of-the-art performance for three datasets (CMU-MOSI, CMU-MOSEI and +CH-SIMS), and outperforms the other methods in most metrics. We have found that +training on multimodal features improves single modality testing and designing +fusion methods based on dataset annotation schema enhances model performance. +These results suggest a roadmap towards an optimized feature selection and +fusion approach for enhancing emotion detection in neural networks. + +
+
+ comment: First two authors contributed equally to the paper +
+
+
+
+
+ + ♻ ☆ Galactic ChitChat: Using Large Language Models to Converse with + Astronomy Literature + + +
+ We demonstrate the potential of the state-of-the-art OpenAI GPT-4 large +language model to engage in meaningful interactions with Astronomy papers using +in-context prompting. To optimize for efficiency, we employ a distillation +technique that effectively reduces the size of the original input paper by +50\%, while maintaining the paragraph structure and overall semantic integrity. +We then explore the model's responses using a multi-document context (ten +distilled documents). Our findings indicate that GPT-4 excels in the +multi-document domain, providing detailed answers contextualized within the +framework of related research findings. Our results showcase the potential of +large language models for the astronomical community, offering a promising +avenue for further exploration, particularly the possibility of utilizing the +models for hypothesis generation. + +
+
+ comment: 3 pages, published in RNAAS +
+
+
+
+
+ + ♻ ☆ Learning to Select from Multiple Options AAAI 2023 + + +
+ Many NLP tasks can be regarded as a selection problem from a set of options, such as classification tasks, multi-choice question answering, etc. Textual entailment (TE) has been shown as the state-of-the-art (SOTA) approach to dealing with those selection problems. TE treats input texts as premises (P), options as hypotheses (H), then handles the selection problem by modeling (P, H) pairwise. This has two limitations: first, the pairwise modeling is unaware of other options, which is less intuitive since humans often determine the best options by comparing competing candidates; second, the inference process of pairwise TE is time-consuming, especially when the option space is large. To deal with the two issues, this work first proposes a contextualized TE model (Context-TE) by appending other k options as the context of the current (P, H) modeling. Context-TE is able to learn more reliable decisions for the H since it considers various contexts. Second, we speed up Context-TE by proposing Parallel-TE, which learns the decisions of multiple options simultaneously. Parallel-TE significantly improves the inference speed while keeping comparable performance with Context-TE. Our methods are evaluated on three tasks (ultra-fine entity typing, intent detection and multi-choice QA) that are typical selection problems with different sizes of options. Experiments show our models set new SOTA performance; in particular, Parallel-TE is k times faster than pairwise TE in inference. Our code is publicly available at https://github.com/jiangshdd/LearningToSelect.
+
+ comment: Accepted by AAAI 2023 +
+
+
+
+
+ + ♻ ☆ Processing Natural Language on Embedded Devices: How Well Do Modern + Models Perform? + + +
+ Voice-controlled systems are becoming ubiquitous in many IoT-specific applications such as home/industrial automation, automotive infotainment, and healthcare. While cloud-based voice services (e.g., Alexa, Siri) can leverage high-performance computing servers, some use cases (e.g., robotics, automotive infotainment) may require executing the natural language processing (NLP) tasks offline, often on resource-constrained embedded devices. Large language models such as BERT and its variants are primarily developed with compute-heavy servers in mind. Despite the great performance of BERT models across various NLP tasks, their large size and numerous parameters pose substantial obstacles to offline computation on embedded systems. Lighter replacements of such language models (e.g., DistilBERT and TinyBERT) often sacrifice accuracy, particularly for complex NLP tasks. Until now, it has been unclear (a) whether the state-of-the-art language models, viz. BERT and its variants, are deployable on embedded systems with a limited processor, memory, and battery power, and (b) if they are, what the ``right'' set of configurations and parameters to choose for a given NLP task is. This paper presents an exploratory study of modern language models under different resource constraints and accuracy budgets to derive empirical observations about these resource/accuracy trade-offs. In particular, we study how the four most commonly used BERT-based language models (e.g., BERT, RoBERTa, DistilBERT, and TinyBERT) perform on embedded systems. We tested them on a Raspberry Pi-based robotic platform with three hardware configurations and four datasets running various NLP tasks. Our findings can help designers understand the deployability and performance of modern language models, especially those based on BERT architectures, thus saving a lot of time wasted in trial-and-error efforts.
+
+
+
+
+ + ♻ ☆ Memory Injections: Correcting Multi-Hop Reasoning Failures during + Inference in Transformer-Based Language Models + + +
+ Answering multi-hop reasoning questions requires retrieving and synthesizing +information from diverse sources. Large Language Models (LLMs) struggle to +perform such reasoning consistently. Here we propose an approach to pinpoint +and rectify multi-hop reasoning failures through targeted memory injections on +LLM attention heads. First, we analyze the per-layer activations of GPT-2 +models in response to single and multi-hop prompts. We then propose a mechanism +that allows users to inject pertinent prompt-specific information, which we +refer to as "memories," at critical LLM locations during inference. By thus +enabling the LLM to incorporate additional relevant information during +inference, we enhance the quality of multi-hop prompt completions. We show +empirically that a simple, efficient, and targeted memory injection into a key +attention layer can often increase the probability of the desired next token in +multi-hop tasks, by up to 424%. + +
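Mechanically, injecting a "memory" during inference can be pictured as patching the output of a chosen attention block with an extra vector. The snippet below is a hypothetical, much-simplified illustration of that plumbing using a standard PyTorch forward hook on GPT-2; how the memory vector is chosen, which layer or head to target, and the injection strength are the substance of the paper and are not reproduced here, so the random vector, the layer index, and the scale below are purely illustrative assumptions.

```python
# Hypothetical sketch: adding a vector to one GPT-2 attention block's output at inference.
import torch
from transformers import GPT2LMHeadModel, GPT2Tokenizer

tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
model = GPT2LMHeadModel.from_pretrained("gpt2").eval()

def make_memory_hook(memory: torch.Tensor, scale: float = 0.1):
    """Forward hook that adds a scaled 'memory' vector to an attention block's output."""
    def hook(module, inputs, output):
        if isinstance(output, tuple):
            # output[0] holds the block's hidden states: (batch, seq, hidden)
            return (output[0] + scale * memory,) + tuple(output[1:])
        return output + scale * memory
    return hook

layer_to_patch = 5                                  # assumption: an arbitrary middle layer
memory_vector = torch.randn(model.config.n_embd)    # stand-in for a prompt-specific memory
handle = model.transformer.h[layer_to_patch].attn.register_forward_hook(
    make_memory_hook(memory_vector))

prompt = "The capital of the country where the Eiffel Tower stands is"
inputs = tokenizer(prompt, return_tensors="pt")
with torch.no_grad():
    logits = model(**inputs).logits
print(tokenizer.decode([logits[0, -1].argmax().item()]))

handle.remove()                                     # undo the injection afterwards
```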
+
+
+
+
+ + ♻ ☆ Findings of Factify 2: Multimodal Fake News Detection AAAI 2023 + + +
+ With social media usage growing exponentially in the past few years, fake +news has also become extremely prevalent. The detrimental impact of fake news +emphasizes the need for research focused on automating the detection of false +information and verifying its accuracy. In this work, we present the outcome of +the Factify 2 shared task, which provides a multi-modal fact verification and +satire news dataset, as part of the DeFactify 2 workshop at AAAI'23. The data +calls for a comparison based approach to the task by pairing social media +claims with supporting documents, with both text and image, divided into 5 +classes based on multi-modal relations. In the second iteration of this task we +had over 60 participants and 9 final test-set submissions. The best +performances came from the use of DeBERTa for text and Swinv2 and CLIP for +image. The highest F1 score averaged for all five classes was 81.82%. + +
+
+ comment: Defactify2 @AAAI 2023 +
+
+
+
+
+
+
+
+ + Computer Vision and Pattern Recognition 142 + +
+
+
+ + ☆ Learning Disentangled Avatars with Hybrid 3D Representations + + +
+ Tremendous efforts have been made to learn animatable and photorealistic human avatars. Towards this end, both explicit and implicit 3D representations are heavily studied for a holistic modeling and capture of the whole human (e.g., body, clothing, face and hair), but neither representation is an optimal choice in terms of representation efficacy since different parts of the human avatar have different modeling desiderata. For example, meshes are generally not suitable for modeling clothing and hair. Motivated by this, we present Disentangled Avatars (DELTA), which models humans with hybrid explicit-implicit 3D representations. DELTA takes a monocular RGB video as input, and produces a human avatar with separate body and clothing/hair layers. Specifically, we demonstrate two important applications for DELTA. For the first one, we consider the disentanglement of the human body and clothing and in the second, we disentangle the face and hair. To do so, DELTA represents the body or face with an explicit mesh-based parametric 3D model and the clothing or hair with an implicit neural radiance field. To make this possible, we design an end-to-end differentiable renderer that integrates meshes into volumetric rendering, enabling DELTA to learn directly from monocular videos without any 3D supervision. Finally, we show how these two applications can be easily combined to model full-body avatars, such that the hair, face, body and clothing can be fully disentangled yet jointly rendered. Such a disentanglement enables hair and clothing transfer to arbitrary body shapes. We empirically validate the effectiveness of DELTA's disentanglement by demonstrating its promising performance on disentangled reconstruction, virtual clothing try-on and hairstyle transfer. To facilitate future research, we also release an open-sourced pipeline for the study of hybrid human avatar modeling.
+
+ comment: home page: https://yfeng95.github.io/delta. arXiv admin note: text + overlap with arXiv:2210.01868 +
+
+
+
+
+ + ☆ LEAP Hand: Low-Cost, Efficient, and Anthropomorphic Hand for Robot + Learning + + +
+ Dexterous manipulation has been a long-standing challenge in robotics. While machine learning techniques have shown some promise, results have so far largely been limited to simulation. This can be mostly attributed to the lack of suitable hardware. In this paper, we present LEAP Hand, a low-cost dexterous and anthropomorphic hand for machine learning research. In contrast to previous hands, LEAP Hand has a novel kinematic structure that allows maximal dexterity regardless of finger pose. LEAP Hand is low-cost and can be assembled in 4 hours at a cost of 2000 USD from readily available parts. It is capable of consistently exerting large torques over long durations of time. We show that LEAP Hand can be used to perform several manipulation tasks in the real world -- from visual teleoperation to learning from passive video data and sim2real. LEAP Hand significantly outperforms its closest competitor Allegro Hand in all our experiments while being 1/8th of the cost. We release detailed assembly instructions, the Sim2Real pipeline and a development platform with useful APIs on our website at https://leap-hand.github.io/
+
+ comment: Website at https://leap-hand.github.io/ +
+
+
+
+
+ + ☆ Attention De-sparsification Matters: Inducing Diversity in Digital + Pathology Representation Learning + + +
+ We propose DiRL, a Diversity-inducing Representation Learning technique for histopathology imaging. Self-supervised learning techniques, such as contrastive and non-contrastive approaches, have been shown to learn rich and effective representations of digitized tissue samples with limited pathologist supervision. Our analysis of vanilla SSL-pretrained models' attention distribution reveals an insightful observation: sparsity in attention, i.e., models tend to localize most of their attention to some prominent patterns in the image. Although attention sparsity can be beneficial in natural images due to these prominent patterns being the object of interest itself, this can be sub-optimal in digital pathology; this is because, unlike natural images, digital pathology scans are not object-centric, but rather a complex phenotype of various spatially intermixed biological components. Inadequate diversification of attention in these complex images could result in crucial information loss. To address this, we leverage cell segmentation to densely extract multiple histopathology-specific representations, and then propose a prior-guided dense pretext task for SSL, designed to match the multiple corresponding representations between the views. Through this, the model learns to attend to various components more closely and evenly, thus inducing adequate diversification in attention for capturing context-rich representations. Through quantitative and qualitative analysis on multiple tasks across cancer types, we demonstrate the efficacy of our method and observe that the attention is more globally distributed.
+
+
+
+
+ + ☆ Exploring Non-additive Randomness on ViT against Query-Based Black-Box + Attacks BMVC2023 + + +
+ Deep Neural Networks can be easily fooled by small and imperceptible +perturbations. The query-based black-box attack (QBBA) is able to create the +perturbations using model output probabilities of image queries requiring no +access to the underlying models. QBBA poses realistic threats to real-world +applications. Recently, various types of robustness have been explored to +defend against QBBA. In this work, we first taxonomize the stochastic defense +strategies against QBBA. Following our taxonomy, we propose to explore +non-additive randomness in models to defend against QBBA. Specifically, we +focus on underexplored Vision Transformers based on their flexible +architectures. Extensive experiments show that the proposed defense approach +achieves effective defense, without much sacrifice in performance. + +
+
+ comment: Accepted to BMVC2023 +
+
+
+
+
+ + ☆ AGMDT: Virtual Staining of Renal Histology Images with Adjacency-Guided + Multi-Domain Transfer BMVC 2023 + + +
+ Renal pathology, as the gold standard of kidney disease diagnosis, requires
+doctors to analyze a series of tissue slices stained by H\&E staining and
+special staining like Masson, PASM, and PAS, respectively. These special
+staining methods are costly, time-consuming, and hard to standardize for wide
+use especially in primary hospitals. Advances in supervised learning methods
+can virtually convert H\&E images into special staining images, but the
+pixel-to-pixel alignment required for training is hard to achieve. In contrast,
+unsupervised learning methods that regard different stains as different style
+transfer domains can use unpaired data, but they ignore the spatial
+inter-domain correlations and thus decrease the trustworthiness of structural
+details for diagnosis. In this paper, we propose a novel virtual staining
+framework, AGMDT, which translates images into other domains while avoiding
+pixel-level alignment and meanwhile utilizing the correlations among adjacent
+tissue slices. We first build a high-quality multi-domain renal histological
+dataset where each specimen case comprises a series of slices stained in
+various ways. Based on it, the proposed framework AGMDT discovers patch-level
+aligned pairs across the serial slices of multiple domains through glomerulus
+detection and bipartite graph matching, and utilizes such correlations to
+supervise the end-to-end model for multi-domain staining transformation.
+Experimental results show that the proposed AGMDT achieves a good balance
+between precise pixel-level alignment and unpaired domain transfer by
+exploiting correlations across multi-domain serial pathological slices, and
+outperforms the state-of-the-art methods in both quantitative measures and
+morphological details.
+
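+ To illustrate the bipartite matching step mentioned above (a minimal sketch
+under assumed inputs, not the AGMDT implementation): detected glomerulus
+centroids from two adjacent slices can be paired by solving an assignment
+problem on their pairwise distances.
+
+import numpy as np
+from scipy.optimize import linear_sum_assignment
+
+def match_glomeruli(centroids_a, centroids_b, max_dist=50.0):
+    """centroids_*: (N, 2) and (M, 2) arrays of detected glomerulus centres."""
+    cost = np.linalg.norm(centroids_a[:, None, :] - centroids_b[None, :, :], axis=-1)
+    rows, cols = linear_sum_assignment(cost)          # minimum-cost bipartite matching
+    # keep only plausible pairs; unmatched detections are simply discarded
+    return [(r, c) for r, c in zip(rows, cols) if cost[r, c] <= max_dist]
+
+pairs = match_glomeruli(np.array([[10., 10.], [80., 40.]]),
+                        np.array([[12., 11.], [300., 300.]]))
+print(pairs)  # [(0, 0)]
+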
+
+ comment: Accepted at BMVC 2023 +
+
+
+
+
+ + ☆ InstaFlow: One Step is Enough for High-Quality Diffusion-Based + Text-to-Image Generation + + +
+ Diffusion models have revolutionized text-to-image generation with their
+exceptional quality and creativity. However, their multi-step sampling process
+is known to be slow, often requiring tens of inference steps to obtain
+satisfactory results. Previous attempts to improve their sampling speed and
+reduce computational costs through distillation have been unsuccessful in
+achieving a functional one-step model. In this paper, we explore a recent
+method called Rectified Flow, which, thus far, has only been applied to small
+datasets. The core of Rectified Flow lies in its \emph{reflow} procedure, which
+straightens the trajectories of probability flows, refines the coupling between
+noises and images, and facilitates the distillation process with student
+models. We propose a novel text-conditioned pipeline to turn Stable Diffusion
+(SD) into an ultra-fast one-step model, in which we find reflow plays a
+critical role in improving the assignment between noise and images. Leveraging
+our new pipeline, we create, to the best of our knowledge, the first one-step
+diffusion-based text-to-image generator with SD-level image quality, achieving
+an FID (Frechet Inception Distance) of $23.3$ on MS COCO 2017-5k, surpassing
+the previous state-of-the-art technique, progressive distillation, by a
+significant margin ($37.2$ $\rightarrow$ $23.3$ in FID). By utilizing an
+expanded network with 1.7B parameters, we further improve the FID to $22.4$. We
+call our one-step models \emph{InstaFlow}. On MS COCO 2014-30k, InstaFlow
+yields an FID of $13.1$ in just $0.09$ seconds, the best in the $\leq 0.1$
+second regime, outperforming the recent StyleGAN-T ($13.9$ in $0.1$ second).
+Notably, the training of InstaFlow only costs 199 A100 GPU days. Project
+page:~\url{https://github.com/gnobitab/InstaFlow}.
+
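+ To make the reflow idea concrete (a minimal sketch of the generic
+rectified-flow objective, not the InstaFlow training code; v_theta and its
+signature are assumptions): a velocity network is regressed onto the constant
+velocity of the straight path between a noise sample and the image the current
+model generates from it, which straightens trajectories and enables one-step
+generation.
+
+import torch
+
+def reflow_loss(v_theta, x0, x1):
+    """x0: noise batch, x1: images generated from x0 by the current model."""
+    t = torch.rand(x0.shape[0], *([1] * (x0.dim() - 1)), device=x0.device)
+    x_t = (1.0 - t) * x0 + t * x1       # point on the straight path
+    target = x1 - x0                    # constant velocity of that path
+    return ((v_theta(x_t, t) - target) ** 2).mean()
+
+# After training, one-step generation is approximately x1 = x0 + v_theta(x0, 0).
+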
+
+
+
+
+ + ☆ Padding-free Convolution based on Preservation of Differential + Characteristics of Kernels + + +
+ Convolution is a fundamental operation in image processing and machine
+learning. Aimed primarily at maintaining image size, padding is a key
+ingredient of convolution, which, however, can introduce undesirable boundary
+effects. We present a non-padding-based method for size-keeping convolution
+based on the preservation of differential characteristics of kernels. The main
+idea is to make convolution over an incomplete sliding window "collapse" to a
+linear differential operator evaluated locally at its central pixel, which no
+longer requires information from the neighbouring missing pixels. While the
+underlying theory is rigorous, our final formula turns out to be simple: the
+convolution over an incomplete window is achieved by convolving its nearest
+complete window with a transformed kernel. This formula is computationally
+lightweight, involving neither interpolation nor extrapolation, and imposes no
+restrictions on image and kernel sizes. Our method favours data with smooth
+boundaries, such as high-resolution images and fields from physics. Our
+experiments include: i) filtering analytical and non-analytical fields from
+computational physics and ii) training convolutional neural networks (CNNs) for
+the tasks of image classification, semantic segmentation and super-resolution
+reconstruction. In all these experiments, our method has exhibited visible
+superiority over the compared methods.
+
+
+ comment: 8 pages, 3 figures, 1 table, ICLMA 2023 +
+
+
+
+
+ + ☆ Exploring Flat Minima for Domain Generalization with Large Learning + Rates + + +
+ Domain Generalization (DG) aims to generalize to arbitrary unseen domains. A
+promising approach to improve model generalization in DG is the identification
+of flat minima. One typical method for this task is SWAD, which involves
+averaging weights along the training trajectory. However, the success of weight
+averaging depends on the diversity of weights, which is limited when training
+with a small learning rate. Instead, we observe that leveraging a large
+learning rate can simultaneously promote weight diversity and facilitate the
+identification of flat regions in the loss landscape. However, employing a
+large learning rate suffers from a convergence problem, which cannot be
+resolved by simply averaging the training weights. To address this issue, we
+introduce a training strategy called Lookahead, which involves weight
+interpolation, instead of averaging, between fast and slow weights. The fast
+weights explore the weight space with a large learning rate without converging,
+while the slow weights interpolate with them to ensure convergence. Besides,
+weight interpolation also helps identify flat minima by implicitly optimizing
+the local entropy loss that measures flatness. To further prevent overfitting
+during training, we propose two variants that regularize the training weights
+with the weighted averaged weights or with the accumulated history weights.
+Taking advantage of this new perspective, our methods achieve state-of-the-art
+performance on both classification and semantic segmentation domain
+generalization benchmarks. The code is available at
+https://github.com/koncle/DG-with-Large-LR.
+
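+ A minimal sketch of the fast/slow interpolation idea (a generic
+Lookahead-style update under assumed hyperparameters, not the authors' exact
+schedule): the fast weights are trained with a large learning rate, and the
+slow weights are periodically pulled towards them by interpolation rather than
+plain averaging.
+
+import copy
+import torch
+
+def lookahead_step(slow_model, fast_model, alpha=0.5):
+    with torch.no_grad():
+        for p_slow, p_fast in zip(slow_model.parameters(), fast_model.parameters()):
+            p_slow.mul_(1.0 - alpha).add_(p_fast, alpha=alpha)  # slow <- (1-a)*slow + a*fast
+            p_fast.copy_(p_slow)   # restart the fast weights from the interpolated point
+
+# Usage sketch: slow_model = copy.deepcopy(model); train `model` for k steps with
+# a large learning rate, then call lookahead_step(slow_model, model), and repeat.
+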
+
+
+
+
+ + ☆ Grounded Language Acquisition From Object and Action Imagery + + +
+ Deep learning approaches to natural language processing have made great +strides in recent years. While these models produce symbols that convey vast +amounts of diverse knowledge, it is unclear how such symbols are grounded in +data from the world. In this paper, we explore the development of a private +language for visual data representation by training emergent language (EL) +encoders/decoders in both i) a traditional referential game environment and ii) +a contrastive learning environment utilizing a within-class matching training +paradigm. An additional classification layer utilizing neural machine +translation and random forest classification was used to transform symbolic +representations (sequences of integer symbols) to class labels. These methods +were applied in two experiments focusing on object recognition and action +recognition. For object recognition, a set of sketches produced by human +participants from real imagery was used (Sketchy dataset) and for action +recognition, 2D trajectories were generated from 3D motion capture systems +(MOVI dataset). In order to interpret the symbols produced for data in each +experiment, gradient-weighted class activation mapping (Grad-CAM) methods were +used to identify pixel regions indicating semantic features which contribute +evidence towards symbols in learned languages. Additionally, a t-distributed +stochastic neighbor embedding (t-SNE) method was used to investigate embeddings +learned by CNN feature extractors. + +
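+ A minimal sketch of the embedding inspection step mentioned above (generic
+scikit-learn usage with stand-in data, not the paper's pipeline): CNN feature
+vectors are projected to 2D with t-SNE and can then be plotted coloured by
+class label to inspect how the emergent symbols cluster.
+
+import numpy as np
+from sklearn.manifold import TSNE
+
+rng = np.random.default_rng(0)
+features = rng.normal(size=(200, 512))        # stand-in for CNN feature vectors
+labels = rng.integers(0, 10, size=200)        # stand-in for class labels
+
+coords = TSNE(n_components=2, perplexity=30, init="pca",
+              random_state=0).fit_transform(features)
+print(coords.shape)   # (200, 2); scatter-plot coords coloured by `labels`
+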
+
+ comment: 9 pages, 7 figures, conference +
+
+
+
+
+ + ☆ SAMPLING: Scene-adaptive Hierarchical Multiplane Images Representation + for Novel View Synthesis from a Single Image + + +
+ Recent novel view synthesis methods obtain promising results for relatively +small scenes, e.g., indoor environments and scenes with a few objects, but tend +to fail for unbounded outdoor scenes with a single image as input. In this +paper, we introduce SAMPLING, a Scene-adaptive Hierarchical Multiplane Images +Representation for Novel View Synthesis from a Single Image based on improved +multiplane images (MPI). Observing that depth distribution varies significantly +for unbounded outdoor scenes, we employ an adaptive-bins strategy for MPI to +arrange planes in accordance with each scene image. To represent intricate +geometry and multi-scale details, we further introduce a hierarchical +refinement branch, which results in high-quality synthesized novel views. Our +method demonstrates considerable performance gains in synthesizing large-scale +unbounded outdoor scenes using a single image on the KITTI dataset and +generalizes well to the unseen Tanks and Temples dataset. The code and models +will be made public. + +
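+ A minimal sketch of an adaptive-bins placement rule consistent with the
+description above (quantile-based binning of an estimated depth map is our
+assumption, not necessarily the SAMPLING strategy): MPI plane depths are placed
+where the scene's depth values actually concentrate instead of being spaced
+uniformly.
+
+import numpy as np
+
+def adaptive_plane_depths(depth_map: np.ndarray, num_planes: int = 32) -> np.ndarray:
+    valid = depth_map[np.isfinite(depth_map) & (depth_map > 0)]
+    qs = np.linspace(0.0, 1.0, num_planes)
+    return np.quantile(valid, qs)             # one depth per MPI plane
+
+depths = adaptive_plane_depths(np.random.lognormal(mean=2.0, sigma=1.0, size=(64, 64)))
+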
+
+
+
+
+ + ☆ Semantic and Articulated Pedestrian Sensing Onboard a Moving Vehicle + + +
+ It is difficult to perform 3D reconstruction from on-vehicle gathered video
+due to the large forward motion of the vehicle. Even object detection and human
+sensing models perform significantly worse on onboard videos than on standard
+benchmarks, because objects often appear far away from the camera, image
+quality is often degraded by motion blur, and occlusions occur frequently. This
+has led to the popularisation of traffic data-specific benchmarks. Recently,
+Light Detection And Ranging (LiDAR) sensors have become popular for directly
+estimating depths without the need to perform 3D reconstruction. However,
+LiDAR-based methods still lag behind image-based methods in detecting
+articulated humans at a distance. We hypothesize that benchmarks targeted at
+articulated human sensing from LiDAR data could bring about increased research
+in human sensing and prediction in traffic and could lead to improved traffic
+safety for pedestrians.
+
+
+
+
+
+ + ☆ AI4Food-NutritionFW: A Novel Framework for the Automatic Synthesis and + Analysis of Eating Behaviours + + +
+ Nowadays millions of images are shared on social media and web platforms. In +particular, many of them are food images taken from a smartphone over time, +providing information related to the individual's diet. On the other hand, +eating behaviours are directly related to some of the most prevalent diseases +in the world. Exploiting recent advances in image processing and Artificial +Intelligence (AI), this scenario represents an excellent opportunity to: i) +create new methods that analyse the individuals' health from what they eat, and +ii) develop personalised recommendations to improve nutrition and diet under +specific circumstances (e.g., obesity or COVID). Having tunable tools for +creating food image datasets that facilitate research in both lines is very +much needed. + This paper proposes AI4Food-NutritionFW, a framework for the creation of food +image datasets according to configurable eating behaviours. AI4Food-NutritionFW +simulates a user-friendly and widespread scenario where images are taken using +a smartphone. In addition to the framework, we also provide and describe a +unique food image dataset that includes 4,800 different weekly eating +behaviours from 15 different profiles and 1,200 subjects. Specifically, we +consider profiles that comply with actual lifestyles from healthy eating +behaviours (according to established knowledge), variable profiles (e.g., +eating out, holidays), to unhealthy ones (e.g., excess of fast food or sweets). +Finally, we automatically evaluate a healthy index of the subject's eating +behaviours using multidimensional metrics based on guidelines for healthy diets +proposed by international organisations, achieving promising results (99.53% +and 99.60% accuracy and sensitivity, respectively). We also release to the +research community a software implementation of our proposed +AI4Food-NutritionFW and the mentioned food image dataset created with it. + +
+
+ comment: 10 pages, 5 figures, 4 tables +
+
+
+
+
+ + ☆ Towards High-Quality Specular Highlight Removal by Leveraging + Large-Scale Synthetic Data + + +
+ This paper aims to remove specular highlights from a single object-level
+image. Although previous methods have made some progress, their performance
+remains somewhat limited, particularly for real images with complex specular
+highlights. To this end, we propose a three-stage network to address this
+problem. Specifically, given an input image, we first decompose it into the
+albedo, shading, and specular residue components to estimate a coarse
+specular-free image. Then, we further refine the coarse result to alleviate its
+visual artifacts such as color distortion. Finally, we adjust the tone of the
+refined result to match that of the input as closely as possible. In addition,
+to facilitate network training and quantitative evaluation, we present a
+large-scale synthetic dataset of object-level images, covering diverse objects
+and illumination conditions. Extensive experiments illustrate that our network
+is able to generalize well to unseen real object-level images, and even
+produces good results for scene-level images with multiple background objects
+and complex lighting.
+
+
+
+
+
+ + ☆ Self-Training and Multi-Task Learning for Limited Data: Evaluation Study + on Object Detection ICCV + + +
+ Self-training allows a network to learn from the predictions of a more
+complicated model and thus often requires well-trained teacher models and a
+mixture of teacher and student data, while multi-task learning jointly
+optimizes different targets to learn salient interrelationships and requires
+multi-task annotations for each training example. Despite being particularly
+data demanding, these frameworks have potential for data exploitation if such
+assumptions can be relaxed. In this paper, we compare self-training for object
+detection under a deficiency of teacher training data, where students are
+trained by the teacher on unseen examples, and multi-task learning with
+partially annotated data, i.e. a single task annotation per training example.
+Both scenarios have their own limitations but are potentially helpful when
+annotated data is limited. Experimental results show an improvement in
+performance when using a weak teacher with unseen data to train a multi-task
+student. Despite the limited setup, we believe the experimental results show
+the potential of multi-task knowledge distillation and self-training, which
+could be beneficial for future study. Source code is at
+https://lhoangan.github.io/multas.
+
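+ A minimal sketch of the pseudo-labelling ingredient of self-training (the
+confidence threshold and output format are assumptions, not the authors'
+recipe): a weak teacher labels unseen images, and only sufficiently confident
+detections are kept as pseudo-annotations for the multi-task student.
+
+def make_pseudo_labels(teacher_outputs, score_threshold=0.5):
+    """teacher_outputs: list of dicts with 'boxes', 'labels', 'scores' per image."""
+    pseudo = []
+    for out in teacher_outputs:
+        keep = [i for i, s in enumerate(out["scores"]) if s >= score_threshold]
+        pseudo.append({
+            "boxes": [out["boxes"][i] for i in keep],
+            "labels": [out["labels"][i] for i in keep],
+        })
+    return pseudo
+
+# The student is then trained on these pseudo-boxes together with whatever
+# single-task annotation (e.g. segmentation only) each image already carries.
+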
+
+ comment: Accepted for International Conference in Computer Vision workshop + (ICCVW) 2023 +
+
+
+
+
+ + ☆ Transferability analysis of data-driven additive manufacturing + knowledge: a case study between powder bed fusion and directed energy + deposition + + +
+ Data-driven research in Additive Manufacturing (AM) has gained significant
+success in recent years. This has led to the emergence of a plethora of
+scientific literature. The knowledge in these works consists of AM and
+Artificial Intelligence (AI) contexts that have not been mined and formalized
+in an integrated way. Moreover, no tools or guidelines exist to support
+data-driven knowledge transfer from one context to another. As a result,
+data-driven solutions using specific AI techniques are being developed and
+validated only for specific AM process technologies. There is a potential to
+exploit the inherent similarities across various AM technologies and adapt the
+existing solutions from one process or problem to another using AI techniques
+such as Transfer Learning. We propose a three-step knowledge transferability
+analysis framework in AM to support data-driven AM knowledge transfer. As a
+prerequisite to transferability analysis, AM knowledge is featurized into
+identified knowledge components. The framework consists of pre-transfer,
+transfer, and post-transfer steps to accomplish knowledge transfer. A case
+study is conducted between flagship metal AM processes. Laser Powder Bed Fusion
+(LPBF) is the source of knowledge, motivated by its relative maturity in
+applying AI compared to Directed Energy Deposition (DED), which, as the less
+explored target process, drives the need for knowledge transfer. We show
+successful transfer at different levels of the data-driven solution, including
+data representation, model architecture, and model parameters. The pipeline of
+AM knowledge transfer can be automated in the future to allow efficient
+cross-context or cross-process knowledge exchange.
+
+
+ comment: 11 pages, 7 figures. This paper has been accepted to be published in + the proceedings of IDETC-CIE 2023 +
+
+
+
+
+ + ☆ Jersey Number Recognition using Keyframe Identification from + Low-Resolution Broadcast Videos + + +
+ Player identification is a crucial component in vision-driven soccer +analytics, enabling various downstream tasks such as player assessment, in-game +analysis, and broadcast production. However, automatically detecting jersey +numbers from player tracklets in videos presents challenges due to motion blur, +low resolution, distortions, and occlusions. Existing methods, utilizing +Spatial Transformer Networks, CNNs, and Vision Transformers, have shown success +in image data but struggle with real-world video data, where jersey numbers are +not visible in most of the frames. Hence, identifying frames that contain the +jersey number is a key sub-problem to tackle. To address these issues, we +propose a robust keyframe identification module that extracts frames containing +essential high-level information about the jersey number. A spatio-temporal +network is then employed to model spatial and temporal context and predict the +probabilities of jersey numbers in the video. Additionally, we adopt a +multi-task loss function to predict the probability distribution of each digit +separately. Extensive evaluations on the SoccerNet dataset demonstrate that +incorporating our proposed keyframe identification module results in a +significant 37.81% and 37.70% increase in the accuracies of 2 different test +sets with domain gaps. These results highlight the effectiveness and importance +of our approach in tackling the challenges of automatic jersey number detection +in sports videos. + +
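+ A minimal sketch of a per-digit multi-task loss of the kind described above
+(the head layout and the 11-way classes, 0-9 plus "absent", are assumptions
+rather than the authors' exact design): one head predicts the tens digit and
+another the units digit, and their cross-entropy losses are summed.
+
+import torch
+import torch.nn as nn
+
+class DigitHeads(nn.Module):
+    def __init__(self, feat_dim=256, num_classes=11):
+        super().__init__()
+        self.tens = nn.Linear(feat_dim, num_classes)
+        self.units = nn.Linear(feat_dim, num_classes)
+
+    def forward(self, feats):
+        return self.tens(feats), self.units(feats)
+
+def jersey_loss(heads, feats, tens_gt, units_gt):
+    logits_t, logits_u = heads(feats)
+    ce = nn.functional.cross_entropy
+    return ce(logits_t, tens_gt) + ce(logits_u, units_gt)
+
+heads = DigitHeads()
+loss = jersey_loss(heads, torch.randn(8, 256),
+                   torch.randint(0, 11, (8,)), torch.randint(0, 11, (8,)))
+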
+
+ comment: Accepted in the 6th International Workshop on Multimedia Content + Analysis in Sports (MMSports'23) @ ACM Multimedia +
+
+
+
+
+ + ☆ Fg-T2M: Fine-Grained Text-Driven Human Motion Generation via Diffusion + Model + + +
+ Text-driven human motion generation in computer vision is both significant
+and challenging. However, current methods are limited to producing either
+deterministic or imprecise motion sequences, failing to effectively control the
+temporal and spatial relationships required to conform to a given text
+description. In this work, we propose a fine-grained method for generating
+high-quality, conditional human motion sequences supporting precise text
+descriptions. Our approach consists of two key components: 1) a
+linguistics-structure assisted module that constructs accurate and complete
+language features to fully utilize the text information; and 2) a context-aware
+progressive reasoning module that learns neighborhood and overall semantic
+linguistic features from shallow and deep graph neural networks to achieve
+multi-step inference. Experiments show that our approach outperforms
+text-driven motion generation methods on the HumanML3D and KIT test sets and
+generates motion that conforms better visually to the text conditions.
+
+
+
+
+
+ + ☆ IBAFormer: Intra-batch Attention Transformer for Domain Generalized + Semantic Segmentation + + +
+ Domain generalized semantic segmentation (DGSS) is a critical yet challenging +task, where the model is trained only on source data without access to any +target data. Despite the proposal of numerous DGSS strategies, the +generalization capability remains limited in CNN architectures. Though some +Transformer-based segmentation models show promising performance, they +primarily focus on capturing intra-sample attentive relationships, disregarding +inter-sample correlations which can potentially benefit DGSS. To this end, we +enhance the attention modules in Transformer networks for improving DGSS by +incorporating information from other independent samples in the same batch, +enriching contextual information, and diversifying the training data for each +attention block. Specifically, we propose two alternative intra-batch attention +mechanisms, namely mean-based intra-batch attention (MIBA) and element-wise +intra-batch attention (EIBA), to capture correlations between different +samples, enhancing feature representation and generalization capabilities. +Building upon intra-batch attention, we introduce IBAFormer, which integrates +self-attention modules with the proposed intra-batch attention for DGSS. +Extensive experiments demonstrate that IBAFormer achieves SOTA performance in +DGSS, and ablation studies further confirm the effectiveness of each introduced +component. + +
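+ A minimal sketch of a mean-based intra-batch attention (our reading of the
+MIBA idea; the exact formulation is in the paper, and the query/key/value
+projections are omitted here; requires PyTorch >= 2.0 for
+scaled_dot_product_attention): each sample's tokens attend to the token
+sequence averaged over the batch, injecting inter-sample context into the
+attention block.
+
+import torch
+import torch.nn.functional as F
+
+def mean_intra_batch_attention(x, num_heads=8):
+    """x: (B, N, C) token features; projections omitted for brevity."""
+    B, N, C = x.shape
+    batch_mean = x.mean(dim=0, keepdim=True).expand(B, N, C)   # shared batch context
+    q = x.reshape(B, N, num_heads, C // num_heads).transpose(1, 2)
+    kv = batch_mean.reshape(B, N, num_heads, C // num_heads).transpose(1, 2)
+    out = F.scaled_dot_product_attention(q, kv, kv)             # (B, heads, N, C/heads)
+    return out.transpose(1, 2).reshape(B, N, C)
+
+y = mean_intra_batch_attention(torch.randn(4, 196, 256))
+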
+
+
+
+
+ + ☆ OTAS: Unsupervised Boundary Detection for Object-Centric Temporal Action + Segmentation WACV 2024 + + +
+ Temporal action segmentation is typically achieved by discovering the +dramatic variances in global visual descriptors. In this paper, we explore the +merits of local features by proposing the unsupervised framework of +Object-centric Temporal Action Segmentation (OTAS). Broadly speaking, OTAS +consists of self-supervised global and local feature extraction modules as well +as a boundary selection module that fuses the features and detects salient +boundaries for action segmentation. As a second contribution, we discuss the +pros and cons of existing frame-level and boundary-level evaluation metrics. +Through extensive experiments, we find OTAS is superior to the previous +state-of-the-art method by $41\%$ on average in terms of our recommended F1 +score. Surprisingly, OTAS even outperforms the ground-truth human annotations +in the user study. Moreover, OTAS is efficient enough to allow real-time +inference. + +
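+ A minimal sketch of one simple boundary-selection heuristic consistent with
+the description above (the scoring rule is an assumption, not the OTAS module):
+score each frame by the distance between consecutive fused feature vectors and
+take local peaks of that score as candidate action boundaries.
+
+import numpy as np
+from scipy.signal import find_peaks
+
+def detect_boundaries(features: np.ndarray, min_gap: int = 15):
+    """features: (T, D) per-frame global+local features, already fused."""
+    diff = np.linalg.norm(np.diff(features, axis=0), axis=1)      # (T-1,)
+    peaks, _ = find_peaks(diff, distance=min_gap,
+                          height=diff.mean() + diff.std())
+    return peaks + 1   # a boundary is the frame right after a large feature jump
+
+bounds = detect_boundaries(np.random.rand(300, 128))
+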
+
+ comment: Accepted to WACV 2024 +
+
+
+
+
+ + ☆ Modality Unifying Network for Visible-Infrared Person Re-Identification ICCV2023 + + +
+ Visible-infrared person re-identification (VI-ReID) is a challenging task due +to large cross-modality discrepancies and intra-class variations. Existing +methods mainly focus on learning modality-shared representations by embedding +different modalities into the same feature space. As a result, the learned +feature emphasizes the common patterns across modalities while suppressing +modality-specific and identity-aware information that is valuable for Re-ID. To +address these issues, we propose a novel Modality Unifying Network (MUN) to +explore a robust auxiliary modality for VI-ReID. First, the auxiliary modality +is generated by combining the proposed cross-modality learner and +intra-modality learner, which can dynamically model the modality-specific and +modality-shared representations to alleviate both cross-modality and +intra-modality variations. Second, by aligning identity centres across the +three modalities, an identity alignment loss function is proposed to discover +the discriminative feature representations. Third, a modality alignment loss is +introduced to consistently reduce the distribution distance of visible and +infrared images by modality prototype modeling. Extensive experiments on +multiple public datasets demonstrate that the proposed method surpasses the +current state-of-the-art methods by a significant margin. + +
+
+ comment: 11 pages, 5 figures. Accepted as the poster paper in ICCV2023 +
+
+
+
+
+ + ☆ Enhancing Multi-modal Cooperation via Fine-grained Modality Valuation + + +
+ One primary topic of multi-modal learning is to jointly incorporate
+heterogeneous information from different modalities. However, most models often
+suffer from unsatisfactory multi-modal cooperation and fail to jointly utilize
+all modalities well. Some methods have been proposed to identify and enhance
+the worse-learnt modality, but they often fail to provide a fine-grained,
+theoretically supported observation of multi-modal cooperation at the sample
+level. Hence, it is essential to reasonably observe and improve the
+fine-grained cooperation between modalities, especially when facing realistic
+scenarios where the modality discrepancy could vary across different samples.
+To this end, we introduce a fine-grained modality valuation metric to evaluate
+the contribution of each modality at the sample level. Via modality valuation,
+we observe that multi-modal models tend to rely on one specific modality,
+resulting in the other modalities being low-contributing. We further analyze
+this issue and improve cooperation between modalities by enhancing the
+discriminative ability of low-contributing modalities in a targeted manner.
+Overall, our methods reasonably observe the fine-grained uni-modal contribution
+at the sample level and achieve considerable improvement on different
+multi-modal models.
+
+
+ comment: 7 pages +
+
+
+
+
+ + ☆ Use neural networks to recognize students' handwritten letters and + incorrect symbols + + +
+ Correcting students' multiple-choice answers is a repetitive and mechanical
+task that can be cast as an image multi-classification task. Assuming the
+possible options are 'abcd' and the correct option is one of the four, some
+students may write incorrect symbols or options that do not exist. In this
+paper, five classes were set up: four for the possible correct options and one
+for any other, incorrect writing. This approach takes into account the
+possibility of non-standard written options.
+
+
+
+
+
+ + ☆ Human Action Co-occurrence in Lifestyle Vlogs using Graph Link + Prediction + + +
+ We introduce the task of automatic human action co-occurrence identification, +i.e., determine whether two human actions can co-occur in the same interval of +time. We create and make publicly available the ACE (Action Co-occurrencE) +dataset, consisting of a large graph of ~12k co-occurring pairs of visual +actions and their corresponding video clips. We describe graph link prediction +models that leverage visual and textual information to automatically infer if +two actions are co-occurring. We show that graphs are particularly well suited +to capture relations between human actions, and the learned graph +representations are effective for our task and capture novel and relevant +information across different data domains. The ACE dataset and the code +introduced in this paper are publicly available at +https://github.com/MichiganNLP/vlog_action_co-occurrence. + +
+
+
+
+
+ + ☆ SGFeat: Salient Geometric Feature for Point Cloud Registration + + +
+ Point Cloud Registration (PCR) is a critical and challenging task in computer +vision. One of the primary difficulties in PCR is identifying salient and +meaningful points that exhibit consistent semantic and geometric properties +across different scans. Previous methods have encountered challenges with +ambiguous matching due to the similarity among patch blocks throughout the +entire point cloud and the lack of consideration for efficient global geometric +consistency. To address these issues, we propose a new framework that includes +several novel techniques. Firstly, we introduce a semantic-aware geometric +encoder that combines object-level and patch-level semantic information. This +encoder significantly improves registration recall by reducing ambiguity in +patch-level superpoint matching. Additionally, we incorporate a prior knowledge +approach that utilizes an intrinsic shape signature to identify salient points. +This enables us to extract the most salient super points and meaningful dense +points in the scene. Secondly, we introduce an innovative transformer that +encodes High-Order (HO) geometric features. These features are crucial for +identifying salient points within initial overlap regions while considering +global high-order geometric consistency. To optimize this high-order +transformer further, we introduce an anchor node selection strategy. By +encoding inter-frame triangle or polyhedron consistency features based on these +anchor nodes, we can effectively learn high-order geometric features of salient +super points. These high-order features are then propagated to dense points and +utilized by a Sinkhorn matching module to identify key correspondences for +successful registration. In our experiments conducted on well-known datasets +such as 3DMatch/3DLoMatch and KITTI, our approach has shown promising results, +highlighting the effectiveness of our novel method. + +
+
+
+
+
+ + ☆ Fast Sparse PCA via Positive Semidefinite Projection for Unsupervised + Feature Selection + + +
+ In the field of unsupervised feature selection, sparse principal component
+analysis (SPCA) methods have attracted more and more attention recently.
+Compared to spectral-based methods, SPCA methods do not rely on the
+construction of a similarity matrix and show better feature selection ability
+on real-world data. The original SPCA formulates a nonconvex optimization
+problem. Existing convex SPCA methods reformulate SPCA as a convex model by
+regarding the reconstruction matrix as an optimization variable. However, they
+lack constraints equivalent to the orthogonality restriction in SPCA, leading
+to a larger solution space. In this paper, it is proved that the optimal
+solution to a convex SPCA model falls onto the Positive Semidefinite (PSD)
+cone. A standard convex SPCA-based model with a PSD constraint for unsupervised
+feature selection is proposed. Further, a two-step fast optimization algorithm
+via PSD projection is presented to solve the proposed model. Two other existing
+convex SPCA-based models are also proven to have their optimal solutions on the
+PSD cone. Therefore, the PSD versions of these two models are proposed to
+accelerate their convergence as well. We also provide a regularization
+parameter setting strategy for our proposed method. Experiments on synthetic
+and real-world datasets demonstrate the effectiveness and efficiency of the
+proposed methods.
+
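+ A minimal sketch of the PSD projection step itself (the standard
+eigenvalue-clipping construction; the paper's full two-step solver involves
+more than this): a symmetric matrix is projected onto the positive semidefinite
+cone by clipping its negative eigenvalues to zero.
+
+import numpy as np
+
+def project_to_psd(A: np.ndarray) -> np.ndarray:
+    sym = (A + A.T) / 2.0                       # symmetrize first
+    w, V = np.linalg.eigh(sym)
+    return (V * np.clip(w, 0.0, None)) @ V.T    # V diag(max(w, 0)) V^T
+
+M = np.array([[2.0, 0.0], [0.0, -1.0]])
+print(project_to_psd(M))                        # [[2. 0.] [0. 0.]]
+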
+
+
+
+
+ + ☆ SCP: Scene Completion Pre-training for 3D Object Detection SP + + +
+ 3D object detection using LiDAR point clouds is a fundamental task in the +fields of computer vision, robotics, and autonomous driving. However, existing +3D detectors heavily rely on annotated datasets, which are both time-consuming +and prone to errors during the process of labeling 3D bounding boxes. In this +paper, we propose a Scene Completion Pre-training (SCP) method to enhance the +performance of 3D object detectors with less labeled data. SCP offers three key +advantages: (1) Improved initialization of the point cloud model. By completing +the scene point clouds, SCP effectively captures the spatial and semantic +relationships among objects within urban environments. (2) Elimination of the +need for additional datasets. SCP serves as a valuable auxiliary network that +does not impose any additional efforts or data requirements on the 3D +detectors. (3) Reduction of the amount of labeled data for detection. With the +help of SCP, the existing state-of-the-art 3D detectors can achieve comparable +performance while only relying on 20% labeled data. + +
+
+ comment: Wins the best paper award at ISPRS Geospatial Week 2023 +
+
+
+
+
+ + ☆ 360$^\circ$ from a Single Camera: A Few-Shot Approach for LiDAR + Segmentation ICCV + + +
+ Deep learning applications on LiDAR data suffer from a strong domain gap when +applied to different sensors or tasks. In order for these methods to obtain +similar accuracy on different data in comparison to values reported on public +benchmarks, a large scale annotated dataset is necessary. However, in practical +applications labeled data is costly and time consuming to obtain. Such factors +have triggered various research in label-efficient methods, but a large gap +remains to their fully-supervised counterparts. Thus, we propose ImageTo360, an +effective and streamlined few-shot approach to label-efficient LiDAR +segmentation. Our method utilizes an image teacher network to generate semantic +predictions for LiDAR data within a single camera view. The teacher is used to +pretrain the LiDAR segmentation student network, prior to optional fine-tuning +on 360$^\circ$ data. Our method is implemented in a modular manner on the point +level and as such is generalizable to different architectures. We improve over +the current state-of-the-art results for label-efficient methods and even +surpass some traditional fully-supervised segmentation networks. + +
+
+ comment: ICCV Workshop 2023 +
+
+
+
+
+ + ☆ A 3M-Hybrid Model for the Restoration of Unique Giant Murals: A Case + Study on the Murals of Yongle Palace + + +
+ The Yongle Palace murals, as valuable cultural heritage, have suffered
+varying degrees of damage, making their restoration of significant importance.
+However, the giant size and unique data of the Yongle Palace murals present
+challenges for existing deep learning-based restoration methods: 1) The
+distinctive style introduces domain bias in traditional transfer learning-based
+restoration methods, while the scarcity of mural data further limits the
+applicability of these methods. 2) Additionally, the giant size of these murals
+results in a wider range of defect types and sizes, necessitating models with
+greater adaptability. Consequently, there is a lack of focus on deep
+learning-based restoration methods for the unique giant murals of the Yongle
+Palace. Here, a 3M-Hybrid model is proposed to address these challenges.
+Firstly, motivated by the observation that the murals' frequency content is
+concentrated in distinct low- and high-frequency bands, low- and high-frequency
+features are abstracted separately for complementary learning. Furthermore, we
+integrate a pre-trained Vision Transformer (ViT) into the CNN module, allowing
+us to leverage the benefits of a large model while mitigating domain bias.
+Secondly, we mitigate seam and structural distortion issues resulting from the
+restoration of large defects by employing a multi-scale and multi-perspective
+strategy, including data segmentation and fusion. Experimental results
+demonstrate the efficacy of our proposed model. In regular-sized mural
+restoration, it improves SSIM and PSNR by 14.61% and 4.73%, respectively,
+compared to the best of four representative CNN models. Additionally, it
+achieves favorable results in the final restoration of giant murals.
+
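+ A minimal sketch of the low/high-frequency split referred to above (a simple
+Gaussian decomposition chosen for illustration; the paper's exact feature
+abstraction differs): the low-frequency branch sees a blurred mural patch and
+the high-frequency branch sees the residual detail.
+
+import numpy as np
+from scipy.ndimage import gaussian_filter
+
+def split_frequencies(patch: np.ndarray, sigma: float = 3.0):
+    """patch: (H, W) or (H, W, C) mural image patch with values in [0, 1]."""
+    channel_sigma = sigma if patch.ndim == 2 else (sigma, sigma, 0)  # do not blur across channels
+    low = gaussian_filter(patch, sigma=channel_sigma)
+    high = patch - low          # the residual carries edges and brush strokes
+    return low, high
+
+low, high = split_frequencies(np.random.rand(256, 256, 3))
+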
+
+
+
+
+ + ☆ Computer Vision Pipeline for Automated Antarctic Krill Analysis BMVC 2023 + + +
+ British Antarctic Survey (BAS) researchers launch annual expeditions to the +Antarctic in order to estimate Antarctic Krill biomass and assess the change +from previous years. These comparisons provide insight into the effects of the +current environment on this key component of the marine food chain. In this +work we have developed tools for automating the data collection and analysis +process, using web-based image annotation tools and deep learning image +classification and regression models. We achieve highly accurate krill instance +segmentation results with an average 77.28% AP score, as well as separate +maturity stage and length estimation of krill specimens with 62.99% accuracy +and a 1.96 mm length error respectively. + +
+
+ comment: Submitted to MVEO 2023 @ BMVC 2023 +
+
+
+
+
+ + ☆ Dual-Path Temporal Map Optimization for Make-up Temporal Video Grounding + + +
+ Make-up temporal video grounding (MTVG) aims to localize the target video
+segment which is semantically related to a sentence describing a make-up
+activity, given a long video. Compared with the general video grounding task,
+MTVG focuses on meticulous actions and changes on the face. The make-up
+instruction step, usually involving detailed differences in products and facial
+areas, is more fine-grained than general activities (e.g., cooking and
+furniture assembly). Thus, existing general approaches cannot locate the target
+activity effectively. More specifically, existing proposal generation modules
+are not yet fully developed in providing semantic cues for the more
+fine-grained make-up semantic comprehension. To tackle this issue, we propose
+an effective proposal-based framework named Dual-Path Temporal Map Optimization
+Network (DPTMO) to capture fine-grained multimodal semantic details of make-up
+activities. DPTMO extracts both query-agnostic and query-guided features to
+construct two proposal sets and uses specific evaluation methods for the two
+sets. Different from the commonly used single structure in previous methods,
+our dual-path structure can mine more semantic information in make-up videos
+and distinguish fine-grained actions well. These two candidate sets represent
+the cross-modal make-up video-text similarity and the multi-modal fusion
+relationship, complementing each other. Each set corresponds to its respective
+optimization perspective, and their joint prediction enhances the accuracy of
+video timestamp prediction. Comprehensive experiments on the YouMakeup dataset
+demonstrate that our proposed dual-path structure excels in fine-grained
+semantic comprehension.
+
+
+
+
+
+ + ☆ Elucidating the solution space of extended reverse-time SDE for + diffusion models + + +
+ Diffusion models (DMs) demonstrate potent image generation capabilities in +various generative modeling tasks. Nevertheless, their primary limitation lies +in slow sampling speed, requiring hundreds or thousands of sequential function +evaluations through large neural networks to generate high-quality images. +Sampling from DMs can be seen as solving corresponding stochastic differential +equations (SDEs) or ordinary differential equations (ODEs). In this work, we +formulate the sampling process as an extended reverse-time SDE (ER SDE), +unifying prior explorations into ODEs and SDEs. Leveraging the semi-linear +structure of ER SDE solutions, we offer exact solutions and arbitrarily +high-order approximate solutions for VP SDE and VE SDE, respectively. Based on +the solution space of the ER SDE, we yield mathematical insights elucidating +the superior performance of ODE solvers over SDE solvers in terms of fast +sampling. Additionally, we unveil that VP SDE solvers stand on par with their +VE SDE counterparts. Finally, we devise fast and training-free samplers, ER-SDE +Solvers, elevating the efficiency of stochastic samplers to unprecedented +levels. Experimental results demonstrate achieving 3.45 FID in 20 function +evaluations and 2.24 FID in 50 function evaluations on the ImageNet +64$\times$64 dataset. + +
+
+
+
+
+ + ☆ Certified Robust Models with Slack Control and Large Lipschitz Constants + + +
+ Despite recent success, state-of-the-art learning-based models remain highly
+vulnerable to input changes such as adversarial examples. In order to obtain
+certifiable robustness against such perturbations, recent work considers
+Lipschitz-based regularizers or constraints while at the same time increasing
+prediction margin. Unfortunately, this comes at the cost of significantly
+decreased accuracy. In this paper, we propose a Calibrated Lipschitz-Margin
+Loss (CLL) that addresses this issue and improves certified robustness by
+tackling two problems: Firstly, commonly used margin losses do not adjust their
+penalties to the shrinking output distribution caused by minimizing the
+Lipschitz constant $K$. Secondly, and most importantly, we observe that
+minimization of $K$ can lead to overly smooth decision functions. This limits
+the model's complexity and thus reduces accuracy. Our CLL addresses these
+issues by explicitly calibrating the loss w.r.t. margin and Lipschitz constant,
+thereby establishing full control over slack and improving robustness
+certificates even with larger Lipschitz constants. On CIFAR-10, CIFAR-100 and
+Tiny-ImageNet, our models consistently outperform losses that leave the
+constant unattended. On CIFAR-100 and Tiny-ImageNet, CLL improves upon
+state-of-the-art deterministic $L_2$ robust accuracies. In contrast to current
+trends, we unlock the potential of much smaller models without $K=1$
+constraints.
+
+
+ comment: To be published at GCPR 2023 +
+
+
+
+
+ + ☆ Active Label Refinement for Semantic Segmentation of Satellite Images + + +
+ Remote sensing through semantic segmentation of satellite images contributes +to the understanding and utilisation of the earth's surface. For this purpose, +semantic segmentation networks are typically trained on large sets of labelled +satellite images. However, obtaining expert labels for these images is costly. +Therefore, we propose to rely on a low-cost approach, e.g. crowdsourcing or +pretrained networks, to label the images in the first step. Since these initial +labels are partially erroneous, we use active learning strategies to +cost-efficiently refine the labels in the second step. We evaluate the active +learning strategies using satellite images of Bengaluru in India, labelled with +land cover and land use labels. Our experimental results suggest that an active +label refinement to improve the semantic segmentation network's performance is +beneficial. + +
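+ A minimal sketch of one standard active-learning query strategy consistent
+with the setting above (uncertainty sampling; the paper evaluates several
+strategies, and this is not necessarily the one it recommends): pick the image
+tiles whose predicted class distributions have the highest mean pixel entropy
+and send them for label refinement.
+
+import numpy as np
+
+def select_tiles_for_refinement(probs: np.ndarray, budget: int = 10):
+    """probs: (num_tiles, num_classes, H, W) softmax outputs of the segmentation net."""
+    eps = 1e-12
+    entropy = -(probs * np.log(probs + eps)).sum(axis=1)     # (num_tiles, H, W)
+    scores = entropy.mean(axis=(1, 2))                       # mean entropy per tile
+    return np.argsort(scores)[::-1][:budget]                 # most uncertain tiles first
+
+probs = np.random.dirichlet(np.ones(6), size=(50, 32, 32)).transpose(0, 3, 1, 2)
+picked = select_tiles_for_refinement(probs)
+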
+
+
+
+
+ + ☆ Improving Generalization Capability of Deep Learning-Based Nuclei + Instance Segmentation by Non-deterministic Train Time and Deterministic Test + Time Stain Normalization + + +
+ With the advent of digital pathology and microscopic systems that can scan +and save whole slide histological images automatically, there is a growing +trend to use computerized methods to analyze acquired images. Among different +histopathological image analysis tasks, nuclei instance segmentation plays a +fundamental role in a wide range of clinical and research applications. While +many semi- and fully-automatic computerized methods have been proposed for +nuclei instance segmentation, deep learning (DL)-based approaches have been +shown to deliver the best performances. However, the performance of such +approaches usually degrades when tested on unseen datasets. + In this work, we propose a novel approach to improve the generalization +capability of a DL-based automatic segmentation approach. Besides utilizing one +of the state-of-the-art DL-based models as a baseline, our method incorporates +non-deterministic train time and deterministic test time stain normalization. +We trained the model with one single training set and evaluated its +segmentation performance on seven test datasets. Our results show that the +proposed method provides up to 5.77%, 5.36%, and 5.27% better performance in +segmenting nuclei based on Dice score, aggregated Jaccard index, and panoptic +quality score, respectively, compared to the baseline segmentation model. + +
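+ A minimal sketch of a Reinhard-style stain normalization (one simple choice
+for illustration; the paper's train-time and test-time normalization schemes
+may differ): the per-channel mean and standard deviation of a patch are matched
+to those of a reference patch in LAB colour space. Drawing the reference at
+random during training and fixing it at test time would give the
+non-deterministic train / deterministic test behaviour described above.
+
+import numpy as np
+from skimage import color
+
+def reinhard_normalize(patch_rgb: np.ndarray, reference_rgb: np.ndarray) -> np.ndarray:
+    """patch_rgb, reference_rgb: float RGB images in [0, 1], shape (H, W, 3)."""
+    src, ref = color.rgb2lab(patch_rgb), color.rgb2lab(reference_rgb)
+    src_mu, src_sd = src.mean(axis=(0, 1)), src.std(axis=(0, 1)) + 1e-8
+    ref_mu, ref_sd = ref.mean(axis=(0, 1)), ref.std(axis=(0, 1))
+    normalized = (src - src_mu) / src_sd * ref_sd + ref_mu
+    return np.clip(color.lab2rgb(normalized), 0.0, 1.0)
+
+out = reinhard_normalize(np.random.rand(64, 64, 3), np.random.rand(64, 64, 3))
+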
+
+ comment: 17 pages +
+
+
+
+
+ + ☆ Towards Reliable Domain Generalization: A New Dataset and Evaluations + + +
+ There are ubiquitous distribution shifts in the real world. However, deep +neural networks (DNNs) are easily biased towards the training set, which causes +severe performance degradation when they receive out-of-distribution data. Many +methods are studied to train models that generalize under various distribution +shifts in the literature of domain generalization (DG). However, the recent +DomainBed and WILDS benchmarks challenged the effectiveness of these methods. +Aiming at the problems in the existing research, we propose a new domain +generalization task for handwritten Chinese character recognition (HCCR) to +enrich the application scenarios of DG method research. We evaluate eighteen DG +methods on the proposed PaHCC (Printed and Handwritten Chinese Characters) +dataset and show that the performance of existing methods on this dataset is +still unsatisfactory. Besides, under a designed dynamic DG setting, we reveal +more properties of DG methods and argue that only the leave-one-domain-out +protocol is unreliable. We advocate that researchers in the DG community refer +to dynamic performance of methods for more comprehensive and reliable +evaluation. Our dataset and evaluations bring new perspectives to the community +for more substantial progress. We will make our dataset public with the article +published to facilitate the study of domain generalization. + +
+
+
+
+
+ + ☆ Prompting4Debugging: Red-Teaming Text-to-Image Diffusion Models by + Finding Problematic Prompts + + +
+ Text-to-image diffusion models, e.g. Stable Diffusion (SD), lately have shown +remarkable ability in high-quality content generation, and become one of the +representatives for the recent wave of transformative AI. Nevertheless, such +advance comes with an intensifying concern about the misuse of this generative +technology, especially for producing copyrighted or NSFW (i.e. not safe for +work) images. Although efforts have been made to filter inappropriate +images/prompts or remove undesirable concepts/styles via model fine-tuning, the +reliability of these safety mechanisms against diversified problematic prompts +remains largely unexplored. In this work, we propose Prompting4Debugging (P4D) +as a debugging and red-teaming tool that automatically finds problematic +prompts for diffusion models to test the reliability of a deployed safety +mechanism. We demonstrate the efficacy of our P4D tool in uncovering new +vulnerabilities of SD models with safety mechanisms. Particularly, our result +shows that around half of prompts in existing safe prompting benchmarks which +were originally considered "safe" can actually be manipulated to bypass many +deployed safety mechanisms, including concept removal, negative prompt, and +safety guidance. Our findings suggest that, without comprehensive testing, the +evaluations on limited safe prompting benchmarks can lead to a false sense of +safety for text-to-image models. + +
+
+
+
+
+ + ☆ JOADAA: joint online action detection and action anticipation + + +
+ Action anticipation involves forecasting future actions by connecting past
+events to future ones. However, this reasoning ignores the real-life hierarchy
+of events, which is considered to be composed of three main parts: past,
+present, and future. We argue that considering these three main parts and their
+dependencies could improve performance. On the other hand, online action
+detection is the task of predicting actions in a streaming manner. In this
+case, one has access only to past and present information. Therefore, in online
+action detection (OAD) the existing approaches miss semantics or future
+information, which limits their performance. To sum up, for both of these
+tasks, the complete set of knowledge (past-present-future) is missing, which
+makes it challenging to infer action dependencies and therefore leads to low
+performance. To address this limitation, we propose to fuse both tasks into a
+single uniform architecture. By combining action anticipation and online action
+detection, our approach can cover the missing dependencies on future
+information in online action detection. This method, referred to as JOADAA,
+presents a uniform model that jointly performs action anticipation and online
+action detection. We validate our proposed model on three challenging datasets:
+THUMOS'14, which is a sparsely annotated dataset with one action per time step,
+CHARADES, and Multi-THUMOS, two densely annotated datasets with more complex
+scenarios. JOADAA achieves SOTA results on these benchmarks for both tasks.
+
+
+
+
+
+ + ☆ LEyes: A Lightweight Framework for Deep Learning-Based Eye Tracking + using Synthetic Eye Images + + +
+ Deep learning has bolstered gaze estimation techniques, but real-world +deployment has been impeded by inadequate training datasets. This problem is +exacerbated by both hardware-induced variations in eye images and inherent +biological differences across the recorded participants, leading to both +feature and pixel-level variance that hinders the generalizability of models +trained on specific datasets. While synthetic datasets can be a solution, their +creation is both time and resource-intensive. To address this problem, we +present a framework called Light Eyes or "LEyes" which, unlike conventional +photorealistic methods, only models key image features required for video-based +eye tracking using simple light distributions. LEyes facilitates easy +configuration for training neural networks across diverse gaze-estimation +tasks. We demonstrate that models trained using LEyes outperform other +state-of-the-art algorithms in terms of pupil and CR localization across +well-known datasets. In addition, a LEyes trained model outperforms the +industry standard eye tracker using significantly more cost-effective hardware. +Going forward, we are confident that LEyes will revolutionize synthetic data +generation for gaze estimation models, and lead to significant improvements of +the next generation video-based eye trackers. + +
+
+ comment: 31 pages, 8 figures +
+
+
+
+
+ + ☆ Dynamic Visual Prompt Tuning for Parameter Efficient Transfer Learning + + +
+ Parameter-efficient transfer learning (PETL) is an emerging research topic
+that aims to adapt large-scale pre-trained models to downstream tasks. Recent
+advances have achieved great success in saving storage and computation costs.
+However, these methods do not take into account instance-specific visual clues
+for visual tasks. In this paper, we propose a Dynamic Visual Prompt Tuning
+framework (DVPT), which can generate a dynamic instance-wise token for each
+image. In this way, it can capture the unique visual features of each image,
+which can be more suitable for downstream visual tasks. We design a Meta-Net
+module that can generate learnable prompts based on each image, thereby
+capturing dynamic instance-wise visual features. Extensive experiments on a
+wide range of downstream recognition tasks show that DVPT achieves superior
+performance compared to other PETL methods. More importantly, DVPT even
+outperforms full fine-tuning on 17 out of 19 downstream tasks while maintaining
+high parameter efficiency. Our code will be released soon.
+
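+ A minimal sketch of an instance-wise prompt generator in the spirit of the
+Meta-Net described above (names, dimensions and the mean-pooled descriptor are
+our assumptions, not the DVPT code): a small MLP maps a pooled image descriptor
+to a few prompt tokens that are prepended to the frozen backbone's token
+sequence.
+
+import torch
+import torch.nn as nn
+
+class MetaNetPrompt(nn.Module):
+    def __init__(self, feat_dim=768, num_prompts=4):
+        super().__init__()
+        self.num_prompts = num_prompts
+        self.net = nn.Sequential(nn.Linear(feat_dim, feat_dim // 4), nn.ReLU(),
+                                 nn.Linear(feat_dim // 4, num_prompts * feat_dim))
+
+    def forward(self, tokens):                      # tokens: (B, N, C) from a frozen ViT
+        pooled = tokens.mean(dim=1)                 # (B, C) instance-specific descriptor
+        prompts = self.net(pooled).view(-1, self.num_prompts, tokens.shape[-1])
+        return torch.cat([prompts, tokens], dim=1)  # (B, num_prompts + N, C)
+
+out = MetaNetPrompt()(torch.randn(2, 197, 768))
+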
+
+ comment: accepted by 2023 PRCV +
+
+
+
+
+ + ☆ C-RITNet: Set Infrared and Visible Image Fusion Free from Complementary + Information Mining + + +
+ Infrared and visible image fusion (IVIF) aims to extract and integrate the +complementary information in two different modalities to generate high-quality +fused images with salient targets and abundant texture details. However, +current image fusion methods go to great lengths to excavate complementary +features, which is generally achieved through two efforts. On the one hand, the +feature extraction network is expected to have excellent performance in +extracting complementary information. On the other hand, complex fusion +strategies are often designed to aggregate the complementary information. In +other words, enabling the network to perceive and extract complementary +information is extremely challenging. Complicated fusion strategies, while +effective, still run the risk of losing weak edge details. To this end, this +paper rethinks the IVIF outside the box, proposing a complementary-redundant +information transfer network (C-RITNet). It reasonably transfers complementary +information into redundant one, which integrates both the shared and +complementary features from two modalities. Hence, the proposed method is able +to alleviate the challenges posed by the complementary information extraction +and reduce the reliance on sophisticated fusion strategies. Specifically, to +skillfully sidestep aggregating complementary information in IVIF, we first +design the mutual information transfer (MIT) module to mutually represent +features from two modalities, roughly transferring complementary information +into redundant one. Then, a redundant information acquisition supervised by +source image (RIASSI) module is devised to further ensure the +complementary-redundant information transfer after MIT. Meanwhile, we also +propose a structure information preservation (SIP) module to guarantee that the +edge structure information of the source images can be transferred to the +fusion results. + +
+
+
+
+
+ + ☆ HOC-Search: Efficient CAD Model and Pose Retrieval from RGB-D Scans + + +
+ We present an automated and efficient approach for retrieving high-quality +CAD models of objects and their poses in a scene captured by a moving RGB-D +camera. We first investigate various objective functions to measure similarity +between a candidate CAD object model and the available data, and the best +objective function appears to be a "render-and-compare" method comparing depth +and mask rendering. We thus introduce a fast-search method that approximates an +exhaustive search based on this objective function for simultaneously +retrieving the object category, a CAD model, and the pose of an object given an +approximate 3D bounding box. This method involves a search tree that organizes +the CAD models and object properties including object category and pose for +fast retrieval and an algorithm inspired by Monte Carlo Tree Search, that +efficiently searches this tree. We show that this method retrieves CAD models +that fit the real objects very well, with a speed-up factor of 10x to 120x +compared to exhaustive search. + +
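+ A minimal sketch of a render-and-compare objective of the kind found to work
+best above (the concrete error terms and weighting here are illustrative
+assumptions, not the paper's exact function): a candidate CAD model's rendered
+depth map and silhouette are compared against the observed ones.
+
+import numpy as np
+
+def render_and_compare_score(rendered_depth, rendered_mask, observed_depth, observed_mask):
+    """All inputs are (H, W); masks are boolean, depths are in metres (0 = invalid)."""
+    inter = np.logical_and(rendered_mask, observed_mask)
+    union = np.logical_or(rendered_mask, observed_mask)
+    mask_iou = inter.sum() / max(union.sum(), 1)
+    valid = inter & (observed_depth > 0)
+    depth_err = np.abs(rendered_depth[valid] - observed_depth[valid]).mean() if valid.any() else np.inf
+    return mask_iou - 0.5 * depth_err        # higher is better; the weight is arbitrary
+
+# The tree search evaluates this score at its nodes instead of exhaustively
+# scoring every CAD model and pose.
+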
+
+
+
+
+ + ☆ Towards Visual Taxonomy Expansion + + +
+ Taxonomy expansion task is essential in organizing the ever-increasing volume +of new concepts into existing taxonomies. Most existing methods focus +exclusively on using textual semantics, leading to an inability to generalize +to unseen terms and the "Prototypical Hypernym Problem." In this paper, we +propose Visual Taxonomy Expansion (VTE), introducing visual features into the +taxonomy expansion task. We propose a textual hypernymy learning task and a +visual prototype learning task to cluster textual and visual semantics. In +addition to the tasks on respective modalities, we introduce a hyper-proto +constraint that integrates textual and visual semantics to produce fine-grained +visual semantics. Our method is evaluated on two datasets, where we obtain +compelling results. Specifically, on the Chinese taxonomy dataset, our method +significantly improves accuracy by 8.75 %. Additionally, our approach performs +better than ChatGPT on the Chinese taxonomy dataset. + +
+
+ comment: ACMMM accepted paper +
+
+
+
+
+ + ☆ Can we predict the Most Replayed data of video streaming platforms? ICCV 2023 + + +
+ Predicting which specific parts of a video users will replay is important for +several applications, including targeted advertisement placement on video +platforms and assisting video creators. In this work, we explore whether it is +possible to predict the Most Replayed (MR) data from YouTube videos. To this +end, we curate a large video benchmark, the YTMR500 dataset, which comprises +500 YouTube videos with MR data annotations. We evaluate Deep Learning (DL) +models of varying complexity on our dataset and perform an extensive ablation +study. In addition, we conduct a user study to estimate the human performance +on MR data prediction. Our results show that, although by a narrow margin, all +the evaluated DL models outperform random predictions. Additionally, they +exceed human-level accuracy. This suggests that predicting the MR data is a +difficult task that can be enhanced through the assistance of DL. Finally, we +believe that DL performance on MR data prediction can be further improved, for +example, by using multi-modal learning. We encourage the research community to +use our benchmark dataset to further investigate automatic MR data prediction. + +
+
+ comment: Accepted Extended Abstract at ICCV 2023 Workshop on AI for Creative + Video Editing and Understanding +
+
+
+
+
+ + ☆ Estimating exercise-induced fatigue from thermal facial images + + +
+ Exercise-induced fatigue resulting from physical activity can be an early
+indicator of overtraining, illness, or other health issues. In this article, we
+present an automated method for estimating exercise-induced fatigue levels
+through the use of thermal imaging and facial analysis techniques utilizing
+deep learning models. Leveraging a novel dataset comprising over 400,000
+thermal facial images of rested and fatigued users, our results suggest that
+exercise-induced fatigue levels could be predicted with only one static thermal
+frame with an average error smaller than 15%. The results emphasize the
+viability of using thermal imaging in conjunction with deep learning for
+reliable exercise-induced fatigue estimation.
+
</p>
+
+ comment: 5 pages +
+
+
+
+
+ + ☆ Plasticity-Optimized Complementary Networks for Unsupervised Continual + Learning WACV2024 + + +
+ Continuous unsupervised representation learning (CURL) research has greatly +benefited from improvements in self-supervised learning (SSL) techniques. As a +result, existing CURL methods using SSL can learn high-quality representations +without any labels, but with a notable performance drop when learning on a +many-tasks data stream. We hypothesize that this is caused by the +regularization losses that are imposed to prevent forgetting, leading to a +suboptimal plasticity-stability trade-off: they either do not adapt fully to +the incoming data (low plasticity), or incur significant forgetting when +allowed to fully adapt to a new SSL pretext-task (low stability). In this work, +we propose to train an expert network that is relieved of the duty of keeping +the previous knowledge and can focus on performing optimally on the new tasks +(optimizing plasticity). In the second phase, we combine this new knowledge +with the previous network in an adaptation-retrospection phase to avoid +forgetting and initialize a new expert with the knowledge of the old network. +We perform several experiments showing that our proposed approach outperforms +other CURL exemplar-free methods in few- and many-task split settings. +Furthermore, we show how to adapt our approach to semi-supervised continual +learning (Semi-SCL) and show that we surpass the accuracy of other +exemplar-free Semi-SCL methods and reach the results of some others that use +exemplars. + +
+
+ comment: Accepted at WACV2024 +
+
+
+
+
+ + ☆ A2V: A Semi-Supervised Domain Adaptation Framework for Brain Vessel + Segmentation via Two-Phase Training Angiography-to-Venography Translation + + +
+ We present a semi-supervised domain adaptation framework for brain vessel +segmentation from different image modalities. Existing state-of-the-art methods +focus on a single modality, despite the wide range of available cerebrovascular +imaging techniques. This can lead to significant distribution shifts that +negatively impact the generalization across modalities. By relying on annotated +angiographies and a limited number of annotated venographies, our framework +accomplishes image-to-image translation and semantic segmentation, leveraging a +disentangled and semantically rich latent space to represent heterogeneous data +and perform image-level adaptation from source to target domains. Moreover, we +reduce the typical complexity of cycle-based architectures and minimize the use +of adversarial training, which allows us to build an efficient and intuitive +model with stable training. We evaluate our method on magnetic resonance +angiographies and venographies. While achieving state-of-the-art performance in +the source domain, our method attains a Dice score coefficient in the target +domain that is only 8.9% lower, highlighting its promising potential for robust +cerebrovascular image segmentation across different modalities. + +
+
+
+
+
+ + ☆ Batch Implicit Neural Representation for MRI Parallel Reconstruction + + +
+ Magnetic resonance imaging (MRI) has always suffered from the problem of long
+acquisition time. MRI reconstruction is one solution to reduce scan time by
+skipping certain phase-encoding lines and then restoring high-quality images
+from undersampled measurements. Recently, implicit neural representation (INR)
+has emerged as a new deep learning method that represents an object as a
+continuous function of spatial coordinates, and this function is normally
+parameterized by a multilayer perceptron (MLP). In this paper, we propose a
+novel MRI reconstruction method based on INR, which represents the
+fully-sampled images as a function of pixel coordinates and prior feature
+vectors of undersampled images to overcome the generalization problem of
+INR. Specifically, we introduce a scale-embedded encoder to produce
+scale-independent pixel-specific features from MR images with different
+undersampling scales and then concatenate them with coordinate vectors to recover
+fully-sampled MR images via an MLP, thus achieving arbitrary-scale
+reconstruction. The performance of the proposed method was assessed by
+experiments on publicly available MRI datasets and compared with other
+reconstruction methods. Our quantitative evaluation demonstrates the
+superiority of the proposed method over alternative reconstruction methods.
+
</p>
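+ A minimal sketch of the core idea described above: an MLP that maps a pixel coordinate concatenated with a prior feature vector to an intensity value. This is an illustration, not the paper's code; the prior features, which would come from the scale-embedded encoder, are replaced here by random tensors.
```python
import torch
import torch.nn as nn

class CoordinateINR(nn.Module):
    """MLP f(coordinate, prior feature) -> intensity, as in coordinate-based INR."""
    def __init__(self, feat_dim=32, hidden=256):
        super().__init__()
        self.net = nn.Sequential(
            nn.Linear(2 + feat_dim, hidden), nn.ReLU(),
            nn.Linear(hidden, hidden), nn.ReLU(),
            nn.Linear(hidden, 1),
        )

    def forward(self, coords, feats):
        # coords: (N, 2) normalized pixel coordinates, feats: (N, feat_dim)
        return self.net(torch.cat([coords, feats], dim=-1))

# Toy training step; feats stand in for encoder outputs of the undersampled image.
N, feat_dim = 1024, 32
coords = torch.rand(N, 2) * 2 - 1        # coordinates in [-1, 1]
feats = torch.randn(N, feat_dim)         # stand-in for scale-embedded encoder features
target = torch.randn(N, 1)               # fully-sampled intensities (toy values)
model = CoordinateINR(feat_dim)
opt = torch.optim.Adam(model.parameters(), lr=1e-3)
loss = nn.functional.mse_loss(model(coords, feats), target)
loss.backward()
opt.step()
print(loss.item())
```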
+
+
+
+
+ + ☆ Selection of contributing factors for predicting landslide + susceptibility using machine learning and deep learning models + + +
+ Landslides are a common natural disaster that can cause casualties, property
+safety threats, and economic losses. Therefore, it is important to understand or
+predict the probability of landslide occurrence at potentially risky sites. A
+commonly used means is to carry out a landslide susceptibility assessment based
+on a landslide inventory and a set of landslide contributing factors. This can
+be readily achieved using machine learning (ML) models such as logistic
+regression (LR), support vector machine (SVM), random forest (RF), extreme
+gradient boosting (Xgboost), or deep learning (DL) models such as convolutional
+neural network (CNN) and long short-term memory (LSTM). As the input data for
+these models, landslide contributing factors have varying influences on
+landslide occurrence. Therefore, it is logically feasible to select more
+important contributing factors and eliminate less relevant ones, with the aim
+of increasing the prediction accuracy of these models. However, selecting more
+important factors is still a challenging task and there is no generally
+accepted method. Furthermore, the effects of factor selection using various
+methods on the prediction accuracy of ML and DL models are unclear. In this
+study, the impact of the selection of contributing factors on the accuracy of
+landslide susceptibility predictions using ML and DL models was investigated.
+Five methods for selecting contributing factors were considered for all the
+aforementioned ML and DL models, which included Information Gain Ratio (IGR),
+Recursive Feature Elimination (RFE), Particle Swarm Optimization (PSO), Least
+Absolute Shrinkage and Selection Operator (LASSO), and Harris Hawks Optimization
+(HHO). In addition, autoencoder-based factor selection methods for DL models
+were also investigated. To assess their performances, an exhaustive approach
+was adopted,...
+
</p>
+
+ comment: Stochastic Environmental Research and Risk Assessment +
+
+
+
+
+ + ☆ How does representation impact in-context learning: A exploration on a + synthetic task + + +
+ In-context learning, i.e., learning from in-context samples, is an impressive
+ability of Transformers. However, the mechanism driving in-context learning
+is not yet fully understood. In this study, we aim to investigate it from the
+underexplored perspective of representation learning. The representation is
+more complex in the in-context learning scenario, where the representation can be
+impacted by both model weights and in-context samples. We refer to these two
+conceptual aspects of the representation as the in-weights component and the in-context
+component, respectively. To study how the two components affect in-context
+learning capabilities, we construct a novel synthetic task, making it possible
+to devise two probes, an in-weights probe and an in-context probe, to evaluate the
+two components, respectively. We demonstrate that the goodness of the in-context
+component is highly related to in-context learning performance, which
+indicates the entanglement between in-context learning and representation
+learning. Furthermore, we find that a good in-weights component can actually
+benefit the learning of the in-context component, indicating that in-weights
+learning should be the foundation of in-context learning. To further understand
+the in-context learning mechanism and the importance of the in-weights
+component, we prove by construction that a simple Transformer, which uses
+a pattern-matching and copy-paste mechanism to perform in-context learning, can
+match the in-context learning performance of a more complex, best-tuned
+Transformer under the perfect in-weights component assumption. In short, these
+findings from the representation learning perspective shed light on new
+approaches to improving in-context capacity.
+
</p>
+
+
+
+
+ + ☆ Real-Time Semantic Segmentation: A Brief Survey & Comparative Study in + Remote Sensing + + +
+ Real-time semantic segmentation of remote sensing imagery is a challenging +task that requires a trade-off between effectiveness and efficiency. It has +many applications including tracking forest fires, detecting changes in land +use and land cover, crop health monitoring, and so on. With the success of +efficient deep learning methods (i.e., efficient deep neural networks) for +real-time semantic segmentation in computer vision, researchers have adopted +these efficient deep neural networks in remote sensing image analysis. This +paper begins with a summary of the fundamental compression methods for +designing efficient deep neural networks and provides a brief but comprehensive +survey, outlining the recent developments in real-time semantic segmentation of +remote sensing imagery. We examine several seminal efficient deep learning +methods, placing them in a taxonomy based on the network architecture design +approach. Furthermore, we evaluate the quality and efficiency of some existing +efficient deep neural networks on a publicly available remote sensing semantic +segmentation benchmark dataset, the OpenEarthMap. The experimental results of +an extensive comparative study demonstrate that most of the existing efficient +deep neural networks have good segmentation quality, but they suffer low +inference speed (i.e., high latency rate), which may limit their capability of +deployment in real-time applications of remote sensing image segmentation. We +provide some insights into the current trend and future research directions for +real-time semantic segmentation of remote sensing imagery. + +
+
+ comment: Submitted to IEEE GRSM +
+
+
+
+
+ + ☆ BatMan-CLR: Making Few-shots Meta-Learners Resilient Against Label Noise + + +
+ The negative impact of label noise is well studied in classical supervised
+learning yet remains an open research question in meta-learning. Meta-learners
+aim to adapt to unseen learning tasks by learning a good initial model in
+meta-training and consecutively fine-tuning it according to new tasks during
+meta-testing. In this paper, we present the first extensive analysis of the
+impact of varying levels of label noise on the performance of state-of-the-art
+meta-learners, specifically gradient-based $N$-way $K$-shot learners. We show
+that the accuracy of Reptile, iMAML, and foMAML drops by up to 42% on the
+Omniglot and CifarFS datasets when meta-training is affected by label noise. To
+strengthen the resilience against label noise, we propose two sampling
+techniques, namely manifold (Man) and batch manifold (BatMan), which transform
+the noisy supervised learners into semi-supervised ones to increase the utility
+of noisy labels. We first construct manifold samples of $N$-way
+$2$-contrastive-shot tasks through augmentation, learning the embedding via a
+contrastive loss in meta-training, and then perform classification through
+zeroing on the embedding in meta-testing. We show that our approach can
+effectively mitigate the impact of meta-training label noise. Even with 60%
+wrong labels, BatMan and Man can limit the meta-testing accuracy drop to
+2.5, 9.4, and 1.1 percentage points, respectively, with existing
+meta-learners across the Omniglot, CifarFS, and MiniImagenet datasets.
+
</p>
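+ The key ingredient is that the embedding is learned with a contrastive loss over augmented views, so noisy class labels are not consumed directly. A generic NT-Xent-style loss, sketched below, illustrates this; it is not the exact Man/BatMan construction from the paper.
```python
import torch
import torch.nn.functional as F

def nt_xent(z1, z2, temperature=0.5):
    """Contrastive (NT-Xent) loss between two augmented views of the same samples;
    labels are never used, which is why such objectives help under label noise."""
    z1, z2 = F.normalize(z1, dim=1), F.normalize(z2, dim=1)
    z = torch.cat([z1, z2], dim=0)                     # (2B, d)
    sim = z @ z.t() / temperature                      # pairwise cosine similarities
    sim.fill_diagonal_(float("-inf"))                  # remove self-pairs
    B = z1.size(0)
    targets = torch.cat([torch.arange(B, 2 * B), torch.arange(0, B)])
    return F.cross_entropy(sim, targets)

# Toy usage: z1/z2 would be embeddings of two augmentations of each support image.
z1, z2 = torch.randn(16, 64), torch.randn(16, 64)
print(nt_xent(z1, z2).item())
```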
+
+ comment: 10 pages,3 figures +
+
+
+
+
+ + ☆ Federated Learning for Large-Scale Scene Modeling with Neural Radiance + Fields + + +
+ We envision a system to continuously build and maintain a map based on +earth-scale neural radiance fields (NeRF) using data collected from vehicles +and drones in a lifelong learning manner. However, existing large-scale +modeling by NeRF has problems in terms of scalability and maintainability when +modeling earth-scale environments. Therefore, to address these problems, we +propose a federated learning pipeline for large-scale modeling with NeRF. We +tailor the model aggregation pipeline in federated learning for NeRF, thereby +allowing local updates of NeRF. In the aggregation step, the accuracy of the +clients' global pose is critical. Thus, we also propose global pose alignment +to align the noisy global pose of clients before the aggregation step. In +experiments, we show the effectiveness of the proposed pose alignment and the +federated learning pipeline on the large-scale scene dataset, Mill19. + +
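+ The aggregation step can be pictured as a FedAvg-style parameter average over client models. The sketch below is a generic illustration under that assumption; it omits the paper's NeRF-specific local updates and the proposed global pose alignment.
```python
import torch

def federated_average(client_state_dicts, weights=None):
    """FedAvg-style aggregation sketch: average client parameters, optionally
    weighted (e.g., by the number of frames each client observed)."""
    n = len(client_state_dicts)
    weights = weights or [1.0 / n] * n
    keys = client_state_dicts[0].keys()
    return {k: sum(w * sd[k] for w, sd in zip(weights, client_state_dicts))
            for k in keys}

# Toy usage with two tiny "client" models standing in for local NeRF updates.
clients = [torch.nn.Linear(4, 4).state_dict() for _ in range(2)]
global_state = federated_average(clients)
model = torch.nn.Linear(4, 4)
model.load_state_dict(global_state)
print(sorted(global_state.keys()))
```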
+
+
+
+
+ + ☆ A new meteor detection application robust to camera movements + + +
+ This article presents a new tool for the automatic detection of meteors. The Fast
+Meteor Detection Toolbox (FMDT) is able to detect meteor sightings by analyzing
+videos acquired by cameras onboard weather balloons or within airplanes with
+stabilization. The challenge consists in designing a processing chain composed
+of simple algorithms that are robust to the high fluctuation of the videos and
+that satisfy the constraints on power consumption (10 W) and real-time
+processing (25 frames per second).
+
</p>
+
+ comment: in French language, Groupe de Recherche et d'{\'E}tudes de Traitement + du Signal et des Images (GRETSI), Aug 2023, Grenoble, France +
+
+
+
+
+ + ☆ Learning from History: Task-agnostic Model Contrastive Learning for + Image Restoration + + +
+ Contrastive learning has emerged as a prevailing paradigm for high-level
+vision tasks, which, by introducing proper negative samples, has also been
+exploited for low-level vision tasks to achieve a compact optimization space to
+account for their ill-posed nature. However, existing methods rely on manually
+predefined, task-oriented negatives, which often exhibit pronounced
+task-specific biases. In this paper, we propose an innovative approach for the
+adaptive generation of negative samples directly from the target model itself,
+called ``learning from history''. We introduce the Self-Prior guided Negative
+loss for image restoration (SPNIR) to enable this approach. Our approach is
+task-agnostic and generic, making it compatible with any existing image
+restoration method or task. We demonstrate the effectiveness of our approach by
+retraining existing models with SPNIR. The results show significant
+improvements in image restoration across various tasks and architectures. For
+example, models retrained with SPNIR outperform the original FFANet and
+DehazeFormer by 3.41 dB and 0.57 dB on the RESIDE indoor dataset for image
+dehazing. Similarly, they achieve notable improvements of 0.47 dB on SPA-Data
+over IDT for image deraining and 0.12 dB on Manga109 for a 4x scale
+super-resolution over lightweight SwinIR, respectively. Code and retrained
+models are available at
+https://github.com/Aitical/Task-agnostic_Model_Contrastive_Learning_Image_Restoration.
+
</p>
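+ A hedged sketch of the general ``learning from history'' idea, in which the negative sample is the restoration produced by an earlier snapshot of the same model; the names and the exact loss form below are illustrative and not taken from the paper.
```python
import torch
import torch.nn as nn

def self_prior_negative_loss(output, target, negative, eps=1e-6):
    """'Learning from history' style loss sketch: the negative is the output of an
    earlier checkpoint of the same model, so no task-specific negatives are needed."""
    l1 = nn.functional.l1_loss
    pos = l1(output, target)                  # pull the output towards the clean target
    neg = l1(output, negative.detach())       # push it away from the historical output
    return pos / (neg + eps)                  # small when pos is small relative to neg

# Toy usage: `old_model` stands in for a frozen past snapshot of `model`.
x = torch.randn(2, 3, 32, 32)                 # degraded input
target = torch.randn(2, 3, 32, 32)            # clean reference
model = nn.Conv2d(3, 3, 3, padding=1)
old_model = nn.Conv2d(3, 3, 3, padding=1)
loss = self_prior_negative_loss(model(x), target, old_model(x))
loss.backward()
print(loss.item())
```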
+
+
+
+
+ + ☆ Feature Aggregation Network for Building Extraction from High-resolution + Remote Sensing Images + + +
+ The rapid advancement in high-resolution satellite remote sensing data +acquisition, particularly those achieving submeter precision, has uncovered the +potential for detailed extraction of surface architectural features. However, +the diversity and complexity of surface distributions frequently lead to +current methods focusing exclusively on localized information of surface +features. This often results in significant intraclass variability in boundary +recognition and between buildings. Therefore, the task of fine-grained +extraction of surface features from high-resolution satellite imagery has +emerged as a critical challenge in remote sensing image processing. In this +work, we propose the Feature Aggregation Network (FANet), concentrating on +extracting both global and local features, thereby enabling the refined +extraction of landmark buildings from high-resolution satellite remote sensing +imagery. The Pyramid Vision Transformer captures these global features, which +are subsequently refined by the Feature Aggregation Module and merged into a +cohesive representation by the Difference Elimination Module. In addition, to +ensure a comprehensive feature map, we have incorporated the Receptive Field +Block and Dual Attention Module, expanding the receptive field and intensifying +attention across spatial and channel dimensions. Extensive experiments on +multiple datasets have validated the outstanding capability of FANet in +extracting features from high-resolution satellite images. This signifies a +major breakthrough in the field of remote sensing image processing. We will +release our code soon. + +
+
+
+
+
+ + ☆ SoccerNet 2023 Challenges Results + + +
+ The SoccerNet 2023 challenges were the third annual video understanding +challenges organized by the SoccerNet team. For this third edition, the +challenges were composed of seven vision-based tasks split into three main +themes. The first theme, broadcast video understanding, is composed of three +high-level tasks related to describing events occurring in the video +broadcasts: (1) action spotting, focusing on retrieving all timestamps related +to global actions in soccer, (2) ball action spotting, focusing on retrieving +all timestamps related to the soccer ball change of state, and (3) dense video +captioning, focusing on describing the broadcast with natural language and +anchored timestamps. The second theme, field understanding, relates to the +single task of (4) camera calibration, focusing on retrieving the intrinsic and +extrinsic camera parameters from images. The third and last theme, player +understanding, is composed of three low-level tasks related to extracting +information about the players: (5) re-identification, focusing on retrieving +the same players across multiple views, (6) multiple object tracking, focusing +on tracking players and the ball through unedited video streams, and (7) jersey +number recognition, focusing on recognizing the jersey number of players from +tracklets. Compared to the previous editions of the SoccerNet challenges, tasks +(2-3-7) are novel, including new annotations and data, task (4) was enhanced +with more data and annotations, and task (6) now focuses on end-to-end +approaches. More information on the tasks, challenges, and leaderboards are +available on https://www.soccer-net.org. Baselines and development kits can be +found on https://github.com/SoccerNet. + +
+
+
+
+
+ + ☆ TSSAT: Two-Stage Statistics-Aware Transformation for Artistic Style + Transfer ACM MM 2023 + + +
+ Artistic style transfer aims to create new artistic images by rendering a +given photograph with the target artistic style. Existing methods learn styles +simply based on global statistics or local patches, lacking careful +consideration of the drawing process in practice. Consequently, the stylization +results either fail to capture abundant and diversified local style patterns, +or contain undesired semantic information of the style image and deviate from +the global style distribution. To address this issue, we imitate the drawing +process of humans and propose a Two-Stage Statistics-Aware Transformation +(TSSAT) module, which first builds the global style foundation by aligning the +global statistics of content and style features and then further enriches local +style details by swapping the local statistics (instead of local features) in a +patch-wise manner, significantly improving the stylization effects. Moreover, +to further enhance both content and style representations, we introduce two +novel losses: an attention-based content loss and a patch-based style loss, +where the former enables better content preservation by enforcing the semantic +relation in the content image to be retained during stylization, and the latter +focuses on increasing the local style similarity between the style and stylized +images. Extensive qualitative and quantitative experiments verify the +effectiveness of our method. + +
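+ The first stage, aligning the global statistics of content and style features, is essentially an AdaIN-style transformation. A minimal sketch of that stage is shown below (the patch-wise local statistics swap of the second stage is omitted, and this is our illustration rather than the authors' code).
```python
import torch

def align_global_statistics(content_feat, style_feat, eps=1e-5):
    """Stage-one style transformation sketch: match the channel-wise mean and
    standard deviation of content features to those of style features."""
    # content_feat, style_feat: (B, C, H, W) feature maps from an encoder
    c_mean = content_feat.mean(dim=(2, 3), keepdim=True)
    c_std = content_feat.std(dim=(2, 3), keepdim=True) + eps
    s_mean = style_feat.mean(dim=(2, 3), keepdim=True)
    s_std = style_feat.std(dim=(2, 3), keepdim=True) + eps
    return (content_feat - c_mean) / c_std * s_std + s_mean

# Toy usage with random feature maps standing in for encoder outputs.
content = torch.randn(1, 512, 32, 32)
style = torch.randn(1, 512, 32, 32)
print(align_global_statistics(content, style).shape)
```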
+
+ comment: Accepted by ACM MM 2023 +
+
+
+
+
+ + ☆ ATTA: Anomaly-aware Test-Time Adaptation for Out-of-Distribution + Detection in Segmentation + + +
+ Recent advancements in dense out-of-distribution (OOD) detection have
+primarily focused on scenarios where the training and testing datasets share a
+similar domain, with the assumption that no domain shift exists between them.
+However, in real-world situations, domain shift often exists and significantly
+affects the accuracy of existing OOD detection models. In
+this work, we propose a dual-level OOD detection framework to handle domain
+shift and semantic shift jointly. The first level distinguishes whether domain
+shift exists in the image by leveraging global low-level features, while the
+second level identifies pixels with semantic shift by utilizing dense
+high-level feature maps. In this way, we can selectively adapt the model to
+unseen domains as well as enhance the model's capacity in detecting novel classes.
+We validate the efficacy of our proposed method on several OOD segmentation
+benchmarks, including those with significant domain shifts and those without,
+observing consistent performance improvements across various baseline models.
+
</p>
+
+ comment: In submission +
+
+
+
+
+ + ☆ FLDNet: A Foreground-Aware Network for Polyp Segmentation Leveraging + Long-Distance Dependencies + + +
+ Given the close association between colorectal cancer and polyps, the
+diagnosis and identification of colorectal polyps play a critical role in the
+detection and surgical intervention of colorectal cancer. In this context, the
+automatic detection and segmentation of polyps from various colonoscopy images
+has emerged as a significant problem that has attracted broad attention.
+Current polyp segmentation techniques face several challenges: firstly, polyps
+vary in size, texture, color, and pattern; secondly, the boundaries between
+polyps and mucosa are usually blurred; and thirdly, existing studies have focused on
+learning the local features of polyps while ignoring the long-range
+dependencies of the features, as well as the local context and global
+contextual information of the combined features. To address these challenges,
+we propose FLDNet (Foreground-Long-Distance Network), a Transformer-based
+neural network that captures long-distance dependencies for accurate polyp
+segmentation. Specifically, the proposed model consists of three main modules:
+a pyramid-based Transformer encoder, a local context module, and a
+Foreground-Aware module. Multilevel features with long-distance dependency
+information are first captured by the pyramid-based Transformer encoder. On the
+high-level features, the local context module obtains the local characteristics
+related to the polyps by constructing different local context information. The
+coarse map obtained by decoding the reconstructed highest-level features guides
+the feature fusion process in the Foreground-Aware module of the high-level
+features to achieve foreground enhancement of the polyps. Our proposed method,
+FLDNet, was evaluated using seven metrics on common datasets and demonstrated
+superiority over state-of-the-art methods on widely-used evaluation measures.
+
</p>
+
+
+
+
+ + ☆ Self-supervised Extraction of Human Motion Structures via Frame-wise + Discrete Features + + +
+ The present paper proposes an encoder-decoder model for extracting the +structures of human motions represented by frame-wise discrete features in a +self-supervised manner. In the proposed method, features are extracted as codes +in a motion codebook without the use of human knowledge, and the relationship +between these codes can be visualized on a graph. Since the codes are expected +to be temporally sparse compared to the captured frame rate and can be shared +by multiple sequences, the proposed network model also addresses the need for +training constraints. Specifically, the model consists of self-attention layers +and a vector clustering block. The attention layers contribute to finding +sparse keyframes and discrete features as motion codes, which are then +extracted by vector clustering. The constraints are realized as training losses +so that the same motion codes can be as contiguous as possible and can be +shared by multiple sequences. In addition, we propose the use of causal +self-attention as a method by which to calculate attention for long sequences +consisting of numerous frames. In our experiments, the sparse structures of +motion codes were used to compile a graph that facilitates visualization of the +relationship between the codes and the differences between sequences. We then +evaluated the effectiveness of the extracted motion codes by applying them to +multiple recognition tasks and found that performance levels comparable to +task-optimized methods could be achieved by linear probing. + +
+
+
+
+
+ + ☆ Beyond Generation: Harnessing Text to Image Models for Object Detection + and Segmentation + + +
+ We propose a new paradigm to automatically generate training data with
+accurate labels at scale using text-to-image synthesis frameworks (e.g.,
+DALL-E, Stable Diffusion, etc.). The proposed approach decouples training data
+generation into foreground object generation and contextually coherent
+background generation. To generate foreground objects, we employ a
+straightforward textual template, incorporating the object class name as input
+prompts. This is fed into a text-to-image synthesis framework, producing
+various foreground images set against isolated backgrounds. A
+foreground-background segmentation algorithm is then used to generate
+foreground object masks. To generate context images, we begin by creating
+language descriptions of the context. This is achieved by applying an image
+captioning method to a small set of images representing the desired context.
+These textual descriptions are then transformed into a diverse array of context
+images via a text-to-image synthesis framework. Subsequently, we composite
+these with the foreground object masks produced in the initial step, utilizing
+a cut-and-paste method, to formulate the training data. We demonstrate the
+advantages of our approach on five object detection and segmentation datasets,
+including Pascal VOC and COCO. We found that detectors trained solely on
+synthetic data produced by our method achieve performance comparable to those
+trained on real data (Fig. 1). Moreover, a combination of real and synthetic
+data yields even better results. Further analysis indicates that the
+synthetic data distribution complements the real data distribution effectively.
+Additionally, we emphasize the compositional nature of our data generation
+approach in out-of-distribution and zero-shot data generation scenarios. We
+open-source our code at https://github.com/gyhandy/Text2Image-for-Detection
+
</p>
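+ The final cut-and-paste compositing step can be illustrated as follows; this is a toy sketch with synthetic arrays, whereas in the described pipeline the foreground, mask, and background would come from text-to-image synthesis and a segmentation step.
```python
import numpy as np

def paste_foreground(background, foreground, mask, top, left):
    """Cut-and-paste compositing sketch: place a synthesized foreground object (with
    its binary mask) onto a synthesized context image and return the composite
    together with a bounding-box label for detector training."""
    img = background.copy()
    h, w = foreground.shape[:2]
    region = img[top:top + h, left:left + w]
    region[mask] = foreground[mask]               # overwrite only the masked pixels
    ys, xs = np.nonzero(mask)
    box = (left + xs.min(), top + ys.min(), left + xs.max(), top + ys.max())
    return img, box                               # box = (x_min, y_min, x_max, y_max)

# Toy usage with synthetic arrays.
bg = np.zeros((256, 256, 3), np.uint8)
fg = np.full((64, 64, 3), 255, np.uint8)
mask = np.zeros((64, 64), bool)
mask[8:56, 8:56] = True
composite, box = paste_foreground(bg, fg, mask, top=100, left=120)
print(box)
```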
+
+ comment: Code in https://github.com/gyhandy/Text2Image-for-Detection +
+
+
+
+
+ + ☆ Language Models as Black-Box Optimizers for Vision-Language Models + + +
+ Vision-language models (VLMs) pre-trained on web-scale datasets have +demonstrated remarkable capabilities across a variety of vision and multimodal +tasks. Currently, fine-tuning methods for VLMs mainly operate in a white-box +setting, requiring access to model parameters for backpropagation. However, +many VLMs rely on proprietary data and are not open-source, which restricts the +use of white-box approaches for fine-tuning. Given that popular private large +language models (LLMs) like ChatGPT still offer a language-based user +interface, we aim to develop a novel fine-tuning approach for VLMs through +natural language prompts, thereby avoiding the need to access model parameters, +feature embeddings, or output logits. In this setup, we propose employing +chat-based LLMs as black-box optimizers to search for the best text prompt on +the illustrative task of few-shot image classification using CLIP. +Specifically, we adopt an automatic "hill-climbing" procedure that converges on +an effective prompt by evaluating the accuracy of current prompts and asking +LLMs to refine them based on textual feedback, all within a conversational +process without human-in-the-loop. In a challenging 1-shot learning setup, our +simple approach surpasses the white-box continuous prompting method CoOp by an +average of 1.5% across 11 datasets including ImageNet. Our approach also +outperforms OpenAI's manually crafted prompts and is more efficient than other +black-box methods like iterative APE. Additionally, we highlight the advantage +of conversational feedback incorporating both positive and negative prompts, +suggesting that LLMs can utilize the implicit "gradient" direction in textual +feedback for a more efficient search. Lastly, we find that the text prompts +generated through our strategy are not only more interpretable but also +transfer well across different CLIP architectures in a black-box manner. + +
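+ A minimal sketch of such a hill-climbing loop is given below; `evaluate_accuracy` and `ask_llm_for_new_prompts` are hypothetical placeholders standing in for the CLIP few-shot evaluation and the chat-LLM call, neither of which is shown here.
```python
import random

def evaluate_accuracy(prompt):
    """Placeholder: in the real setting this would run CLIP zero-shot classification
    on the few-shot task using `prompt` as the class template."""
    random.seed(hash(prompt) % (2 ** 32))
    return random.random()

def ask_llm_for_new_prompts(history, n=4):
    """Placeholder: in the real setting this would send the scored prompt history
    to a chat LLM and parse its suggested refinements."""
    best_prompt, _ = max(history, key=lambda p: p[1])
    return [f"{best_prompt} (variant {i})" for i in range(n)]

def hill_climb(seed_prompt, iterations=5):
    history = [(seed_prompt, evaluate_accuracy(seed_prompt))]
    for _ in range(iterations):
        for cand in ask_llm_for_new_prompts(history):
            history.append((cand, evaluate_accuracy(cand)))   # score each refinement
        history.sort(key=lambda p: p[1], reverse=True)
        history = history[:8]                                  # keep only the best prompts
    return history[0]

print(hill_climb("a photo of a {}"))
```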
+
+
+
+
+ + ☆ Knowledge-Guided Short-Context Action Anticipation in Human-Centric + Videos ICCV 2023 + + +
+ This work focuses on anticipating long-term human actions, particularly using +short video segments, which can speed up editing workflows through improved +suggestions while fostering creativity by suggesting narratives. To this end, +we imbue a transformer network with a symbolic knowledge graph for action +anticipation in video segments by boosting certain aspects of the transformer's +attention mechanism at run-time. Demonstrated on two benchmark datasets, +Breakfast and 50Salads, our approach outperforms current state-of-the-art +methods for long-term action anticipation using short video context by up to +9%. + +
+
+ comment: ICCV 2023 Workshop on AI for Creative Video Editing and Understanding +
+
+
+
+
+ + ☆ Combining deep learning and street view imagery to map smallholder crop + types AAAI-24 + + +
+ Accurate crop type maps are an essential source of information for monitoring +yield progress at scale, projecting global crop production, and planning +effective policies. To date, however, crop type maps remain challenging to +create in low and middle-income countries due to a lack of ground truth labels +for training machine learning models. Field surveys are the gold standard in +terms of accuracy but require an often-prohibitively large amount of time, +money, and statistical capacity. In recent years, street-level imagery, such as +Google Street View, KartaView, and Mapillary, has become available around the +world. Such imagery contains rich information about crop types grown at +particular locations and times. In this work, we develop an automated system to +generate crop type ground references using deep learning and Google Street View +imagery. The method efficiently curates a set of street view images containing +crop fields, trains a model to predict crop type by utilizing weakly-labelled +images from disparate out-of-domain sources, and combines predicted labels with +remote sensing time series to create a wall-to-wall crop type map. We show +that, in Thailand, the resulting country-wide map of rice, cassava, maize, and +sugarcane achieves an accuracy of 93%. As the availability of roadside imagery +expands, our pipeline provides a way to map crop types at scale around the +globe, especially in underserved smallholder regions. + +
+
+ comment: Submitted to AAAI-24: Special Track on AI for Social Impact +
+
+
+
+
+ + ☆ Introducing Shape Prior Module in Diffusion Model for Medical Image + Segmentation + + +
+ Medical image segmentation is critical for diagnosing and treating spinal +disorders. However, the presence of high noise, ambiguity, and uncertainty +makes this task highly challenging. Factors such as unclear anatomical +boundaries, inter-class similarities, and irrational annotations contribute to +this challenge. Achieving both accurate and diverse segmentation templates is +essential to support radiologists in clinical practice. In recent years, +denoising diffusion probabilistic modeling (DDPM) has emerged as a prominent +research topic in computer vision. It has demonstrated effectiveness in various +vision tasks, including image deblurring, super-resolution, anomaly detection, +and even semantic representation generation at the pixel level. Despite the +robustness of existing diffusion models in visual generation tasks, they still +struggle with discrete masks and their various effects. To address the need for +accurate and diverse spine medical image segmentation templates, we propose an +end-to-end framework called VerseDiff-UNet, which leverages the denoising +diffusion probabilistic model (DDPM). Our approach integrates the diffusion +model into a standard U-shaped architecture. At each step, we combine the +noise-added image with the labeled mask to guide the diffusion direction +accurately towards the target region. Furthermore, to capture specific +anatomical a priori information in medical images, we incorporate a shape a +priori module. This module efficiently extracts structural semantic information +from the input spine images. We evaluate our method on a single dataset of +spine images acquired through X-ray imaging. Our results demonstrate that +VerseDiff-UNet significantly outperforms other state-of-the-art methods in +terms of accuracy while preserving the natural features and variations of +anatomy. + +
+
+
+
+
+ + ☆ Deep evidential fusion with uncertainty quantification and contextual + discounting for multimodal medical image segmentation + + +
+ Single-modality medical images generally do not contain enough information to +reach an accurate and reliable diagnosis. For this reason, physicians generally +diagnose diseases based on multimodal medical images such as, e.g., PET/CT. The +effective fusion of multimodal information is essential to reach a reliable +decision and explain how the decision is made as well. In this paper, we +propose a fusion framework for multimodal medical image segmentation based on +deep learning and the Dempster-Shafer theory of evidence. In this framework, +the reliability of each single modality image when segmenting different objects +is taken into account by a contextual discounting operation. The discounted +pieces of evidence from each modality are then combined by Dempster's rule to +reach a final decision. Experimental results with a PET-CT dataset with +lymphomas and a multi-MRI dataset with brain tumors show that our method +outperforms the state-of-the-art methods in accuracy and reliability. + +
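+ For a two-class frame (say, tumor vs. background), discounting and Dempster's rule can be written in a few lines. The sketch below uses a single discount coefficient per source rather than the paper's full contextual discounting, and the numbers are purely illustrative.
```python
def discount(mass, alpha):
    """Simplified discounting: a source with reliability alpha keeps alpha of its
    mass and transfers the rest to total ignorance (the whole frame {a, b})."""
    return {"a": alpha * mass["a"], "b": alpha * mass["b"],
            "ab": alpha * mass["ab"] + (1 - alpha)}

def dempster_combine(m1, m2):
    """Dempster's rule on the frame {a, b} with focal sets {a}, {b}, {a, b}."""
    conflict = m1["a"] * m2["b"] + m1["b"] * m2["a"]
    k = 1.0 - conflict
    a = (m1["a"] * m2["a"] + m1["a"] * m2["ab"] + m1["ab"] * m2["a"]) / k
    b = (m1["b"] * m2["b"] + m1["b"] * m2["ab"] + m1["ab"] * m2["b"]) / k
    return {"a": a, "b": b, "ab": m1["ab"] * m2["ab"] / k}

# PET is confident about "tumor"; CT is less sure and is discounted more strongly.
m_pet = discount({"a": 0.8, "b": 0.1, "ab": 0.1}, alpha=0.9)
m_ct = discount({"a": 0.4, "b": 0.4, "ab": 0.2}, alpha=0.6)
print(dempster_combine(m_pet, m_ct))
```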
+
+
+
+
+ + ☆ Medical Image Segmentation with Belief Function Theory and Deep Learning + + +
+ Deep learning has shown promising contributions in medical image segmentation +with powerful learning and feature representation abilities. However, it has +limitations for reasoning with and combining imperfect (imprecise, uncertain, +and partial) information. In this thesis, we study medical image segmentation +approaches with belief function theory and deep learning, specifically focusing +on information modeling and fusion based on uncertain evidence. + First, we review existing belief function theory-based medical image +segmentation methods and discuss their advantages and challenges. Second, we +present a semi-supervised medical image segmentation framework to decrease the +uncertainty caused by the lack of annotations with evidential segmentation and +evidence fusion. Third, we compare two evidential classifiers, evidential +neural network and radial basis function network, and show the effectiveness of +belief function theory in uncertainty quantification; we use the two evidential +classifiers with deep neural networks to construct deep evidential models for +lymphoma segmentation. Fourth, we present a multimodal medical image fusion +framework taking into account the reliability of each MR image source when +performing different segmentation tasks using mass functions and contextual +discounting. + +
+
+ comment: Ph.D. Thesis +
+
+
+
+
+ + ☆ Quality-Agnostic Deepfake Detection with Intra-model Collaborative + Learning + + +
+ Deepfakes have recently raised a plethora of societal concerns over their
+possible security threats and dissemination of fake information. Much research
+on deepfake detection has been undertaken. However, detecting low quality as
+well as simultaneously detecting different qualities of deepfakes still remains
+a grave challenge. Most SOTA approaches are limited by using a single specific
+model for detecting a certain deepfake video quality type. When constructing
+multiple models with prior information about video quality, this kind of
+strategy incurs significant computational cost, as well as model and training
+data overhead. Further, it is neither scalable nor practical to deploy in
+real-world settings. In this work, we propose a universal intra-model
+collaborative learning framework to enable the effective and simultaneous
+detection of deepfakes of different qualities. That is, our approach is a
+quality-agnostic deepfake detection method, dubbed QAD. In particular, by
+observing the upper bound of the general error expectation, we maximize the
+dependency between intermediate representations of images from different
+quality levels via the Hilbert-Schmidt Independence Criterion. In addition, an
+Adversarial Weight Perturbation module is carefully devised to enable the model
+to be more robust against image corruption while boosting the overall model's
+performance. Extensive experiments over seven popular deepfake datasets
+demonstrate the superiority of our QAD model over prior SOTA benchmarks.
+
</p>
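+ The dependency term can be illustrated with a standard biased empirical HSIC estimator between two batches of representations; this is our sketch of that measure only, not the full QAD training loop or the Adversarial Weight Perturbation module.
```python
import torch

def rbf_gram(x, sigma=1.0):
    d2 = torch.cdist(x, x).pow(2)
    return torch.exp(-d2 / (2 * sigma ** 2))

def hsic(x, y, sigma=1.0):
    """Biased empirical HSIC: trace(K H L H) / (n - 1)^2, where K and L are kernel
    Gram matrices of the two representations and H is the centering matrix.
    Maximizing it encourages dependence between features of a high-quality and a
    low-quality view of the same image."""
    n = x.size(0)
    K, L = rbf_gram(x, sigma), rbf_gram(y, sigma)
    H = torch.eye(n) - torch.ones(n, n) / n
    return torch.trace(K @ H @ L @ H) / (n - 1) ** 2

# Toy usage: x and y would be intermediate features of the same faces at two
# compression levels, and -hsic(x, y) would be added to the training loss.
x, y = torch.randn(32, 128), torch.randn(32, 128)
print(hsic(x, y).item())
```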
+
+
+
+
+ + ☆ Enhancing Representation in Radiography-Reports Foundation Model: A + Granular Alignment Algorithm Using Masked Contrastive Learning + + +
+ Recently, multi-modal vision-language foundation models have gained
+significant attention in the medical field. While these models offer great
+opportunities, they still face a number of challenges, such as the requirement
+for fine-grained knowledge understanding in computer-aided diagnosis and the
+capability of utilizing very limited or no task-specific labeled data in
+real-world clinical applications. In this study, we present MaCo, a novel
+multi-modal medical foundation model that explores masked contrastive learning
+to achieve granular alignment and zero-shot learning for a variety of medical
+imaging tasks. MaCo incorporates a correlation weighting mechanism to adjust
+the correlation between masked image patches and their corresponding reports,
+thereby enhancing the representation learning capabilities. We evaluate MaCo on
+six well-known open-source X-ray datasets, and the experimental results show it
+outperforms seven state-of-the-art approaches for classification, segmentation,
+and zero-shot phrase grounding, demonstrating its great potential to promote a
+wide range of medical image analysis tasks.
+
</p>
+
+
+
+
+ + ☆ Adversarial Attacks Assessment of Salient Object Detection via Symbolic + Learning + + +
+ Machine learning is at the center of mainstream technology and outperforms +classical approaches to handcrafted feature design. Aside from its learning +process for artificial feature extraction, it has an end-to-end paradigm from +input to output, reaching outstandingly accurate results. However, security +concerns about its robustness to malicious and imperceptible perturbations have +drawn attention since its prediction can be changed entirely. Salient object +detection is a research area where deep convolutional neural networks have +proven effective but whose trustworthiness represents a significant issue +requiring analysis and solutions to hackers' attacks. Brain programming is a +kind of symbolic learning in the vein of good old-fashioned artificial +intelligence. This work provides evidence that symbolic learning robustness is +crucial in designing reliable visual attention systems since it can withstand +even the most intense perturbations. We test this evolutionary computation +methodology against several adversarial attacks and noise perturbations using +standard databases and a real-world problem of a shorebird called the Snowy +Plover portraying a visual attention task. We compare our methodology with five +different deep learning approaches, proving that they do not match the symbolic +paradigm regarding robustness. All neural networks suffer significant +performance losses, while brain programming stands its ground and remains +unaffected. Also, by studying the Snowy Plover, we remark on the importance of +security in surveillance activities regarding wildlife protection and +conservation. + +
+
+ comment: 14 pages, 8 figures, 6 tables, IEEE Transactions on Emerging Topics + in Computing, Accepted for publication +
+
+
+
+
+ + ☆ Hierarchical Conditional Semi-Paired Image-to-Image Translation For + Multi-Task Image Defect Correction On Shopping Websites ICIP 2023 + + +
+ On shopping websites, product images of low quality negatively affect
+customer experience. Although there is plenty of work on detecting images with
+different defects, few efforts have been dedicated to correcting those defects at
+scale. A major challenge is that there are thousands of product types and each
+has specific defects; therefore, building defect-specific models is unscalable.
+In this paper, we propose a unified Image-to-Image (I2I) translation model to
+correct multiple defects across different product types. Our model leverages an
+attention mechanism to hierarchically incorporate high-level defect groups and
+specific defect types to guide the network to focus on defect-related image
+regions. Evaluated on eight public datasets, our model reduces the Frechet
+Inception Distance (FID) by 24.6% on average compared with MoNCE, the
+state-of-the-art I2I method. Unlike public data, another practical challenge on
+shopping websites is that some paired images are of low quality. Therefore we
+design our model to be semi-paired by combining the L1 loss of paired data with
+the cycle loss of unpaired data. Tested on a shopping website dataset to
+correct three image defects, our model reduces FID by 63.2% on average
+compared with WS-I2I, the state-of-the-art semi-paired I2I method.
+
</p>
+
+ comment: 6 pages, 6 figures, 3 tables. To be published in ICIP 2023 +
+
+
+
+
+ + ☆ Generalized Attacks on Face Verification Systems + + +
+ Face verification (FV) using deep neural network models has made tremendous
+progress in recent years, surpassing human accuracy and seeing deployment in
+various applications such as border control and smartphone unlocking. However,
+FV systems are vulnerable to Adversarial Attacks, which manipulate input images
+to deceive these systems in ways usually unnoticeable to humans. This paper
+provides an in-depth study of attacks on FV systems. We introduce the
+DodgePersonation Attack that formulates the creation of face images that
+impersonate a set of given identities while avoiding being identified as any of
+the identities in a separate, disjoint set. A taxonomy is proposed to provide a
+unified view of different types of Adversarial Attacks against FV systems,
+including Dodging Attacks, Impersonation Attacks, and Master Face Attacks.
+Finally, we propose the ''One Face to Rule Them All'' Attack which implements
+the DodgePersonation Attack with state-of-the-art performance on a well-known
+scenario (Master Face Attack) and which can also be used for the new scenarios
+introduced in this paper. While the state-of-the-art Master Face Attack can
+produce a set of 9 images to cover 43.82% of the identities in their test
+database, with 9 images our attack can cover 57.27% to 58.5% of these
+identities while giving the attacker the choice of the identity to use to
+create the impersonation. Moreover, the 9 generated attack images appear
+identical to a casual observer.
+
</p>
+
+
+
+
+ + ☆ Accelerating Deep Neural Networks via Semi-Structured Activation + Sparsity + + +
+ The demand for efficient processing of deep neural networks (DNNs) on +embedded devices is a significant challenge limiting their deployment. +Exploiting sparsity in the network's feature maps is one of the ways to reduce +its inference latency. It is known that unstructured sparsity results in lower +accuracy degradation with respect to structured sparsity but the former needs +extensive inference engine changes to get latency benefits. To tackle this +challenge, we propose a solution to induce semi-structured activation sparsity +exploitable through minor runtime modifications. To attain high speedup levels +at inference time, we design a sparse training procedure with awareness of the +final position of the activations while computing the General Matrix +Multiplication (GEMM). We extensively evaluate the proposed solution across +various models for image classification and object detection tasks. Remarkably, +our approach yields a speed improvement of $1.25 \times$ with a minimal +accuracy drop of $1.1\%$ for the ResNet18 model on the ImageNet dataset. +Furthermore, when combined with a state-of-the-art structured pruning method, +the resulting models provide a good latency-accuracy trade-off, outperforming +models that solely employ structured pruning techniques. + +
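+ Semi-structured activation sparsity can be pictured as an N:M magnitude mask applied to activations before the next GEMM. The sketch below shows a generic 2:4-style mask; it does not reproduce the paper's GEMM-position-aware sparse training procedure.
```python
import torch

def n_of_m_activation_mask(x, n=2, m=4):
    """Keep the n largest-magnitude entries in every group of m consecutive
    activations (a 2:4-style semi-structured pattern) and zero the rest."""
    orig_shape = x.shape
    groups = x.reshape(-1, m)                      # assumes numel is divisible by m
    idx = groups.abs().topk(n, dim=1).indices      # positions to keep in each group
    mask = torch.zeros_like(groups).scatter_(1, idx, 1.0)
    return (groups * mask).reshape(orig_shape)

# Toy usage: sparsify an activation tensor before it feeds the next matrix multiply.
act = torch.randn(8, 16)
sparse_act = n_of_m_activation_mask(act)
print((sparse_act == 0).float().mean().item())     # roughly half the entries are zero
```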
+
+ comment: Code is available at http://github.com/Deeplite/activ-sparse +
+
+
+
+
+ + ☆ Multi-dimensional Fusion and Consistency for Semi-supervised Medical + Image Segmentation + + +
+ In this paper, we introduce a novel semi-supervised learning framework +tailored for medical image segmentation. Central to our approach is the +innovative Multi-scale Text-aware ViT-CNN Fusion scheme. This scheme adeptly +combines the strengths of both ViTs and CNNs, capitalizing on the unique +advantages of both architectures as well as the complementary information in +vision-language modalities. Further enriching our framework, we propose the +Multi-Axis Consistency framework for generating robust pseudo labels, thereby +enhancing the semi-supervised learning process. Our extensive experiments on +several widely-used datasets unequivocally demonstrate the efficacy of our +approach. + +
+
+
+
+
+ + ☆ Harmonic-NAS: Hardware-Aware Multimodal Neural Architecture Search on + Resource-constrained Devices ACML 2023 + + +
+ The recent surge of interest surrounding Multimodal Neural Networks (MM-NN) +is attributed to their ability to effectively process and integrate information +from diverse data sources. In MM-NN, features are extracted and fused from +multiple modalities using adequate unimodal backbones and specific fusion +networks. Although this helps strengthen the multimodal information +representation, designing such networks is labor-intensive. It requires tuning +the architectural parameters of the unimodal backbones, choosing the fusing +point, and selecting the operations for fusion. Furthermore, multimodality AI +is emerging as a cutting-edge option in Internet of Things (IoT) systems where +inference latency and energy consumption are critical metrics in addition to +accuracy. In this paper, we propose Harmonic-NAS, a framework for the joint +optimization of unimodal backbones and multimodal fusion networks with hardware +awareness on resource-constrained devices. Harmonic-NAS involves a two-tier +optimization approach for the unimodal backbone architectures and fusion +strategy and operators. By incorporating the hardware dimension into the +optimization, evaluation results on various devices and multimodal datasets +have demonstrated the superiority of Harmonic-NAS over state-of-the-art +approaches achieving up to 10.9% accuracy improvement, 1.91x latency reduction, +and 2.14x energy efficiency gain. + +
+
+ comment: Accepted to the 15th Asian Conference on Machine Learning (ACML 2023) +
+
+
+
+
+ + ☆ Rank2Tell: A Multimodal Driving Dataset for Joint Importance Ranking and + Reasoning + + +
+ The widespread adoption of commercial autonomous vehicles (AVs) and advanced
+driver assistance systems (ADAS) may largely depend on their acceptance by
+society, for which their perceived trustworthiness and interpretability to
+riders are crucial. In general, this task is challenging because modern
+autonomous systems software relies heavily on black-box artificial intelligence
+models. Towards this goal, this paper introduces a novel dataset, Rank2Tell, a
+multi-modal ego-centric dataset for Ranking the importance level and Telling
+the reason for the importance. Using various closed and open-ended visual
+question answering, the dataset provides dense annotations of various semantic,
+spatial, temporal, and relational attributes of various important objects in
+complex traffic scenarios. The dense annotations and unique attributes of the
+dataset make it a valuable resource for researchers working on visual scene
+understanding and related fields. Further, we introduce a joint model for
+importance level ranking and natural language caption generation to benchmark
+our dataset and demonstrate performance with quantitative evaluations.
+
</p>
+
+
+
+
+ + ☆ Zero-Shot Visual Classification with Guided Cropping + + +
+ Pretrained vision-language models, such as CLIP, show promising zero-shot +performance across a wide variety of datasets. For closed-set classification +tasks, however, there is an inherent limitation: CLIP image encoders are +typically designed to extract generic image-level features that summarize +superfluous or confounding information for the target tasks. This results in +degradation of classification performance, especially when objects of interest +cover small areas of input images. In this work, we propose CLIP with Guided +Cropping (GC-CLIP), where we use an off-the-shelf zero-shot object detection +model in a preprocessing step to increase focus of zero-shot classifier to the +object of interest and minimize influence of extraneous image regions. We +empirically show that our approach improves zero-shot classification results +across architectures and datasets, favorably for small objects. + +
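+ The preprocessing step amounts to cropping around a detector box (enlarged by a margin) before running the zero-shot classifier. A minimal sketch of that cropping step is shown below; the zero-shot detector that supplies the box and the CLIP classifier that consumes the crop are assumptions left out of the code.
```python
from PIL import Image

def guided_crop(image, box, margin=0.2):
    """Enlarge a detector box by a relative margin and crop the image, so the
    zero-shot classifier mostly sees the object of interest."""
    x0, y0, x1, y1 = box
    w, h = x1 - x0, y1 - y0
    x0 = max(0, x0 - margin * w)
    y0 = max(0, y0 - margin * h)
    x1 = min(image.width, x1 + margin * w)
    y1 = min(image.height, y1 + margin * h)
    return image.crop((int(x0), int(y0), int(x1), int(y1)))

# Toy usage: `box` would come from an off-the-shelf zero-shot object detector and
# the resulting crop would then be classified by CLIP.
img = Image.new("RGB", (640, 480))
crop = guided_crop(img, box=(200, 150, 260, 210))
print(crop.size)
```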
+
+
+
+
+ + ☆ AmodalSynthDrive: A Synthetic Amodal Perception Dataset for Autonomous + Driving + + +
+ Unlike humans, who can effortlessly estimate the entirety of objects even +when partially occluded, modern computer vision algorithms still find this +aspect extremely challenging. Leveraging this amodal perception for autonomous +driving remains largely untapped due to the lack of suitable datasets. The +curation of these datasets is primarily hindered by significant annotation +costs and mitigating annotator subjectivity in accurately labeling occluded +regions. To address these limitations, we introduce AmodalSynthDrive, a +synthetic multi-task multi-modal amodal perception dataset. The dataset +provides multi-view camera images, 3D bounding boxes, LiDAR data, and odometry +for 150 driving sequences with over 1M object annotations in diverse traffic, +weather, and lighting conditions. AmodalSynthDrive supports multiple amodal +scene understanding tasks including the introduced amodal depth estimation for +enhanced spatial understanding. We evaluate several baselines for each of these +tasks to illustrate the challenges and set up public benchmarking servers. The +dataset is available at http://amodalsynthdrive.cs.uni-freiburg.de. + +
+
+
+
+
+ + ☆ Strong-Weak Integrated Semi-supervision for Unsupervised Single and + Multi Target Domain Adaptation + + +
+ Unsupervised domain adaptation (UDA) focuses on transferring knowledge
+learned in the labeled source domain to the unlabeled target domain. Despite
+significant progress that has been achieved in single-target domain adaptation
+for image classification in recent years, the extension from single-target to
+multi-target domain adaptation is still a largely unexplored problem area. In
+general, unsupervised domain adaptation faces a major challenge when attempting
+to learn reliable information from a single unlabeled target domain. Increasing
+the number of unlabeled target domains further exacerbates the problem
+significantly. In this paper, we propose a novel strong-weak integrated
+semi-supervision (SWISS) learning strategy for image classification using
+unsupervised domain adaptation that works well for both single-target and
+multi-target scenarios. Under the proposed SWISS-UDA framework, a strong
+representative set with high-confidence but low-diversity target domain samples
+and a weak representative set with low-confidence but high-diversity target
+domain samples are updated constantly during the training process. Both sets
+are fused to generate an augmented strong-weak training batch with
+pseudo-labels to train the network during every iteration. The extension from
+single-target to multi-target domain adaptation is accomplished by exploring
+the class-wise distance relationship between domains and replacing the strong
+representative set with much stronger samples from peer domains via peer
+scaffolding. Moreover, a novel adversarial logit loss is proposed to reduce the
+intra-class divergence between source and target domains, which is
+back-propagated adversarially with a gradient reverse layer between the
+classifier and the rest of the network. Experimental results based on three
+benchmarks, Office-31, Office-Home, and DomainNet, show the effectiveness of
+the proposed SWISS framework.
+
</p>
+
+
+
+
+ + ☆ Ethnicity and Biometric Uniqueness: Iris Pattern Individuality in a West + African Database + + +
+ We conducted more than 1.3 million comparisons of iris patterns encoded from +images collected at two Nigerian universities, which constitute the newly +available African Human Iris (AFHIRIS) database. The purpose was to discover +whether ethnic differences in iris structure and appearance such as the +textural feature size, as contrasted with an all-Chinese image database or an +American database in which only 1.53% were of African-American heritage, made a +material difference for iris discrimination. We measured a reduction in entropy +for the AFHIRIS database due to the coarser iris features created by the thick +anterior layer of melanocytes, and we found stochastic parameters that +accurately model the relevant empirical distributions. Quantile-Quantile +analysis revealed that a very small change in operational decision thresholds +for the African database would compensate for the reduced entropy and generate +the same performance in terms of resistance to False Matches. We conclude that +despite demographic difference, individuality can be robustly discerned by +comparison of iris patterns in this West African population. + +
+
+ comment: 8 pages, 8 Figures +
+
+
+
+
+ + ☆ DF-TransFusion: Multimodal Deepfake Detection via Lip-Audio + Cross-Attention and Facial Self-Attention + + +
+ With the rise in manipulated media, deepfake detection has become an +imperative task for preserving the authenticity of digital content. In this +paper, we present a novel multi-modal audio-video framework designed to +concurrently process audio and video inputs for deepfake detection tasks. Our +model capitalizes on lip synchronization with input audio through a +cross-attention mechanism while extracting visual cues via a fine-tuned VGG-16 +network. Subsequently, a transformer encoder network is employed to perform +facial self-attention. We conduct multiple ablation studies highlighting +different strengths of our approach. Our multi-modal methodology outperforms +state-of-the-art multi-modal deepfake detection techniques in terms of F-1 and +per-video AUC scores. + +
+
+
+
+
+ + ☆ Action Segmentation Using 2D Skeleton Heatmaps + + +
+ This paper presents a 2D skeleton-based action segmentation method with
+applications in fine-grained human activity recognition. In contrast with
+state-of-the-art methods which directly take sequences of 3D skeleton
+coordinates as inputs and apply Graph Convolutional Networks (GCNs) for
+spatiotemporal feature learning, our main idea is to use sequences of 2D
+skeleton heatmaps as inputs and employ Temporal Convolutional Networks (TCNs)
+to extract spatiotemporal features. Despite lacking 3D information, our
+approach yields comparable or superior performance and better robustness
+against missing keypoints than previous methods on action segmentation
+datasets. Moreover, we further improve performance by using both 2D skeleton
+heatmaps and RGB videos as inputs. To the best of our knowledge, this is the
+first work to utilize 2D skeleton heatmap inputs and the first work to explore
+2D skeleton+RGB fusion for action segmentation.
+

+
+
+
+
+ + ♻ ☆ Measuring Self-Supervised Representation Quality for Downstream + Classification using Discriminative Features + + +
+ Self-supervised learning (SSL) has shown impressive results in downstream +classification tasks. However, there is limited work in understanding their +failure modes and interpreting their learned representations. In this paper, we +study the representation space of state-of-the-art self-supervised models +including SimCLR, SwaV, MoCo, BYOL, DINO, SimSiam, VICReg and Barlow Twins. +Without the use of class label information, we discover discriminative features +that correspond to unique physical attributes in images, present mostly in +correctly-classified representations. Using these features, we can compress the +representation space by up to 40% without significantly affecting linear +classification performance. We then propose Self-Supervised Representation +Quality Score (or Q-Score), an unsupervised score that can reliably predict if +a given sample is likely to be mis-classified during linear evaluation, +achieving AUPRC of 91.45 on ImageNet-100 and 78.78 on ImageNet-1K. Q-Score can +also be used as a regularization term on pre-trained encoders to remedy +low-quality representations. Fine-tuning with Q-Score regularization can boost +the linear probing accuracy of SSL models by up to 5.8% on ImageNet-100 and +3.7% on ImageNet-1K compared to their baselines. Finally, using gradient +heatmaps and Salient ImageNet masks, we define a metric to quantify the +interpretability of each representation. We show that discriminative features +are strongly correlated to core attributes and, enhancing these features +through Q-score regularization makes SSL representations more interpretable. + +
+
+
+
+
+ + ♻ ☆ Point-SLAM: Dense Neural Point Cloud-based SLAM ICCV 2023 + + +
+ We propose a dense neural simultaneous localization and mapping (SLAM)
+approach for monocular RGBD input which anchors the features of a neural scene
+representation in a point cloud that is iteratively generated in an
+input-dependent data-driven manner. We demonstrate that both tracking and
+mapping can be performed with the same point-based neural scene representation
+by minimizing an RGBD-based re-rendering loss. In contrast to recent dense
+neural SLAM methods which anchor the scene features in a sparse grid, our
+point-based approach allows dynamically adapting the anchor point density to
+the information density of the input. This strategy reduces runtime and memory
+usage in regions with fewer details and dedicates higher point density to
+resolve fine details. Our approach performs better than or competitively with
+existing dense neural RGBD SLAM methods in tracking, mapping and rendering
+accuracy on the Replica, TUM-RGBD and ScanNet datasets. The source code is
+available at https://github.com/eriksandstroem/Point-SLAM.
+

+
+ comment: ICCV 2023. 18 Pages, 12 Figures +
+
+
+
+
+ + ♻ ☆ You Only Label Once: 3D Box Adaptation from Point Cloud to Image via + Semi-Supervised Learning + + +
+ The image-based 3D object detection task expects that the predicted 3D +bounding box has a ``tightness'' projection (also referred to as cuboid), which +fits the object contour well on the image while still keeping the geometric +attribute on the 3D space, e.g., physical dimension, pairwise orthogonal, etc. +These requirements bring significant challenges to the annotation. Simply +projecting the Lidar-labeled 3D boxes to the image leads to non-trivial +misalignment, while directly drawing a cuboid on the image cannot access the +original 3D information. In this work, we propose a learning-based 3D box +adaptation approach that automatically adjusts minimum parameters of the +360$^{\circ}$ Lidar 3D bounding box to perfectly fit the image appearance of +panoramic cameras. With only a few 2D boxes annotation as guidance during the +training phase, our network can produce accurate image-level cuboid annotations +with 3D properties from Lidar boxes. We call our method ``you only label +once'', which means labeling on the point cloud once and automatically adapting +to all surrounding cameras. As far as we know, we are the first to focus on +image-level cuboid refinement, which balances the accuracy and efficiency well +and dramatically reduces the labeling effort for accurate cuboid annotation. +Extensive experiments on the public Waymo and NuScenes datasets show that our +method can produce human-level cuboid annotation on the image without needing +manual adjustment. + +
+
+
+
+
+ + ♻ ☆ Tracking Everything Everywhere All at Once ICCV 2023 + + +
+ We present a new test-time optimization method for estimating dense and +long-range motion from a video sequence. Prior optical flow or particle video +tracking algorithms typically operate within limited temporal windows, +struggling to track through occlusions and maintain global consistency of +estimated motion trajectories. We propose a complete and globally consistent +motion representation, dubbed OmniMotion, that allows for accurate, full-length +motion estimation of every pixel in a video. OmniMotion represents a video +using a quasi-3D canonical volume and performs pixel-wise tracking via +bijections between local and canonical space. This representation allows us to +ensure global consistency, track through occlusions, and model any combination +of camera and object motion. Extensive evaluations on the TAP-Vid benchmark and +real-world footage show that our approach outperforms prior state-of-the-art +methods by a large margin both quantitatively and qualitatively. See our +project page for more results: http://omnimotion.github.io/ + +
+
+ comment: ICCV 2023 +
+
+
+
+
+ + ♻ ☆ High-Fidelity Eye Animatable Neural Radiance Fields for Human Face BMVC2023 + + +
+ Face rendering using neural radiance fields (NeRF) is a rapidly developing +research area in computer vision. While recent methods primarily focus on +controlling facial attributes such as identity and expression, they often +overlook the crucial aspect of modeling eyeball rotation, which holds +importance for various downstream tasks. In this paper, we aim to learn a face +NeRF model that is sensitive to eye movements from multi-view images. We +address two key challenges in eye-aware face NeRF learning: how to effectively +capture eyeball rotation for training and how to construct a manifold for +representing eyeball rotation. To accomplish this, we first fit FLAME, a +well-established parametric face model, to the multi-view images considering +multi-view consistency. Subsequently, we introduce a new Dynamic Eye-aware NeRF +(DeNeRF). DeNeRF transforms 3D points from different views into a canonical +space to learn a unified face NeRF model. We design an eye deformation field +for the transformation, including rigid transformation, e.g., eyeball rotation, +and non-rigid transformation. Through experiments conducted on the ETH-XGaze +dataset, we demonstrate that our model is capable of generating high-fidelity +images with accurate eyeball rotation and non-rigid periocular deformation, +even under novel viewing angles. Furthermore, we show that utilizing the +rendered images can effectively enhance gaze estimation performance. + +
+
+ comment: BMVC2023 Oral +
+
+
+
+
+ + ♻ ☆ Plant Disease Detection using Region-Based Convolutional Neural Network + + +
+ Agriculture plays an important role in the food and economy of Bangladesh.
+The rapid growth of population over the years also has increased the demand for
+food production. One of the major reasons behind low crop production is
+numerous bacterial, viral and fungal plant diseases. Early detection of plant
+diseases and proper usage of pesticides and fertilizers are vital for
+preventing the diseases and boosting the yield. Most farmers apply generalized
+pesticides and fertilizers across entire fields without knowing the specific
+condition of the plants. This often increases the production cost and can even
+be detrimental to the yield. Deep Learning models are found to be very
+effective at automatically detecting plant diseases from images of plants,
+thereby reducing the need for human specialists. This paper aims at building a
+lightweight deep learning model for predicting leaf disease in tomato plants.
+By modifying the region-based convolutional neural network, we design an
+efficient and effective model that demonstrates satisfactory empirical
+performance on a benchmark dataset. Our proposed model can easily be deployed
+in a larger system where drones capture images of leaves and feed them into our
+model to assess the health condition of the plants.
+

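+
+ As a reference point for the region-based convolutional neural network family the
+paper modifies (this is only a stock baseline sketch, not the authors' lightweight
+model), a torchvision Faster R-CNN can be adapted to leaf-disease classes as follows;
+NUM_CLASSES is a placeholder value.
+
+# Sketch: adapting a stock torchvision Faster R-CNN to tomato leaf-disease
+# detection. Generic region-based CNN baseline only; class count is assumed.
+import torchvision
+from torchvision.models.detection.faster_rcnn import FastRCNNPredictor
+
+NUM_CLASSES = 10  # e.g. background + 9 disease categories (assumption)
+
+model = torchvision.models.detection.fasterrcnn_resnet50_fpn(weights="DEFAULT")
+in_features = model.roi_heads.box_predictor.cls_score.in_features
+# Swap the classification head so it predicts the leaf-disease classes.
+model.roi_heads.box_predictor = FastRCNNPredictor(in_features, NUM_CLASSES)
+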
+
+ comment: 23 pages +
+
+
+
+
+ + ♻ ☆ JOSA: Joint surface-based registration with atlas construction enables + accurate alignment of the brain geometry and function + + +
+ Surface-based cortical registration is an important topic in medical image +analysis and facilitates many downstream applications. Current approaches for +cortical registration are mainly driven by geometric features, such as sulcal +depth and curvature, and often assume that registration of folding patterns +leads to alignment of brain function. However, functional variability of +anatomically corresponding areas across subjects has been widely reported, +particularly in higher-order cognitive areas. In this work, we present JOSA, a +novel cortical registration framework that jointly models the mismatch between +geometry and function while simultaneously learning an unbiased +population-specific atlas. Using a semi-supervised training strategy, JOSA +achieves superior registration performance in both geometry and function +without requiring functional data at inference. This learning framework can be +extended to any auxiliary data to guide spherical registration that is +available during training but is difficult or impossible to obtain during +inference, such as parcellations, architectonic identity, transcriptomic +information, and molecular profiles. + +
+
+ comment: A. V. Dalca and B. Fischl are co-senior authors with equal + contribution +
+
+
+
+
+
+ ♻ ☆ ProbVLM: Probabilistic Adapter for Frozen Vision-Language Models ICCV 2023
+
+
+

+ Large-scale vision-language models (VLMs) like CLIP successfully find +correspondences between images and text. Through the standard deterministic +mapping process, an image or a text sample is mapped to a single vector in the +embedding space. This is problematic: as multiple samples (images or text) can +abstract the same concept in the physical world, deterministic embeddings do +not reflect the inherent ambiguity in the embedding space. We propose ProbVLM, +a probabilistic adapter that estimates probability distributions for the +embeddings of pre-trained VLMs via inter/intra-modal alignment in a post-hoc +manner without needing large-scale datasets or computing. On four challenging +datasets, i.e., COCO, Flickr, CUB, and Oxford-flowers, we estimate the +multi-modal embedding uncertainties for two VLMs, i.e., CLIP and BLIP, quantify +the calibration of embedding uncertainties in retrieval tasks and show that +ProbVLM outperforms other methods. Furthermore, we propose active learning and +model selection as two real-world downstream tasks for VLMs and show that the +estimated uncertainty aids both tasks. Lastly, we present a novel technique for +visualizing the embedding distributions using a large-scale pre-trained latent +diffusion model. Code is available at https://github.com/ExplainableML/ProbVLM. + +
+
+ comment: ICCV 2023 +
+
+
+
+
+ + ♻ ☆ Treatment-aware Diffusion Probabilistic Model for Longitudinal MRI + Generation and Diffuse Glioma Growth Prediction + + +
+ Diffuse gliomas are malignant brain tumors that grow widespread through the +brain. The complex interactions between neoplastic cells and normal tissue, as +well as the treatment-induced changes often encountered, make glioma tumor +growth modeling challenging. In this paper, we present a novel end-to-end +network capable of generating future tumor masks and realistic MRIs of how the +tumor will look at any future time points for different treatment plans. Our +model is built upon cutting-edge diffusion probabilistic models and +deep-segmentation neural networks. We extended a diffusion model to include +sequential multi-parametric MRI and treatment information as conditioning input +to guide the generative diffusion process. This allows us to estimate tumor +growth at any given time point. We trained the model using real-world +postoperative longitudinal MRI data with glioma tumor growth trajectories +represented as tumor segmentation maps over time. The model has demonstrated +promising performance across a range of tasks, including the generation of +high-quality synthetic MRIs with tumor masks, time-series tumor segmentations, +and uncertainty estimation. Combined with the treatment-aware generated MRIs, +the tumor growth predictions with uncertainty estimates can provide useful +information for clinical decision-making. + +
+
+ comment: 13 pages, 10 figures, 2 tables, 2 agls, preprints in the IEEE trans. + format for submission to IEEE-TMI +
+
+
+
+
+ + ♻ ☆ FreeMan: Towards Benchmarking 3D Human Pose Estimation in the Wild + + +
+ Estimating the 3D structure of the human body from natural scenes is a +fundamental aspect of visual perception. This task carries great importance for +fields like AIGC and human-robot interaction. In practice, 3D human pose +estimation in real-world settings is a critical initial step in solving this +problem. However, the current datasets, often collected under controlled +laboratory conditions using complex motion capture equipment and unvarying +backgrounds, are insufficient. The absence of real-world datasets is stalling +the progress of this crucial task. To facilitate the development of 3D pose +estimation, we present FreeMan, the first large-scale, real-world multi-view +dataset. FreeMan was captured by synchronizing 8 smartphones across diverse +scenarios. It comprises 11M frames from 8000 sequences, viewed from different +perspectives. These sequences cover 40 subjects across 10 different scenarios, +each with varying lighting conditions. We have also established an automated, +precise labeling pipeline that allows for large-scale processing efficiently. +We provide comprehensive evaluation baselines for a range of tasks, underlining +the significant challenges posed by FreeMan. Further evaluations of standard +indoor/outdoor human sensing datasets reveal that FreeMan offers robust +representation transferability in real and complex scenes. FreeMan is now +publicly available at https://wangjiongw.github.io/freeman. + +
+
+ comment: 18 pages, 9 figures. Project page: + https://wangjiongw.github.io/freeman/ ; API: + https://github.com/wangjiongw/FreeMan_API +
+
+
+
+
+ + ♻ ☆ TMComposites: Plug-and-Play Collaboration Between Specialized Tsetlin + Machines + + +
+ Tsetlin Machines (TMs) provide a fundamental shift from arithmetic-based to +logic-based machine learning. Supporting convolution, they deal successfully +with image classification datasets like MNIST, Fashion-MNIST, and CIFAR-2. +However, the TM struggles with getting state-of-the-art performance on CIFAR-10 +and CIFAR-100, representing more complex tasks. This paper introduces +plug-and-play collaboration between specialized TMs, referred to as TM +Composites. The collaboration relies on a TM's ability to specialize during +learning and to assess its competence during inference. When teaming up, the +most confident TMs make the decisions, relieving the uncertain ones. In this +manner, a TM Composite becomes more competent than its members, benefiting from +their specializations. The collaboration is plug-and-play in that members can +be combined in any way, at any time, without fine-tuning. We implement three TM +specializations in our empirical evaluation: Histogram of Gradients, Adaptive +Gaussian Thresholding, and Color Thermometers. The resulting TM Composite +increases accuracy on Fashion-MNIST by two percentage points, CIFAR-10 by +twelve points, and CIFAR-100 by nine points, yielding new state-of-the-art +results for TMs. Overall, we envision that TM Composites will enable an +ultra-low energy and transparent alternative to state-of-the-art deep learning +on more tasks and datasets. + +
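+
+ Our reading of the "most confident members decide" rule can be sketched as follows;
+the margin-based confidence measure and the per-member score arrays are assumptions
+for illustration, not the reference TM Composites implementation.
+
+# Illustrative composite decision rule: each specialized member produces
+# per-class scores, and the member with the largest top-2 score margin
+# (our proxy for confidence) makes the final decision.
+import numpy as np
+
+def composite_predict(member_scores):
+    """member_scores: list of 1-D arrays, one per specialized member,
+    each holding per-class scores for a single input."""
+    best_member, best_margin = None, -np.inf
+    for scores in member_scores:
+        top2 = np.sort(scores)[-2:]
+        margin = top2[1] - top2[0]      # confidence = margin between top-2 classes
+        if margin > best_margin:
+            best_margin, best_member = margin, scores
+    return int(np.argmax(best_member))
+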
+
+ comment: 8 pages, 6 figures +
+
+
+
+
+ + ♻ ☆ Fidelity of Interpretability Methods and Perturbation Artifacts in + Neural Networks + + +
+ Despite excellent performance of deep neural networks (DNNs) in image
+classification, detection, and prediction, characterizing how DNNs make a given
+decision remains an open problem, resulting in a number of interpretability
+methods. Post-hoc interpretability methods primarily aim to quantify the
+importance of input features with respect to the class probabilities. However,
+due to the lack of ground truth and the existence of interpretability methods
+with diverse operating characteristics, evaluating these methods is a crucial
+challenge. A popular approach to evaluate interpretability methods is to
+perturb input features deemed important for a given prediction and observe the
+decrease in accuracy. However, perturbation itself may introduce artifacts. We
+propose a method for estimating the impact of such artifacts on the fidelity
+estimation by utilizing model accuracy curves from perturbing input features
+according to the Most Important First (MIF) and Least Important First (LIF)
+orders. Using a ResNet-50 trained on ImageNet, we demonstrate the proposed
+fidelity estimation for four popular post-hoc interpretability methods.
+

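+
+ A minimal sketch of an MIF-style accuracy curve (illustrative only; the masking
+value and perturbation fractions are assumptions, not the paper's protocol): mask out
+the k most important input features according to an attribution map and record model
+accuracy as k grows.
+
+# Perturb the most important pixels first and track the accuracy drop (PyTorch).
+import torch
+
+def mif_curve(model, x, y, attribution, fractions=(0.1, 0.2, 0.4, 0.8)):
+    # x: (B, C, H, W) images, y: (B,) labels, attribution: (B, H, W) importance map
+    accs = []
+    flat_attr = attribution.flatten(1)                 # (B, H*W)
+    order = flat_attr.argsort(dim=1, descending=True)  # most important first
+    for frac in fractions:
+        k = int(frac * flat_attr.shape[1])
+        masked = x.clone().flatten(2)                  # (B, C, H*W)
+        idx = order[:, :k].unsqueeze(1).expand(-1, x.shape[1], -1)
+        masked.scatter_(2, idx, 0.0)                   # zero out the top-k pixels
+        with torch.no_grad():
+            preds = model(masked.view_as(x)).argmax(dim=1)
+        accs.append((preds == y).float().mean().item())
+    return accs
+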
+
+ comment: 11 pages, 5 figures +
+
+
+
+
+ + ♻ ☆ Revealing the preference for correcting separated aberrations in joint + optic-image design + + +
+ The joint design of the optical system and the downstream algorithm is a
+challenging and promising task. Due to the need to balance the global optimum
+of imaging systems against the computational cost of physical simulation,
+existing methods cannot achieve efficient joint design of complex systems such
+as smartphones and drones. In this work, starting from the perspective of
+optical design, we characterize the optics with separated aberrations.
+Additionally, to bridge the hardware and software without gradients, an image
+simulation system is presented to reproduce the genuine imaging procedure of
+lenses with large fields of view. As for aberration correction, we propose a
+network to perceive and correct the spatially varying aberrations and validate
+its superiority over state-of-the-art methods. Comprehensive experiments reveal
+that the preference for correcting separated aberrations in joint design is as
+follows: longitudinal chromatic aberration, lateral chromatic aberration,
+spherical aberration, field curvature, and coma, with astigmatism coming last.
+Drawing from this preference, a 10% reduction in the total track length of the
+consumer-level mobile phone lens module is accomplished. Moreover, this
+procedure spares more space for manufacturing deviations, realizing
+extreme-quality enhancement of computational photography. The optimization
+paradigm provides innovative insight into the practical joint design of
+sophisticated optical systems and post-processing algorithms.
+

+
+ comment: submitted to Optica +
+
+
+
+
+ + ♻ ☆ PSO-Convolutional Neural Networks with Heterogeneous Learning Rate + + +
+ Convolutional Neural Networks (ConvNets or CNNs) have been widely deployed
+in computer vision and related fields. Nevertheless, the training dynamics of
+these neural networks remain elusive: they are hard and computationally
+expensive to train. A myriad of architectures and training strategies have been
+proposed to overcome this challenge and address several problems in image
+processing such as speech, image and action recognition as well as object
+detection. In this article, we propose a novel Particle Swarm Optimization
+(PSO) based training for ConvNets. In this framework, the vector of weights of
+each ConvNet is cast as the position of a particle in phase space whereby PSO
+collaborative dynamics intertwines with Stochastic Gradient Descent (SGD) in
+order to boost training performance and generalization. Our approach goes as
+follows: i) [regular phase] each ConvNet is trained independently via SGD; ii)
+[collaborative phase] ConvNets share among themselves their current vector of
+weights (or particle position) along with their gradient estimates of the loss
+function. Distinct ConvNets adopt distinct step sizes. By properly blending
+ConvNets with large (possibly random) step sizes along with more conservative
+ones, we propose an algorithm with competitive performance with respect to
+other PSO-based approaches on Cifar-10 and Cifar-100 (accuracy of 98.31% and
+87.48%). These accuracy levels are obtained by resorting to only four ConvNets
+-- such results are expected to scale with the number of collaborative ConvNets
+accordingly. We make our source code available for download at
+https://github.com/leonlha/PSO-ConvNet-Dynamics.
+

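+
+ A toy sketch of the collaborative phase as we read it (the social coefficient and
+the pull-toward-best term are assumptions, not the paper's tuned dynamics): each
+ConvNet's weight vector is treated as a particle that follows its own SGD step while
+being nudged toward the best-performing peer.
+
+# Illustrative blended SGD/PSO update over flattened weight vectors (NumPy).
+import numpy as np
+
+def collaborative_step(weights, grads, losses, lr=0.01, c_social=0.5):
+    """weights: list of 1-D weight vectors, grads: matching gradient vectors,
+    losses: current loss of each ConvNet."""
+    best = weights[int(np.argmin(losses))]                   # best particle position
+    new_weights = []
+    for w, g in zip(weights, grads):
+        social = c_social * np.random.rand() * (best - w)    # pull toward best peer
+        new_weights.append(w - lr * g + social)              # SGD step + PSO term
+    return new_weights
+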
+
+ comment: 20 pages +
+
+
+
+
+ + ♻ ☆ Federated Learning for Medical Image Analysis: A Survey + + +
+ Machine learning in medical imaging often faces a fundamental dilemma, namely +the small sample size problem. Many recent studies suggest using multi-domain +data pooled from different acquisition sites/datasets to improve statistical +power. However, medical images from different sites cannot be easily shared to +build large datasets for model training due to privacy protection reasons. As a +promising solution, federated learning, which enables collaborative training of +machine learning models based on data from different sites without cross-site +data sharing, has attracted considerable attention recently. In this paper, we +conduct a comprehensive survey of the recent development of federated learning +methods in medical image analysis. We first introduce the background and +motivation of federated learning for dealing with privacy protection and +collaborative learning issues in medical imaging. We then present a +comprehensive review of recent advances in federated learning methods for +medical image analysis. Specifically, existing methods are categorized based on +three critical aspects of a federated learning system, including client end, +server end, and communication techniques. In each category, we summarize the +existing federated learning methods according to specific research problems in +medical image analysis and also provide insights into the motivations of +different approaches. In addition, we provide a review of existing benchmark +medical imaging datasets and software platforms for current federated learning +research. We also conduct an experimental study to empirically evaluate typical +federated learning methods for medical image analysis. This survey can help to +better understand the current research status, challenges and potential +research opportunities in this promising research field. + +
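+
+ As a minimal illustration of the federated learning recipe surveyed above (a
+FedAvg-style aggregation sketch under assumed inputs, not any specific method from the
+survey): each site trains locally, and only model weights, never images, are averaged
+on the server.
+
+# Weighted averaging of client model weights (PyTorch state dicts).
+import copy
+import torch
+
+def fed_avg(client_state_dicts, client_sizes):
+    """client_state_dicts: list of model.state_dict() from each site,
+    client_sizes: number of local samples per site (used as weights)."""
+    total = sum(client_sizes)
+    avg = copy.deepcopy(client_state_dicts[0])
+    for key in avg:
+        avg[key] = sum(sd[key].float() * (n / total)
+                       for sd, n in zip(client_state_dicts, client_sizes))
+    return avg
+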
+
+ comment: 17 pages, 6 figures +
+
+
+
+
+ + ♻ ☆ Leveraging Automatic Personalised Nutrition: Food Image Recognition + Benchmark and Dataset based on Nutrition Taxonomy + + +
+ Leading a healthy lifestyle has become one of the most challenging goals in
+today's society due to our sedentary lifestyle and poor eating habits. As a
+result, national and international organizations have made numerous efforts to
+promote healthier food diets and physical activity habits. However, these
+recommendations are sometimes difficult to follow in our daily life and they
+are also based on a general population. As a consequence, a new area of
+research, personalised nutrition, has emerged, focusing on individual solutions
+through smart devices and Artificial Intelligence (AI) methods.
+  This study presents the AI4Food-NutritionDB database, the first nutrition
+database that considers food images and a nutrition taxonomy based on
+recommendations by national and international organizations. In addition, four
+different categorisation levels are considered following nutrition experts: 6
+nutritional levels, 19 main categories (e.g., "Meat"), 73 subcategories (e.g.,
+"White Meat"), and 893 final food products (e.g., "Chicken"). The
+AI4Food-NutritionDB opens the doors to new food computing approaches in terms
+of food intake frequency, quality, and categorisation. In addition to the
+database, we also propose a standard experimental protocol and benchmark
+including three tasks based on the nutrition taxonomy (i.e., category,
+subcategory, and final product) to be used by the research community. Finally,
+we also release our Deep Learning models trained with the AI4Food-NutritionDB,
+which can be used as pre-trained models, achieving accurate recognition results
+with challenging food image databases.
+

+
+ comment: 10 pages, 3 figures, 4 tables +
+
+
+
+
+ + ♻ ☆ Collaborative Tracking Learning for Frame-Rate-Insensitive Multi-Object + Tracking ICCV 2023 + + +
+ Multi-object tracking (MOT) at low frame rates can reduce computational, +storage and power overhead to better meet the constraints of edge devices. Many +existing MOT methods suffer from significant performance degradation in +low-frame-rate videos due to significant location and appearance changes +between adjacent frames. To this end, we propose to explore collaborative +tracking learning (ColTrack) for frame-rate-insensitive MOT in a query-based +end-to-end manner. Multiple historical queries of the same target jointly track +it with richer temporal descriptions. Meanwhile, we insert an information +refinement module between every two temporal blocking decoders to better fuse +temporal clues and refine features. Moreover, a tracking object consistency +loss is proposed to guide the interaction between historical queries. Extensive +experimental results demonstrate that in high-frame-rate videos, ColTrack +obtains higher performance than state-of-the-art methods on large-scale +datasets Dancetrack and BDD100K, and outperforms the existing end-to-end +methods on MOT17. More importantly, ColTrack has a significant advantage over +state-of-the-art methods in low-frame-rate videos, which allows it to obtain +faster processing speeds by reducing frame-rate requirements while maintaining +higher performance. Code will be released at +https://github.com/yolomax/ColTrack + +
+
+ comment: Accepted by ICCV 2023 +
+
+
+
+
+ + ♻ ☆ A survey on deep learning in medical image registration: new + technologies, uncertainty, evaluation metrics, and beyond + + +
+ Deep learning technologies have dramatically reshaped the field of medical +image registration over the past decade. The initial developments, such as +ResNet-based and U-Net-based networks, established the foundation for deep +learning in image registration. Subsequent progress has been made in various +aspects of deep learning-based registration, including similarity measures, +deformation regularizations, and uncertainty estimation. These advancements +have not only enriched the field of image registration but have also +facilitated its application in a wide range of tasks, including atlas +construction, multi-atlas segmentation, motion estimation, and 2D-3D +registration. In this paper, we present a comprehensive overview of the most +recent advancements in deep learning-based image registration. We begin with a +concise introduction to the core concepts of deep learning-based image +registration. Then, we delve into innovative network architectures, loss +functions specific to registration, and methods for estimating registration +uncertainty. Additionally, this paper explores appropriate evaluation metrics +for assessing the performance of deep learning models in registration tasks. +Finally, we highlight the practical applications of these novel techniques in +medical imaging and discuss the future prospects of deep learning-based image +registration. + +
+
+
+
+
+ + ♻ ☆ Semantic-Guided Generative Image Augmentation Method with Diffusion + Models for Image Classification + + +
+ Existing image augmentation methods consist of two categories: +perturbation-based methods and generative methods. Perturbation-based methods +apply pre-defined perturbations to augment an original image, but only locally +vary the image, thus lacking image diversity. In contrast, generative methods +bring more image diversity in the augmented images but may not preserve +semantic consistency, thus incorrectly changing the essential semantics of the +original image. To balance image diversity and semantic consistency in +augmented images, we propose SGID, a Semantic-guided Generative Image +augmentation method with Diffusion models for image classification. +Specifically, SGID employs diffusion models to generate augmented images with +good image diversity. More importantly, SGID takes image labels and captions as +guidance to maintain semantic consistency between the augmented and original +images. Experimental results show that SGID outperforms the best augmentation +baseline by 1.72% on ResNet-50 (from scratch), 0.33% on ViT (ImageNet-21k), and +0.14% on CLIP-ViT (LAION-2B). Moreover, SGID can be combined with other image +augmentation baselines and further improves the overall performance. We +demonstrate the semantic consistency and image diversity of SGID through +quantitative human and automated evaluations, as well as qualitative case +studies. + +
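+
+ One way to realize label- and caption-guided generative augmentation with an
+off-the-shelf diffusion model is sketched below; the model id, strength, and prompt
+construction are assumptions for illustration, and this is not the SGID pipeline
+itself.
+
+# Image-to-image diffusion augmentation guided by the class label and caption.
+import torch
+from PIL import Image
+from diffusers import StableDiffusionImg2ImgPipeline
+
+pipe = StableDiffusionImg2ImgPipeline.from_pretrained(
+    "runwayml/stable-diffusion-v1-5", torch_dtype=torch.float16).to("cuda")
+
+init = Image.open("sample.jpg").convert("RGB").resize((512, 512))
+label, caption = "golden retriever", "a dog lying on the grass"   # placeholders
+prompt = f"a photo of a {label}, {caption}"   # label + caption as guidance
+augmented = pipe(prompt=prompt, image=init, strength=0.5,
+                 guidance_scale=7.5).images[0]
+augmented.save("sample_augmented.jpg")
+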
+
+ comment: 17 pages, 13 figures, 8 tables +
+
+
+
+
+ + ♻ ☆ Neural Fields for Interactive Visualization of Statistical Dependencies + in 3D Simulation Ensembles + + +
+ We present the first neural network that has learned to compactly represent +and can efficiently reconstruct the statistical dependencies between the values +of physical variables at different spatial locations in large 3D simulation +ensembles. Going beyond linear dependencies, we consider mutual information as +a measure of non-linear dependence. We demonstrate learning and reconstruction +with a large weather forecast ensemble comprising 1000 members, each storing +multiple physical variables at a 250 x 352 x 20 simulation grid. By +circumventing compute-intensive statistical estimators at runtime, we +demonstrate significantly reduced memory and computation requirements for +reconstructing the major dependence structures. This enables embedding the +estimator into a GPU-accelerated direct volume renderer and interactively +visualizing all mutual dependencies for a selected domain point. + +
+
+
+
+
+ + ♻ ☆ ShaRPy: Shape Reconstruction and Hand Pose Estimation from RGB-D with + Uncertainty ICCV + + +
+ Despite their potential, markerless hand tracking technologies are not yet +applied in practice to the diagnosis or monitoring of the activity in +inflammatory musculoskeletal diseases. One reason is that the focus of most +methods lies in the reconstruction of coarse, plausible poses, whereas in the +clinical context, accurate, interpretable, and reliable results are required. +Therefore, we propose ShaRPy, the first RGB-D Shape Reconstruction and hand +Pose tracking system, which provides uncertainty estimates of the computed +pose, e.g., when a finger is hidden or its estimate is inconsistent with the +observations in the input, to guide clinical decision-making. Besides pose, +ShaRPy approximates a personalized hand shape, promoting a more realistic and +intuitive understanding of its digital twin. Our method requires only a +light-weight setup with a single consumer-level RGB-D camera yet it is able to +distinguish similar poses with only small joint angle deviations in a +metrically accurate space. This is achieved by combining a data-driven dense +correspondence predictor with traditional energy minimization. To bridge the +gap between interactive visualization and biomedical simulation we leverage a +parametric hand model in which we incorporate biomedical constraints and +optimize for both, its pose and hand shape. We evaluate ShaRPy on a keypoint +detection benchmark and show qualitative results of hand function assessments +for activity monitoring of musculoskeletal diseases. + +
+
+ comment: Accepted at ICCVW (CVAMD) 2023 +
+
+
+
+
+ + ♻ ☆ DASS: Differentiable Architecture Search for Sparse neural networks + + +
+ The deployment of Deep Neural Networks (DNNs) on edge devices is hindered by
+the substantial gap between performance requirements and available processing
+power. While recent research has made significant strides in developing pruning
+methods to build a sparse network for reducing the computing overhead of DNNs,
+there remains considerable accuracy loss, especially at high pruning ratios. We
+find that the architectures designed for dense networks by differentiable
+architecture search methods are ineffective when pruning mechanisms are applied
+to them. The main reason is that current methods do not support sparse
+architectures in their search space and use a search objective that is tailored
+to dense networks and pays no attention to sparsity. In this paper, we propose
+a new method to search for sparsity-friendly neural architectures. We do this
+by adding two new sparse operations to the search space and modifying the
+search objective. We propose two novel parametric SparseConv and SparseLinear
+operations in order to expand the search space to include sparse operations.
+In particular, these operations make a flexible search space due to using
+sparse parametric versions of linear and convolution operations. The proposed
+search objective lets us train the architecture based on the sparsity of the
+search space operations. Quantitative analyses demonstrate that our searched
+architectures outperform those used in state-of-the-art sparse networks on the
+CIFAR-10 and ImageNet datasets. In terms of performance and hardware
+effectiveness, DASS increases the accuracy of the sparse version of
+MobileNet-v2 from 73.44% to 81.35% (+7.91% improvement) with 3.87x faster
+inference time.
+

+
+ comment: 18 pages with 12 figures +
+
+
+
+
+ + ♻ ☆ Learning to Augment: Hallucinating Data for Domain Generalized + Segmentation + + +
+ Domain generalized semantic segmentation (DGSS) is an essential but highly +challenging task, in which the model is trained only on source data and any +target data is not available. Existing DGSS methods primarily standardize the +feature distribution or utilize extra domain data for augmentation. However, +the former sacrifices valuable information and the latter introduces domain +biases. Therefore, generating diverse-style source data without auxiliary data +emerges as an attractive strategy. In light of this, we propose GAN-based +feature augmentation (GBFA) that hallucinates stylized feature maps while +preserving their semantic contents with a feature generator. The impressive +generative capability of GANs enables GBFA to perform inter-channel and +trainable feature synthesis in an end-to-end framework. To enable learning +GBFA, we introduce random image color augmentation (RICA), which adds a diverse +range of variations to source images during training. These augmented images +are then passed through a feature extractor to obtain features tailored for +GBFA training. Both GBFA and RICA operate exclusively within the source domain, +eliminating the need for auxiliary datasets. We conduct extensive experiments, +and the generalization results from the synthetic GTAV and SYNTHIA to the real +Cityscapes, BDDS, and Mapillary datasets show that our method achieves +state-of-the-art performance in DGSS. + +
+
+
+
+
+ + ♻ ☆ ThermRad: A Multi-modal Dataset for Robust 3D Object Detection under + Challenging Conditions + + +
+ Robust 3D object detection in extreme weather and illumination conditions is +a challenging task. While radars and thermal cameras are known for their +resilience to these conditions, few studies have been conducted on +radar-thermal fusion due to the lack of corresponding datasets. To address this +gap, we first present a new multi-modal dataset called ThermRad, which includes +a 3D LiDAR, a 4D radar, an RGB camera and a thermal camera. This dataset is +unique because it includes data from all four sensors in extreme weather +conditions, providing a valuable resource for future research in this area. To +validate the robustness of 4D radars and thermal cameras for 3D object +detection in challenging weather conditions, we propose a new multi-modal +fusion method called RTDF-RCNN, which leverages the complementary strengths of +4D radars and thermal cameras to boost object detection performance. To further +prove the effectiveness of our proposed framework, we re-implement +state-of-the-art (SOTA) 3D detectors on our dataset as benchmarks for +evaluation. Our method achieves significant enhancements in detecting cars, +pedestrians, and cyclists, with improvements of over 7.98%, 24.27%, and 27.15%, +respectively, while achieving comparable results to LiDAR-based approaches. Our +contributions in both the ThermRad dataset and the new multi-modal fusion +method provide a new approach to robust 3D object detection in adverse weather +and illumination conditions. The ThermRad dataset will be released. + +
+
+ comment: At this time, we have not reached a definitive agreement regarding + the ownership and copyright of this dataset. Due to the unresolved issue + regarding the dataset, I am writing to formally request the withdrawal of our + paper +
+
+
+
+
+ + ♻ ☆ BEVTrack: A Simple Baseline for 3D Single Object Tracking in Bird's-Eye + View + + +
+ 3D single object tracking (SOT) in point clouds is still a challenging +problem due to appearance variation, distractors, and high sparsity of point +clouds. Notably, in autonomous driving scenarios, the target object typically +maintains spatial adjacency across consecutive frames, predominantly moving +horizontally. This spatial continuity offers valuable prior knowledge for +target localization. However, existing trackers, which often employ point-wise +representations, struggle to efficiently utilize this knowledge owing to the +irregular format of such representations. Consequently, they require elaborate +designs and solving multiple subtasks to establish spatial correspondence. In +this paper, we introduce BEVTrack, a simple yet strong baseline framework for +3D SOT. After converting consecutive point clouds into the common Bird's-Eye +View representation, BEVTrack inherently encodes spatial proximity and adeptly +captures motion cues for tracking via a simple element-wise operation and +convolutional layers. Additionally, to better deal with objects having diverse +sizes and moving patterns, BEVTrack directly learns the underlying motion +distribution rather than making a fixed Laplacian or Gaussian assumption as in +previous works. Without bells and whistles, BEVTrack achieves state-of-the-art +performance on KITTI and NuScenes datasets while maintaining a high inference +speed of 122 FPS. The code will be released at +https://github.com/xmm-prio/BEVTrack. + +
+
+ comment: Technical report. Work in progress. Typo correction. The code will be + released at https://github.com/xmm-prio/BEVTrack +
+
+
+
+
+ + ♻ ☆ MedShapeNet -- A Large-Scale Dataset of 3D Medical Shapes for Computer + Vision + + +
+ We present MedShapeNet, a large collection of anatomical shapes (e.g., bones, +organs, vessels) and 3D surgical instrument models. Prior to the deep learning +era, the broad application of statistical shape models (SSMs) in medical image +analysis is evidence that shapes have been commonly used to describe medical +data. Nowadays, however, state-of-the-art (SOTA) deep learning algorithms in +medical imaging are predominantly voxel-based. In computer vision, on the +contrary, shapes (including, voxel occupancy grids, meshes, point clouds and +implicit surface models) are preferred data representations in 3D, as seen from +the numerous shape-related publications in premier vision conferences, such as +the IEEE/CVF Conference on Computer Vision and Pattern Recognition (CVPR), as +well as the increasing popularity of ShapeNet (about 51,300 models) and +Princeton ModelNet (127,915 models) in computer vision research. MedShapeNet is +created as an alternative to these commonly used shape benchmarks to facilitate +the translation of data-driven vision algorithms to medical applications, and +it extends the opportunities to adapt SOTA vision algorithms to solve critical +medical problems. Besides, the majority of the medical shapes in MedShapeNet +are modeled directly on the imaging data of real patients, and therefore it +complements well existing shape benchmarks comprising of computer-aided design +(CAD) models. MedShapeNet currently includes more than 100,000 medical shapes, +and provides annotations in the form of paired data. It is therefore also a +freely available repository of 3D models for extended reality (virtual reality +- VR, augmented reality - AR, mixed reality - MR) and medical 3D printing. This +white paper describes in detail the motivations behind MedShapeNet, the shape +acquisition procedures, the use cases, as well as the usage of the online shape +search portal: https://medshapenet.ikim.nrw/ + +
+
+ comment: 21 pages +
+
+
+
+
+ + ♻ ☆ StyleDomain: Efficient and Lightweight Parameterizations of StyleGAN for + One-shot and Few-shot Domain Adaptation ICCV 2023 + + +
+ Domain adaptation of GANs is a problem of fine-tuning GAN models pretrained +on a large dataset (e.g. StyleGAN) to a specific domain with few samples (e.g. +painting faces, sketches, etc.). While there are many methods that tackle this +problem in different ways, there are still many important questions that remain +unanswered. In this paper, we provide a systematic and in-depth analysis of the +domain adaptation problem of GANs, focusing on the StyleGAN model. We perform a +detailed exploration of the most important parts of StyleGAN that are +responsible for adapting the generator to a new domain depending on the +similarity between the source and target domains. As a result of this study, we +propose new efficient and lightweight parameterizations of StyleGAN for domain +adaptation. Particularly, we show that there exist directions in StyleSpace +(StyleDomain directions) that are sufficient for adapting to similar domains. +For dissimilar domains, we propose Affine+ and AffineLight+ parameterizations +that allows us to outperform existing baselines in few-shot adaptation while +having significantly less training parameters. Finally, we examine StyleDomain +directions and discover their many surprising properties that we apply for +domain mixing and cross-domain image morphing. Source code can be found at +https://github.com/AIRI-Institute/StyleDomain. + +
+
+ comment: Accepted to ICCV 2023 +
+
+
+
+
+ + ♻ ☆ JL-lemma derived Optimal Projections for Discriminative Dictionary + Learning + + +
+ To overcome difficulties in classifying data of large dimensionality with a
+large number of classes, we propose a novel approach called JLSPCADL. This
+paper uses the Johnson-Lindenstrauss (JL) Lemma to select the dimensionality of
+a transformed space in which a discriminative dictionary can be learned for
+signal classification. Rather than reducing dimensionality via random
+projections, as is often done with JL, we use a projection transformation
+matrix derived from Modified Supervised PC Analysis (M-SPCA) with the
+JL-prescribed dimension.
+  JLSPCADL provides a heuristic to deduce suitable distortion levels and the
+corresponding Suitable Description Length (SDL) of dictionary atoms, yielding
+an optimal feature space for better classification. Unlike state-of-the-art
+dimensionality reduction-based dictionary learning methods, a projection
+transformation matrix derived in a single step from M-SPCA provides maximum
+feature-label consistency of the transformed space while preserving the cluster
+structure of the original data. Despite confusing class pairs, the dictionary
+for the transformed space generates discriminative sparse coefficients with
+fewer training samples. Experimentation demonstrates that JLSPCADL scales well
+with an increasing number of classes and dimensionality. Improved label
+consistency of features due to M-SPCA helps to classify better. Further, the
+complexity of training a discriminative dictionary is significantly reduced by
+using the SDL. Experimentation on OCR and face recognition datasets shows
+relatively better classification performance than other supervised dictionary
+learning algorithms.
+

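+
+ The JL-prescribed dimension mentioned above can be computed with scikit-learn's
+bound; the sketch below only illustrates the lemma (the sample count and distortion
+levels are example values), while the paper fills those dimensions with an
+M-SPCA-derived projection rather than a random one.
+
+# Minimum embedding dimension that preserves pairwise distances up to eps.
+from sklearn.random_projection import johnson_lindenstrauss_min_dim
+
+n_samples = 10000  # example dataset size
+for eps in (0.1, 0.2, 0.3):
+    d = johnson_lindenstrauss_min_dim(n_samples=n_samples, eps=eps)
+    print(f"eps={eps}: project into at least {d} dimensions")
+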
+
+
+
+
+ + ♻ ☆ Biomedical image analysis competitions: The state of current + participation practice + + +
+ The number of international benchmarking competitions is steadily increasing +in various fields of machine learning (ML) research and practice. So far, +however, little is known about the common practice as well as bottlenecks faced +by the community in tackling the research questions posed. To shed light on the +status quo of algorithm development in the specific field of biomedical imaging +analysis, we designed an international survey that was issued to all +participants of challenges conducted in conjunction with the IEEE ISBI 2021 and +MICCAI 2021 conferences (80 competitions in total). The survey covered +participants' expertise and working environments, their chosen strategies, as +well as algorithm characteristics. A median of 72% challenge participants took +part in the survey. According to our results, knowledge exchange was the +primary incentive (70%) for participation, while the reception of prize money +played only a minor role (16%). While a median of 80 working hours was spent on +method development, a large portion of participants stated that they did not +have enough time for method development (32%). 25% perceived the infrastructure +to be a bottleneck. Overall, 94% of all solutions were deep learning-based. Of +these, 84% were based on standard architectures. 43% of the respondents +reported that the data samples (e.g., images) were too large to be processed at +once. This was most commonly addressed by patch-based training (69%), +downsampling (37%), and solving 3D analysis tasks as a series of 2D tasks. +K-fold cross-validation on the training set was performed by only 37% of the +participants and only 50% of the participants performed ensembling based on +multiple identical models (61%) or heterogeneous models (39%). 48% of the +respondents applied postprocessing steps. + +
+
+
+
+
+ + ♻ ☆ Avatar Fingerprinting for Authorized Use of Synthetic Talking-Head + Videos + + +
+ Modern generators render talking-head videos with impressive photorealism, +ushering in new user experiences such as videoconferencing under constrained +bandwidth budgets. Their safe adoption, however, requires a mechanism to verify +if the rendered video is trustworthy. For instance, for videoconferencing we +must identify cases in which a synthetic video portrait uses the appearance of +an individual without their consent. We term this task avatar fingerprinting. +Specifically, we learn an embedding in which the motion signatures of one +identity are grouped together, and pushed away from those of the other +identities. This allows us to link the synthetic video to the identity driving +the expressions in the video, regardless of the facial appearance shown. Avatar +fingerprinting algorithms will be critical as talking head generators become +more ubiquitous, and yet no large scale datasets exist for this new task. +Therefore, we contribute a large dataset of people delivering scripted and +improvised short monologues, accompanied by synthetic videos in which we render +videos of one person using the facial appearance of another. Project page: +https://research.nvidia.com/labs/nxp/avatar-fingerprinting/. + +
+
+ comment: 13 pages, 6 figures +
+
+
+
+
+ + ♻ ☆ An Intelligent Remote Sensing Image Quality Inspection System + + +
+ Due to the inevitable presence of quality problems, remote sensing image +quality inspection is indeed an indispensable step between the acquisition and +the application of remote sensing images. However, traditional manual +inspection suffers from low efficiency. Hence, we propose a novel deep +learning-based two-step intelligent system consisting of multiple advanced +computer vision models, which first performs image classification and then +accordingly adopts the most appropriate method, such as semantic segmentation, +to localize the quality problems. Results demonstrate that the proposed method +exhibits excellent performance and efficiency, surpassing traditional methods. +Furthermore, we conduct an initial exploration of applying multimodal models to +remote sensing image quality inspection. + +
+
+
+
+
+ + ♻ ☆ Adapt Your Teacher: Improving Knowledge Distillation for Exemplar-free + Continual Learning + + +
+ In this work, we investigate exemplar-free class incremental learning (CIL) +with knowledge distillation (KD) as a regularization strategy, aiming to +prevent forgetting. KD-based methods are successfully used in CIL, but they +often struggle to regularize the model without access to exemplars of the +training data from previous tasks. Our analysis reveals that this issue +originates from substantial representation shifts in the teacher network when +dealing with out-of-distribution data. This causes large errors in the KD loss +component, leading to performance degradation in CIL models. Inspired by recent +test-time adaptation methods, we introduce Teacher Adaptation (TA), a method +that concurrently updates the teacher and the main models during incremental +training. Our method seamlessly integrates with KD-based CIL approaches and +allows for consistent enhancement of their performance across multiple +exemplar-free CIL benchmarks. + +
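+
+ A minimal sketch of a knowledge-distillation step in which the teacher is also
+exposed to the current-task batch (our test-time-adaptation-style reading of the
+abstract; the temperature, loss weighting, and normalization-statistics-only
+adaptation are assumptions, not the authors' exact Teacher Adaptation procedure).
+
+# KD step where the teacher stays in train mode so its batch-norm statistics
+# drift toward the new-task data while its weights remain frozen (PyTorch).
+import torch
+import torch.nn.functional as F
+
+def kd_step(student, teacher, x, y, optimizer, T=2.0, alpha=1.0):
+    teacher.train()                          # normalization stats adapt to new data
+    with torch.no_grad():
+        t_logits = teacher(x)
+    s_logits = student(x)
+    kd = F.kl_div(F.log_softmax(s_logits / T, dim=1),
+                  F.softmax(t_logits / T, dim=1),
+                  reduction="batchmean") * T * T
+    ce = F.cross_entropy(s_logits, y)
+    loss = ce + alpha * kd
+    optimizer.zero_grad()
+    loss.backward()
+    optimizer.step()
+    return loss.item()
+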
+
+
+
+
+ + ♻ ☆ TwinLiteNet: An Efficient and Lightweight Model for Driveable Area and + Lane Segmentation in Self-Driving Cars + + +
+ Semantic segmentation is a common task in autonomous driving to understand
+the surrounding environment. Driveable Area Segmentation and Lane Detection are
+particularly important for safe and efficient navigation on the road. However,
+original semantic segmentation models are computationally expensive and require
+high-end hardware, which is not feasible for embedded systems in autonomous
+vehicles. This paper proposes a lightweight model for driveable area and lane
+line segmentation. TwinLiteNet is designed to be computationally cheap, yet
+achieves accurate and efficient segmentation results. We evaluate TwinLiteNet
+on the BDD100K dataset and compare it with modern models. Experimental results
+show that our TwinLiteNet performs similarly to existing approaches while
+requiring significantly fewer computational resources. Specifically,
+TwinLiteNet achieves a mIoU score of 91.3% for the Drivable Area task and
+31.08% IoU for the Lane Detection task with only 0.4 million parameters, and
+achieves 415 FPS on an RTX A5000 GPU. Furthermore, TwinLiteNet can run in
+real-time on embedded devices with limited computing power, achieving 60 FPS on
+a Jetson Xavier NX, making it an ideal solution for self-driving vehicles. Code
+is available at https://github.com/chequanghuy/TwinLiteNet.
+

+
+ comment: Accepted by MAPR 2023 +
+
+
+
+
+ + ♻ ☆ Improving the matching of deformable objects by learning to detect + keypoints + + +
+ We propose a novel learned keypoint detection method to increase the number +of correct matches for the task of non-rigid image correspondence. By +leveraging true correspondences acquired by matching annotated image pairs with +a specified descriptor extractor, we train an end-to-end convolutional neural +network (CNN) to find keypoint locations that are more appropriate to the +considered descriptor. For that, we apply geometric and photometric warpings to +images to generate a supervisory signal, allowing the optimization of the +detector. Experiments demonstrate that our method enhances the Mean Matching +Accuracy of numerous descriptors when used in conjunction with our detection +method, while outperforming the state-of-the-art keypoint detectors on real +images of non-rigid objects by 20 p.p. We also apply our method on the complex +real-world task of object retrieval where our detector performs on par with the +finest keypoint detectors currently available for this task. The source code +and trained models are publicly available at +https://github.com/verlab/LearningToDetect_PRL_2023 + +
+
+ comment: This is the accepted version of the paper to appear at Pattern + Recognition Letters (PRL). The final journal version will be available at + https://doi.org/10.1016/j.patrec.2023.08.012 +
+
+
+
+
+ + ♻ ☆ A Call to Reflect on Evaluation Practices for Age Estimation: + Comparative Analysis of the State-of-the-Art and a Unified Benchmark + + +
+ Comparing different age estimation methods poses a challenge due to the +unreliability of published results stemming from inconsistencies in the +benchmarking process. Previous studies have reported continuous performance +improvements over the past decade using specialized methods; however, our +findings challenge these claims. This paper identifies two trivial, yet +persistent issues with the currently used evaluation protocol and describes how +to resolve them. We describe our evaluation protocol in detail and provide +specific examples of how the protocol should be used. We utilize the protocol +to offer an extensive comparative analysis for state-of-the-art facial age +estimation methods. Surprisingly, we find that the performance differences +between the methods are negligible compared to the effect of other factors, +such as facial alignment, facial coverage, image resolution, model +architecture, or the amount of data used for pretraining. We use the gained +insights to propose using FaRL as the backbone model and demonstrate its +efficiency. The results emphasize the importance of consistent data +preprocessing practices for reliable and meaningful comparisons. We make our +source code public at +https://github.com/paplhjak/Facial-Age-Estimation-Benchmark. + +
+
+ comment: Revised version +
+
+
+
+
+ + ♻ ☆ MFPNet: Multi-scale Feature Propagation Network For Lightweight Semantic + Segmentation + + +
+ In contrast to the abundant research focusing on large-scale models, the
+progress in lightweight semantic segmentation appears to be advancing at a
+comparatively slower pace. However, existing compact methods often suffer from
+limited feature representation capability due to the shallowness of their
+networks. In this paper, we propose a novel lightweight segmentation
+architecture, called Multi-scale Feature Propagation Network (MFPNet), to
+address the dilemma. Specifically, we design a robust Encoder-Decoder structure
+featuring symmetrical residual blocks that consist of flexible bottleneck
+residual modules (BRMs) to explore deep and rich multi-scale semantic context.
+Furthermore, benefiting from their capacity to model latent long-range
+contextual relationships, we leverage Graph Convolutional Networks (GCNs) to
+facilitate multi-scale feature propagation between the BRM blocks. When
+evaluated on benchmark datasets, our proposed approach shows superior
+segmentation results.
+

+
+ comment: 5 pages, 3 figures, 5 tables, conference
+

+
+
+
+
+ + ♻ ☆ Implicit Neural Image Stitching With Enhanced and Blended Feature + Reconstruction + + +
+ Existing frameworks for image stitching often provide visually reasonable
+stitching results. However, they suffer from blurry artifacts and disparities
+in illumination, depth level, etc. Although recent learning-based stitching
+methods relax such disparities, they sacrifice image quality, failing to
+capture high-frequency details in the stitched images. To address the problem,
+we propose a novel approach, implicit Neural Image Stitching (NIS), that
+extends arbitrary-scale super-resolution. Our method estimates Fourier
+coefficients of images for quality-enhancing warps. Then, the suggested model
+blends color mismatches and misalignment in the latent space and decodes the
+features into RGB values of stitched images. Our experiments show that our
+approach resolves the low-definition imaging of previous deep image stitching
+with favorably accelerated image enhancement. Our source code is available at
+https://github.com/minshu-kim/NIS.
+

+
+
+
+
+ + ♻ ☆ Interaction Visual Transformer for Egocentric Action Anticipation + + +
+ Human-object interaction is one of the most important visual cues, and we +propose a novel way to represent human-object interactions for egocentric +action anticipation. We propose a novel transformer variant to model +interactions by computing the change in the appearance of objects and human +hands due to the execution of the actions and use those changes to refine the +video representation. Specifically, we model interactions between hands and +objects using Spatial Cross-Attention (SCA) and further infuse contextual +information using Trajectory Cross-Attention to obtain environment-refined +interaction tokens. Using these tokens, we construct an interaction-centric +video representation for action anticipation. We term our model InAViT, which +achieves state-of-the-art action anticipation performance on the large-scale +egocentric datasets EPIC-KITCHENS-100 (EK100) and EGTEA Gaze+. InAViT outperforms +other visual transformer-based methods, including object-centric video +representations. On the EK100 evaluation server, InAViT is the top-performing +method on the public leaderboard (at the time of submission), where it +outperforms the second-best model by 3.3% on mean top-5 recall. + +
+
+ comment: Top of the public leaderboard on EK100 Action Anticipation + https://codalab.lisn.upsaclay.fr/competitions/702#results +
+
+
+
+
+ + ♻ ☆ Robot Parkour Learning + + +
+ Parkour is a grand challenge for legged locomotion that requires robots to +overcome various obstacles rapidly in complex environments. Existing methods +can generate either diverse but blind locomotion skills or vision-based but +specialized skills by using reference animal data or complex rewards. However, +autonomous parkour requires robots to learn generalizable skills that are both +vision-based and diverse to perceive and react to various scenarios. In this +work, we propose a system for learning a single end-to-end vision-based parkour +policy of diverse parkour skills using a simple reward without any reference +motion data. We develop a reinforcement learning method inspired by direct +collocation to generate parkour skills, including climbing over high obstacles, +leaping over large gaps, crawling beneath low barriers, squeezing through thin +slits, and running. We distill these skills into a single vision-based parkour +policy and transfer it to a quadrupedal robot using its egocentric depth +camera. We demonstrate that our system can empower two different low-cost +robots to autonomously select and execute appropriate parkour skills to +traverse challenging real-world environments. + +
+
+ comment: CoRL 2023 (Oral). Project website at https://robot-parkour.github.io +
+
+
+
+
+ + ♻ ☆ Reliable Joint Segmentation of Retinal Edema Lesions in OCT Images + + +
+ Joint segmentation of retinal edema lesions from OCT images must cope with +complicated pathological features, such as blurred boundaries, severe scale +differences between symptoms, and background noise interference, while keeping +the segmentation results reliable. In this +paper, we propose a novel reliable multi-scale wavelet-enhanced transformer +network, which can provide accurate segmentation results with reliability +assessment. Specifically, aiming at improving the model's ability to learn the +complex pathological features of retinal edema lesions in OCT images, we +develop a novel segmentation backbone that integrates a wavelet-enhanced +feature extractor network and a newly designed multi-scale transformer module. +Meanwhile, to make the segmentation results more reliable, a novel +uncertainty segmentation head based on subjective logic evidential theory +is introduced to generate the final segmentation results with a corresponding +overall uncertainty evaluation score map. We conduct comprehensive experiments +on the public database of AI-Challenge 2018 for retinal edema lesions +segmentation, and the results show that our proposed method achieves better +segmentation accuracy with a high degree of reliability compared to other +state-of-the-art segmentation approaches. The code will be released at: +https://github.com/LooKing9218/ReliableRESeg. + +
+
+
+
+
+ + ♻ ☆ IBL-NeRF: Image-Based Lighting Formulation of Neural Radiance Fields + + +
+ We propose IBL-NeRF, which decomposes the neural radiance fields (NeRF) of +large-scale indoor scenes into intrinsic components. Recent approaches further +decompose the baked radiance of the implicit volume into intrinsic components +such that one can partially approximate the rendering equation. However, they +are limited to representing isolated objects with a shared environment +lighting, and suffer from computational burden to aggregate rays with Monte +Carlo integration. In contrast, our prefiltered radiance field extends the +original NeRF formulation to capture the spatial variation of lighting within +the scene volume, in addition to surface properties. Specifically, the scenes +of diverse materials are decomposed into intrinsic components for rendering, +namely, albedo, roughness, surface normal, irradiance, and prefiltered +radiance. All of the components are inferred as neural images from MLP, which +can model large-scale general scenes. Especially the prefiltered radiance +effectively models the volumetric light field, and captures spatial variation +beyond a single environment light. The prefiltering aggregates rays in a set of +predefined neighborhood sizes such that we can replace the costly Monte Carlo +integration of global illumination with a simple query from a neural image. By +adopting NeRF, our approach inherits superior visual quality and multi-view +consistency for synthesized images as well as the intrinsic components. We +demonstrate the performance on scenes with complex object layouts and light +configurations, which could not be processed in any of the previous works. + +
+
+ comment: Computer Graphics Forum (Pacific Graphics 2023) +
+
+
+
+
+ + ♻ ☆ Recursive Cross-View: Use Only 2D Detectors to Achieve 3D Object + Detection without 3D Annotations + + +
+ Heavily relying on 3D annotations limits the real-world application of 3D +object detection. In this paper, we propose a method that does not demand any +3D annotation, while being able to predict fully oriented 3D bounding boxes. +Our method, called Recursive Cross-View (RCV), utilizes the three-view +principle to convert 3D detection into multiple 2D detection tasks, requiring +only a subset of 2D labels. We propose a recursive paradigm, in which instance +segmentation and 3D bounding box generation by Cross-View are implemented +recursively until convergence. Specifically, our proposed method involves the +use of a frustum for each 2D bounding box, which is then followed by the +recursive paradigm that ultimately generates a fully oriented 3D box, along +with its corresponding class and score. Note that the class and score are given by +the 2D detector. Evaluated on the SUN RGB-D and KITTI datasets, our method +outperforms existing image-based approaches. To show that our method can be +quickly applied to new tasks, we implement it in two real-world scenarios, namely +3D human detection and 3D hand detection. As a result, two new 3D annotated +datasets are obtained, which means that RCV can be viewed as a (semi-) +automatic 3D annotator. Furthermore, we deploy RCV on a depth sensor, which +achieves detection at 7 fps on a live RGB-D stream. RCV is the first 3D +detection method that yields fully oriented 3D boxes without consuming 3D +labels. + +
+
+ comment: Accepted by RA-L +
+
+
+
+
+ + ♻ ☆ 4D Panoptic Segmentation as Invariant and Equivariant Field Prediction ICCV 2023 + + +
+ In this paper, we develop rotation-equivariant neural networks for 4D +panoptic segmentation. 4D panoptic segmentation is a benchmark task for +autonomous driving that requires recognizing semantic classes and object +instances on the road based on LiDAR scans, as well as assigning temporally +consistent IDs to instances across time. We observe that the driving scenario +is symmetric to rotations on the ground plane. Therefore, rotation-equivariance +could provide better generalization and more robust feature learning. +Specifically, we review the object instance clustering strategies and restate +the centerness-based approach and the offset-based approach as the prediction +of invariant scalar fields and equivariant vector fields. Other sub-tasks are +also unified from this perspective, and different invariant and equivariant +layers are designed to facilitate their predictions. Through evaluation on the +standard 4D panoptic segmentation benchmark of SemanticKITTI, we show that our +equivariant models achieve higher accuracy with lower computational costs +compared to their non-equivariant counterparts. Moreover, our method sets the +new state-of-the-art performance and achieves 1st place on the SemanticKITTI 4D +Panoptic Segmentation leaderboard. + +
+
+ comment: 13 pages. Accepted at ICCV 2023 +
+
+
+
+
+ + ♻ ☆ MAMo: Leveraging Memory and Attention for Monocular Video Depth + Estimation ICCV 2023 + + +
+ We propose MAMo, a novel memory and attention framework for monocular video +depth estimation. MAMo can augment and improve any single-image depth +estimation network into a video depth estimation model, enabling it to take +advantage of temporal information to predict more accurate depth. In MAMo, +we augment the model with memory, which aids the depth prediction as the model +streams through the video. Specifically, the memory stores learned visual and +displacement tokens of the previous time instances. This allows the depth +network to cross-reference relevant features from the past when predicting +depth on the current frame. We introduce a novel scheme to continuously update +the memory, optimizing it to keep tokens that correspond with both the past and +the present visual information. We adopt an attention-based approach to process +memory features, where we first learn the spatio-temporal relation among the +resultant visual and displacement memory tokens using a self-attention module. +Further, the output features of self-attention are aggregated with the current +visual features through cross-attention. The cross-attended features are +finally given to a decoder to predict depth on the current frame. Through +extensive experiments on several benchmarks, including KITTI, NYU-Depth V2, and +DDAD, we show that MAMo consistently improves monocular depth estimation +networks and sets new state-of-the-art (SOTA) accuracy. Notably, our MAMo video +depth estimation provides higher accuracy with lower latency, compared to +SOTA cost-volume-based video depth models. + +
+
+ comment: Accepted at ICCV 2023 +
+
+
+
+
+ + ♻ ☆ Snipper: A Spatiotemporal Transformer for Simultaneous Multi-Person 3D + Pose Estimation Tracking and Forecasting on a Video Snippet + + +
+ Multi-person pose understanding from RGB videos involves three complex tasks: +pose estimation, tracking and motion forecasting. Intuitively, accurate +multi-person pose estimation facilitates robust tracking, and robust tracking +builds crucial history for correct motion forecasting. Most existing works +either focus on a single task or employ multi-stage approaches that solve +multiple tasks separately, which tend to make sub-optimal decisions at each +stage and fail to exploit correlations among the three tasks. In this +paper, we propose Snipper, a unified framework to perform multi-person 3D pose +estimation, tracking, and motion forecasting simultaneously in a single stage. +We propose an efficient yet powerful deformable attention mechanism to +aggregate spatiotemporal information from the video snippet. Building upon this +deformable attention, a video transformer is learned to encode the +spatiotemporal features from the multi-frame snippet and to decode informative +pose features for multi-person pose queries. Finally, these pose queries are +regressed to predict multi-person pose trajectories and future motions in a +single shot. In the experiments, we show the effectiveness of Snipper on three +challenging public datasets, where our generic model rivals specialized +state-of-the-art baselines for pose estimation, tracking, and forecasting. + +
+
+
+
+
+ + ♻ ☆ Event-based Stereo Visual Odometry with Native Temporal Resolution via + Continuous-time Gaussian Process Regression + + +
+ Event-based cameras asynchronously capture individual visual changes in a +scene. This makes them more robust than traditional frame-based cameras to +highly dynamic motions and poor illumination. It also means that every +measurement in a scene can occur at a unique time. + Handling these different measurement times is a major challenge of using +event-based cameras. It is often addressed in visual odometry (VO) pipelines by +approximating temporally close measurements as occurring at one common time. +This grouping simplifies the estimation problem but, absent additional sensors, +sacrifices the inherent temporal resolution of event-based cameras. + This paper instead presents a complete stereo VO pipeline that estimates +directly with individual event-measurement times without requiring any grouping +or approximation in the estimation state. It uses continuous-time trajectory +estimation to maintain the temporal fidelity and asynchronous nature of +event-based cameras through Gaussian process regression with a physically +motivated prior. Its performance is evaluated on the MVSEC dataset, where it +achieves 7.9e-3 and 5.9e-3 RMS relative error on two independent sequences, +outperforming the existing publicly available event-based stereo VO pipeline by +two and four times, respectively. + +
+
+ comment: 8 pages, 4 figures. DOI: 10.1109/LRA.2023.3311374 +
+
+
+
+
+ + ♻ ☆ Confidence Intervals for Error Rates in 1:1 Matching Tasks: Critical + Statistical Analysis and Recommendations + + +
+ Matching algorithms are commonly used to predict matches between items in a +collection. For example, in 1:1 face verification, a matching algorithm +predicts whether two face images depict the same person. Accurately assessing +the uncertainty of the error rates of such algorithms can be challenging when +data are dependent and error rates are low, two aspects that have often been +overlooked in the literature. In this work, we review methods for constructing +confidence intervals for error rates in 1:1 matching tasks. We derive and +examine the statistical properties of these methods, demonstrating how coverage +and interval width vary with sample size, error rates, and degree of data +dependence, through both analysis and experiments on synthetic and real-world +datasets. Based on our findings, we provide recommendations for best practices +for constructing confidence intervals for error rates in 1:1 matching tasks. + +
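+ As a minimal illustration of one way to respect data dependence when building such intervals (a generic sketch, not one of the specific methods analyzed in the paper; the function name, the identity-level resampling unit and the toy data are assumptions), a subject-level bootstrap for an error rate could look like:
+ import numpy as np
+
+ def bootstrap_error_rate_ci(errors_by_subject, n_boot=2000, alpha=0.05, seed=0):
+     # errors_by_subject: list of 1D 0/1 arrays, one per identity, so that whole
+     # identities are resampled and within-identity dependence is preserved.
+     rng = np.random.default_rng(seed)
+     n = len(errors_by_subject)
+     stats = []
+     for _ in range(n_boot):
+         idx = rng.integers(0, n, size=n)  # resample identities with replacement
+         stats.append(np.concatenate([errors_by_subject[i] for i in idx]).mean())
+     lo, hi = np.quantile(stats, [alpha / 2, 1 - alpha / 2])
+     point = np.concatenate(errors_by_subject).mean()
+     return point, (lo, hi)
+
+ # toy usage with synthetic 0/1 error indicators
+ subjects = [np.random.binomial(1, 0.01, size=50) for _ in range(30)]
+ point, (low, high) = bootstrap_error_rate_ci(subjects)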
+
+ comment: 32 pages, 8 figures +
+
+
+
+
+ + ♻ ☆ LaCViT: A Label-aware Contrastive Training Framework for Vision + Transformers + + +
+ Vision Transformers have been incredibly effective when tackling computer +vision tasks due to their ability to model long feature dependencies. By using +large-scale training data and various self-supervised signals (e.g., masked +random patches), vision transformers provide state-of-the-art performance on +several benchmarking datasets, such as ImageNet-1k and CIFAR-10. However, these +vision transformers pretrained over general large-scale image corpora could +only produce an anisotropic representation space, limiting their +generalizability and transferability to the target downstream tasks. In this +paper, we propose a simple and effective Label-aware Contrastive Training +framework LaCViT, which improves the isotropy of the pretrained representation +space for vision transformers, thereby enabling more effective transfer +learning amongst a wide range of image classification tasks. Through +experimentation over five standard image classification datasets, we +demonstrate that LaCViT-trained models outperform the original pretrained +baselines by around 9% absolute Accuracy@1, and consistent improvements can be +observed when applying LaCViT to our three evaluated vision transformers. + +
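+ The abstract does not spell out the exact form of the label-aware contrastive objective, but a generic supervised contrastive loss of the kind such a framework could build on (all names and the temperature value below are illustrative, not taken from LaCViT) can be sketched as:
+ import torch
+ import torch.nn.functional as F
+
+ def label_aware_contrastive_loss(features, labels, temperature=0.1):
+     # embeddings sharing a class label are pulled together, others pushed apart
+     z = F.normalize(features, dim=1)                 # (N, D) unit-norm embeddings
+     sim = z @ z.t() / temperature                    # pairwise similarities
+     self_mask = torch.eye(z.size(0), dtype=torch.bool, device=z.device)
+     sim = sim.masked_fill(self_mask, float("-inf"))  # exclude self-pairs
+     log_prob = sim - torch.logsumexp(sim, dim=1, keepdim=True)
+     pos = (labels.unsqueeze(0) == labels.unsqueeze(1)) & ~self_mask
+     per_anchor = -(log_prob * pos).sum(1) / pos.sum(1).clamp(min=1)
+     return per_anchor[pos.sum(1) > 0].mean()         # anchors with >= 1 positive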
+
+
+
+
+ + ♻ ☆ Exploring the Versatility of Zero-Shot CLIP for Interstitial Lung + Disease Classification + + +
+ Interstitial lung diseases (ILD) present diagnostic challenges due to their +varied manifestations and overlapping imaging features. To address this, we +propose a machine learning approach that utilizes CLIP, a multimodal (image and +text) self-supervised model, for ILD classification. We extensively integrate +zero-shot CLIP throughout our workflow, starting from the initial extraction of +image patches from volumetric CT scans and proceeding to ILD classification +using "patch montages". Furthermore, we investigate how domain adaptive +pretraining (DAPT) CLIP with task-specific images (CT "patch montages" +extracted with ILD-specific prompts for CLIP) and/or text (lung-specific +sections of radiology reports) affects downstream ILD classification +performance. By leveraging CLIP-extracted "patch montages" and DAPT, we achieve +strong zero-shot ILD classification results, including an AUROC of 0.893, +without the need for any labeled training data. This work highlights the +versatility and potential of multimodal models like CLIP for medical image +classification tasks where labeled data is scarce. + +
+
+ comment: 11 pages, 11 figures +
+
+
+
+
+ + ♻ ☆ Project Aria: A New Tool for Egocentric Multi-Modal AI Research + + +
+ Egocentric, multi-modal data as available on future augmented reality (AR) +devices provides unique challenges and opportunities for machine perception. +These future devices will need to be all-day wearable in a socially acceptable +form-factor to support always available, context-aware and personalized AI +applications. Our team at Meta Reality Labs Research built the Aria device, an +egocentric, multi-modal data recording and streaming device with the goal to +foster and accelerate research in this area. In this paper, we describe the +Aria device hardware including its sensor configuration and the corresponding +software tools that enable recording and processing of such data. + +
+
+
+
+
+ + ♻ ☆ Elucidating the Exposure Bias in Diffusion Models + + +
+ Diffusion models have demonstrated impressive generative capabilities, but +their 'exposure bias' problem, described as the input mismatch between training +and sampling, lacks in-depth exploration. In this paper, we systematically +investigate the exposure bias problem in diffusion models by first analytically +modelling the sampling distribution, based on which we then identify the +prediction error at each sampling step as the root cause of the exposure bias +issue. Furthermore, we discuss potential solutions to this issue and propose an +intuitive metric for it. Along with the elucidation of exposure bias, we +propose a simple, yet effective, training-free method called Epsilon Scaling to +alleviate the exposure bias. We show that Epsilon Scaling explicitly moves the +sampling trajectory closer to the vector field learned in the training phase by +scaling down the network output (Epsilon), mitigating the input mismatch +between training and sampling. Experiments on various diffusion frameworks +(ADM, DDPM/DDIM, EDM, LDM), unconditional and conditional settings, and +deterministic vs. stochastic sampling verify the effectiveness of our method. +The code is available at https://github.com/forever208/ADM-ES; +https://github.com/forever208/EDM-ES + +
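+ The scaling step described above is simple enough to sketch. Below is a hypothetical deterministic DDIM-style update in which the predicted noise is divided by a constant lam > 1 before being used, in the spirit of Epsilon Scaling (the function name, the constant value and the schedule-free choice of lam are assumptions, not the released code):
+ import torch
+
+ def ddim_step_epsilon_scaled(x_t, eps_pred, alpha_bar_t, alpha_bar_prev, lam=1.005):
+     eps = eps_pred / lam                          # scale down the network output
+     x0_hat = (x_t - (1 - alpha_bar_t) ** 0.5 * eps) / alpha_bar_t ** 0.5
+     return alpha_bar_prev ** 0.5 * x0_hat + (1 - alpha_bar_prev) ** 0.5 * eps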
+
+ comment: 8 pages +
+
+
+
+
+ + ♻ ☆ MultiWay-Adapter: Adapting large-scale multi-modal models for scalable + image-text retrieval + + +
+ As the size of Large Multi-Modal Models (LMMs) increases consistently, the +adaptation of these pre-trained models to specialized tasks has become a +computationally and memory-intensive challenge. Traditional fine-tuning methods +require isolated, exhaustive retuning for each new task, limiting the models' +versatility. Moreover, current efficient adaptation techniques often overlook +modality alignment, focusing only on the knowledge extraction of new tasks. To +tackle these issues, we introduce Multiway-Adapter, an innovative framework +incorporating an 'Alignment Enhancer' to deepen modality alignment, enabling +high transferability without tuning pre-trained parameters. Our method adds +fewer than 1.25\% of additional parameters to LMMs, exemplified by the BEiT-3 +model in our study. This leads to superior zero-shot image-text retrieval +performance compared to fully fine-tuned models, while achieving up to a 57\% +reduction in fine-tuning time. Our approach offers a resource-efficient and +effective adaptation pathway for LMMs, broadening their applicability. The +source code is publicly available at: +\url{https://github.com/longkukuhi/MultiWay-Adapter}. + +
+
+
+
+
+ + ♻ ☆ Neural Étendue Expander for Ultra-Wide-Angle High-Fidelity + Holographic Display + + +
+ Holographic displays can generate light fields by dynamically modulating the +wavefront of a coherent beam of light using a spatial light modulator, +promising rich virtual and augmented reality applications. However, the limited +spatial resolution of existing dynamic spatial light modulators imposes a tight +bound on the diffraction angle. As a result, modern holographic displays +possess low étendue, which is the product of the display area and the +maximum solid angle of diffracted light. The low étendue forces a sacrifice +of either the field-of-view (FOV) or the display size. In this work, we lift +this limitation by presenting neural étendue expanders. This new breed of +optical elements, which is learned from a natural image dataset, enables higher +diffraction angles for ultra-wide FOV while maintaining both a compact form +factor and the fidelity of displayed contents to human viewers. With neural +étendue expanders, we experimentally achieve 64$\times$ étendue +expansion of natural images in full color, expanding the FOV by an order of +magnitude horizontally and vertically, with high-fidelity reconstruction +quality (measured in PSNR) over 29 dB on retinal-resolution images. + +
+
+
+
+
+ + ♻ ☆ Radial Prediction Domain Adaption Classifier for the MIDOG 2022 + Challenge + + +
+ This paper describes our contribution to the MIDOG 2022 challenge for +detecting mitotic cells. One of the major problems to be addressed in the MIDOG +2022 challenge is the robustness under the natural variance that appears for +real-life data in the histopathology field. To address the problem, we use an +adapted YOLOv5s model for object detection in conjunction with a new Domain +Adaption Classifier (DAC) variant, the Radial-Prediction-DAC, to achieve +robustness under domain shifts. In addition, we increase the variability of the +available training data using stain augmentation in HED color space. Using the +suggested method, we obtain a test set F1-score of 0.6658. + +
+
+ comment: Contribution to the MIDOG-2022-Challenge +
+
+
+
+
+ + ♻ ☆ Few-View Object Reconstruction with Unknown Categories and Camera Poses + + +
+ While object reconstruction has made great strides in recent years, current +methods typically require densely captured images and/or known camera poses, +and generalize poorly to novel object categories. To step toward object +reconstruction in the wild, this work explores reconstructing general +real-world objects from a few images without known camera poses or object +categories. The crux of our work is solving two fundamental 3D vision problems +-- shape reconstruction and pose estimation -- in a unified approach. Our +approach captures the synergies of these two problems: reliable camera pose +estimation gives rise to accurate shape reconstruction, and the accurate +reconstruction, in turn, induces robust correspondence between different views +and facilitates pose estimation. Our method FORGE predicts 3D features from +each view and leverages them in conjunction with the input images to establish +cross-view correspondence for estimating relative camera poses. The 3D features +are then transformed by the estimated poses into a shared space and are fused +into a neural radiance field. The reconstruction results are rendered by volume +rendering techniques, enabling us to train the model without 3D shape +ground-truth. Our experiments show that FORGE reliably reconstructs objects +from five views. Our pose estimation method outperforms existing ones by a +large margin. The reconstruction results under predicted poses are comparable +to the ones using ground-truth poses. The performance on novel testing +categories matches the results on categories seen during training. Project +page: https://ut-austin-rpl.github.io/FORGE/ + +
+
+
+
+
+ + ♻ ☆ Transformer-based model for monocular visual odometry: a video + understanding approach + + +
+ Estimating the camera's pose given images from a single camera is a traditional +task in mobile robots and autonomous vehicles. This problem is called monocular +visual odometry, and it often relies on geometric approaches that require +considerable engineering effort for a specific scenario. Deep learning methods +have been shown to generalize well after proper training and with a large amount of +available data. Transformer-based architectures have dominated the +state-of-the-art in natural language processing and computer vision tasks, such +as image and video understanding. In this work, we treat monocular +visual odometry as a video understanding task to estimate the camera's 6-DoF +pose. We contribute by presenting the TSformer-VO model based on +spatio-temporal self-attention mechanisms to extract features from clips and +estimate the motions in an end-to-end manner. Our approach achieved competitive +state-of-the-art performance compared with geometry-based and deep +learning-based methods on the KITTI visual odometry dataset, outperforming the +DeepVO implementation that is widely accepted in the visual odometry community. + +
+
+
+
+
+ + ♻ ☆ Findings of Factify 2: Multimodal Fake News Detection AAAI 2023 + + +
+ With social media usage growing exponentially in the past few years, fake +news has also become extremely prevalent. The detrimental impact of fake news +emphasizes the need for research focused on automating the detection of false +information and verifying its accuracy. In this work, we present the outcome of +the Factify 2 shared task, which provides a multi-modal fact verification and +satire news dataset, as part of the DeFactify 2 workshop at AAAI'23. The data +calls for a comparison based approach to the task by pairing social media +claims with supporting documents, with both text and image, divided into 5 +classes based on multi-modal relations. In the second iteration of this task we +had over 60 participants and 9 final test-set submissions. The best +performances came from the use of DeBERTa for text and Swinv2 and CLIP for +image. The highest F1 score averaged for all five classes was 81.82%. + +
+
+ comment: Defactify2 @AAAI 2023 +
+
+
+
+
+ + ♻ ☆ Unlocking Fine-Grained Details with Wavelet-based High-Frequency + Enhancement in Transformers MICCAI 2023 + + +
+ Medical image segmentation is a critical task that plays a vital role in +diagnosis, treatment planning, and disease monitoring. Accurate segmentation of +anatomical structures and abnormalities from medical images can aid in the +early detection and treatment of various diseases. In this paper, we address +the local feature deficiency of the Transformer model by carefully re-designing +the self-attention map to produce accurate dense prediction in medical images. +To this end, we first apply the wavelet transformation to decompose the input +feature map into low-frequency (LF) and high-frequency (HF) subbands. The LF +segment is associated with coarse-grained features while the HF components +preserve fine-grained features such as texture and edge information. Next, we +reformulate the self-attention operation using the efficient Transformer to +perform both spatial and context attention on top of the frequency +representation. Furthermore, to intensify the importance of the boundary +information, we impose an additional attention map by creating a Gaussian +pyramid on top of the HF components. Moreover, we propose a multi-scale context +enhancement block within skip connections to adaptively model inter-scale +dependencies to overcome the semantic gap among stages of the encoder and +decoder modules. Through comprehensive experiments, we demonstrate the +effectiveness of our strategy on multi-organ and skin lesion segmentation +benchmarks. The implementation code will be available upon acceptance. +\href{https://github.com/mindflow-institue/WaveFormer}{GitHub}. + +
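+ The low/high-frequency split at the heart of this design can be pictured with a single-level 2D Haar transform of a feature map (a generic sketch assuming even spatial dimensions; it is not the paper's wavelet module):
+ import torch
+
+ def haar_dwt2d(x):
+     # x: (B, C, H, W) with even H and W; returns the coarse (LF) subband and the
+     # three detail (HF) subbands of an orthonormal single-level Haar transform
+     a = x[..., 0::2, 0::2]
+     b = x[..., 0::2, 1::2]
+     c = x[..., 1::2, 0::2]
+     d = x[..., 1::2, 1::2]
+     ll = (a + b + c + d) / 2.0   # low-frequency: coarse content
+     lh = (a + b - c - d) / 2.0   # high-frequency detail subbands
+     hl = (a - b + c - d) / 2.0
+     hh = (a - b - c + d) / 2.0
+     return ll, (lh, hl, hh)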
+
+ comment: Accepted in MICCAI 2023 workshop MLMI +
+
+
+
+
+ + ♻ ☆ Super-Resolution Surface Reconstruction from Few Low-Resolution Slices + + +
+ In many imaging applications where segmented features (e.g. blood vessels) +are further used for other numerical simulations (e.g. finite element +analysis), the obtained surfaces do not have fine resolutions suitable for the +task. Increasing the resolution of such surfaces becomes crucial. This paper +proposes a new variational model for solving this problem, based on an +Euler-Elastica-based regulariser. Further, we propose and implement two +numerical algorithms for solving the model, a projected gradient descent method +and the alternating direction method of multipliers. Numerical experiments +on real-life examples (including two obtained from the outputs of another variational +model) illustrate the effectiveness of the approach. The advantages of the new model +are shown through quantitative comparisons of the standard deviation of +Gaussian and mean curvatures from the viewpoint of discrete +geometry. + +
+
+ comment: 33 pages, 25 figures +
+
+
+
+
+ + ♻ ☆ Mask2Anomaly: Mask Transformer for Universal Open-set Segmentation + + +
+ Segmenting unknown or anomalous object instances is a critical task in +autonomous driving applications, and it is approached traditionally as a +per-pixel classification problem. However, reasoning individually about each +pixel without considering its contextual semantics results in high +uncertainty around the objects' boundaries and numerous false positives. We +propose a paradigm change by shifting from a per-pixel classification to a mask +classification. Our mask-based method, Mask2Anomaly, demonstrates the +feasibility of integrating a mask-classification architecture to jointly +address anomaly segmentation, open-set semantic segmentation, and open-set +panoptic segmentation. Mask2Anomaly includes several technical novelties that +are designed to improve the detection of anomalies/unknown objects: i) a global +masked attention module to focus individually on the foreground and background +regions; ii) a mask contrastive learning that maximizes the margin between an +anomaly and known classes; iii) a mask refinement solution to reduce false +positives; and iv) a novel approach to mine unknown instances based on the +mask-architecture properties. By comprehensive qualitative and quantitative +evaluation, we show Mask2Anomaly achieves new state-of-the-art results across +the benchmarks of anomaly segmentation, open-set semantic segmentation, and +open-set panoptic segmentation. + +
+
+ comment: 16 pages. arXiv admin note: substantial text overlap with + arXiv:2307.13316 +
+
+
+
+
+
+
+
+ + Information Retrieval 16 + +
+
+
+ + ☆ Human Action Co-occurrence in Lifestyle Vlogs using Graph Link + Prediction + + +
+ We introduce the task of automatic human action co-occurrence identification, +i.e., determine whether two human actions can co-occur in the same interval of +time. We create and make publicly available the ACE (Action Co-occurrencE) +dataset, consisting of a large graph of ~12k co-occurring pairs of visual +actions and their corresponding video clips. We describe graph link prediction +models that leverage visual and textual information to automatically infer if +two actions are co-occurring. We show that graphs are particularly well suited +to capture relations between human actions, and the learned graph +representations are effective for our task and capture novel and relevant +information across different data domains. The ACE dataset and the code +introduced in this paper are publicly available at +https://github.com/MichiganNLP/vlog_action_co-occurrence. + +
+
+
+
+
+ + ☆ HAMUR: Hyper Adapter for Multi-Domain Recommendation CIKM'2023 + + +
+ Multi-Domain Recommendation (MDR), which leverages data from multiple domains +to enhance performance in each of them concurrently, has gained significant +attention in recent years. However, current MDR models are confronted with two limitations. +Firstly, the majority of these models adopt an approach that explicitly shares +parameters between domains, leading to mutual interference among them. +Secondly, due to the distribution differences among domains, the utilization of +static parameters in existing methods limits their flexibility to adapt to +diverse domains. To address these challenges, we propose a novel model, Hyper +Adapter for Multi-Domain Recommendation (HAMUR). Specifically, HAMUR consists +of two components: (1) a domain-specific adapter, designed as a pluggable module +that can be seamlessly integrated into various existing multi-domain backbone +models, and (2) a domain-shared hyper-network, which implicitly captures shared +information among domains and dynamically generates the parameters for the +adapter. We conduct extensive experiments on two public datasets using various +backbone networks. The experimental results validate the effectiveness and +scalability of the proposed model. + +
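+ A toy version of the domain-shared hyper-network idea (dimensions, the bottleneck adapter shape and all names below are illustrative assumptions, not HAMUR's actual architecture) could look like:
+ import torch
+ import torch.nn as nn
+
+ class DomainHyperAdapter(nn.Module):
+     # a shared hyper-network emits the weights of a small bottleneck adapter from a
+     # learned domain embedding, so no adapter parameters are shared explicitly
+     def __init__(self, n_domains, hidden_dim, bottleneck=16, domain_dim=8):
+         super().__init__()
+         self.domain_emb = nn.Embedding(n_domains, domain_dim)
+         self.hyper = nn.Linear(domain_dim, hidden_dim * bottleneck * 2)
+         self.hidden_dim, self.bottleneck = hidden_dim, bottleneck
+
+     def forward(self, h, domain_id):
+         # h: (batch, hidden_dim); domain_id: scalar LongTensor for the current domain
+         w = self.hyper(self.domain_emb(domain_id))
+         d, b = self.hidden_dim, self.bottleneck
+         w_down, w_up = w[: d * b].view(d, b), w[d * b :].view(b, d)
+         return h + torch.relu(h @ w_down) @ w_up    # residual bottleneck adapter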
+
+ comment: Accepted by CIKM'2023 +
+
+
+
+
+ + ☆ Improving and Evaluating the Detection of Fragmentation in News + Recommendations with the Clustering of News Story Chains RecSys 2023 + + +
+ News recommender systems play an increasingly influential role in shaping +information access within democratic societies. However, tailoring +recommendations to users' specific interests can result in the divergence of +information streams. Fragmented access to information poses challenges to the +integrity of the public sphere, thereby influencing democracy and public +discourse. The Fragmentation metric quantifies the degree of fragmentation of +information streams in news recommendations. Accurate measurement of this +metric requires the application of Natural Language Processing (NLP) to +identify distinct news events, stories, or timelines. This paper presents an +extensive investigation of various approaches for quantifying Fragmentation in +news recommendations. These approaches are evaluated both intrinsically, by +measuring performance on news story clustering, and extrinsically, by assessing +the Fragmentation scores of different simulated news recommender scenarios. Our +findings demonstrate that agglomerative hierarchical clustering coupled with +SentenceBERT text representation is substantially better at detecting +Fragmentation than earlier implementations. Additionally, the analysis of +simulated scenarios yields valuable insights and recommendations for +stakeholders concerning the measurement and interpretation of Fragmentation. + +
+
+ comment: Cite published version: Polimeno et. al., Improving and Evaluating + the Detection of Fragmentation in News Recommendations with the Clustering of + News Story Chains, NORMalize 2023: The First Workshop on the Normative Design + and Evaluation of Recommender Systems, September 19, 2023, co-located with + the ACM Conference on Recommender Systems 2023 (RecSys 2023), Singapore +
+
+
+
+
+ + ☆ AKEM: Aligning Knowledge Base to Queries with Ensemble Model for Entity + Recognition and Linking + + +
+ This paper presents a novel approach to address the Entity Recognition and +Linking Challenge at NLPCC 2015. The task involves extracting named entity +mentions from short search queries and linking them to entities within a +reference Chinese knowledge base. To tackle this problem, we first expand the +existing knowledge base and utilize external knowledge to identify candidate +entities, thereby improving the recall rate. Next, we extract features from the +candidate entities and utilize Support Vector Regression and Multiple Additive +Regression Tree as scoring functions to filter the results. Additionally, we +apply rules to further refine the results and enhance precision. Our method is +computationally efficient and achieves an F1 score of 0.535. + +
+
+
+
+
+ + ☆ Annotating Data for Fine-Tuning a Neural Ranker? Current Active Learning + Strategies are not Better than Random Selection SIGIR + + +
+ Search methods based on Pretrained Language Models (PLM) have demonstrated +great effectiveness gains compared to statistical and early neural ranking +models. However, fine-tuning PLM-based rankers requires a great amount of +annotated training data. Annotating data involves a large manual effort and +thus is expensive, especially in domain specific tasks. In this paper we +investigate fine-tuning PLM-based rankers under limited training data and +budget. We investigate two scenarios: fine-tuning a ranker from scratch, and +domain adaptation starting with a ranker already fine-tuned on general data, +and continuing fine-tuning on a target dataset. We observe a great variability +in effectiveness when fine-tuning on different randomly selected subsets of +training data. This suggests that it is possible to achieve effectiveness gains +by actively selecting a subset of the training data that has the most positive +effect on the rankers. This way, it would be possible to fine-tune effective +PLM rankers at a reduced annotation budget. To investigate this, we adapt +existing Active Learning (AL) strategies to the task of fine-tuning PLM rankers +and investigate their effectiveness, also considering annotation and +computational costs. Our extensive analysis shows that AL strategies do not +significantly outperform random selection of training subsets in terms of +effectiveness. We further find that gains provided by AL strategies come at the +expense of more assessments (thus higher annotation costs) and AL strategies +underperform random selection when comparing effectiveness given a fixed +annotation cost. Our results highlight that ``optimal'' subsets of training +data that provide high effectiveness at low annotation cost do exist, but +current mainstream AL strategies applied to PLM rankers are not capable of +identifying them. + +
+
+ comment: Accepted at SIGIR-AP 2023 +
+
+
+
+
+ + ☆ Characterizing Latent Perspectives of Media Houses Towards Public + Figures + + +
+ Media houses reporting on public figures often come with their own biases +stemming from their respective worldviews. A characterization of these +underlying patterns helps us in better understanding and interpreting news +stories. For this, we need diverse or subjective summarizations, which may not +be amenable to classification into predefined class labels. This work proposes a +zero-shot approach for non-extractive or generative characterizations of person +entities from a corpus using GPT-2. We use well-articulated articles from +several well-known news media houses as a corpus to build a sound argument for +this approach. First, we fine-tune a GPT-2 pre-trained language model with a +corpus where specific person entities are characterized. Second, we further +fine-tune this with demonstrations of person entity characterizations, created +from a corpus of programmatically constructed characterizations. This twice +fine-tuned model is primed with manual prompts consisting of entity names that +were not previously encountered in the second fine-tuning, to generate a simple +sentence about the entity. The results were encouraging when compared against +actual characterizations from the corpus. + +
+
+
+
+
+ + ☆ Evaluating the Ebb and Flow: An In-depth Analysis of Question-Answering + Trends across Diverse Platforms + + +
+ Community Question Answering (CQA) platforms steadily gain popularity as they +provide users with fast responses to their queries. The swiftness of these +responses is contingent on a mixture of query-specific and user-related +elements. This paper scrutinizes these contributing factors within the context +of six highly popular CQA platforms, identified through their standout +answering speed. Our investigation reveals a correlation between the time taken +to yield the first response to a question and several variables: the metadata, +the formulation of the questions, and the level of interaction among users. +Additionally, by employing conventional machine learning models to analyze +these metadata and patterns of user interaction, we endeavor to predict which +queries will receive their initial responses promptly. + +
+
+ comment: Under review +
+
+
+
+
+ + ☆ GLAD: Content-aware Dynamic Graphs For Log Anomaly Detection + + +
+ Logs play a crucial role in system monitoring and debugging by recording +valuable system information, including events and states. Although various +methods have been proposed to detect anomalies in log sequences, they often +overlook the significance of considering relations among system components, +such as services and users, which can be identified from log contents. +Understanding these relations is vital for detecting anomalies and their +underlying causes. To address this issue, we introduce GLAD, a Graph-based Log +Anomaly Detection framework designed to detect relational anomalies in system +logs. GLAD incorporates log semantics, relational patterns, and sequential +patterns into a unified framework for anomaly detection. Specifically, GLAD +first introduces a field extraction module that utilizes prompt-based few-shot +learning to identify essential fields from log contents. Then GLAD constructs +dynamic log graphs for sliding windows by interconnecting extracted fields and +log events parsed from the log parser. These graphs represent events and fields +as nodes and their relations as edges. Subsequently, GLAD utilizes a +temporal-attentive graph edge anomaly detection model for identifying anomalous +relations in these dynamic log graphs. This model employs a Graph Neural +Network (GNN)-based encoder enhanced with transformers to capture content, +structural and temporal features. We evaluate our proposed method on three +datasets, and the results demonstrate the effectiveness of GLAD in detecting +anomalies indicated by varying relational patterns. + +
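+ The dynamic log graph construction described above can be pictured with a small sketch (the log schema, field names and co-occurrence weighting below are invented for illustration; GLAD's actual graphs also carry semantic and temporal features):
+ import networkx as nx
+
+ def build_window_graph(parsed_logs):
+     # parsed_logs: list of dicts such as
+     # {"event": "LoginFailed", "fields": {"user": "alice", "service": "auth"}}
+     g = nx.Graph()
+     for entry in parsed_logs:
+         ev = ("event", entry["event"])
+         g.add_node(ev, kind="event")
+         for name, value in entry["fields"].items():
+             fv = (name, value)
+             g.add_node(fv, kind="field")
+             # edge weight counts event/field co-occurrences inside the window
+             prev = g.get_edge_data(ev, fv, default={"weight": 0})["weight"]
+             g.add_edge(ev, fv, weight=prev + 1)
+     return g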
+
+ comment: Accepted by ICKG 2023 +
+
+
+
+
+ + ☆ A Survey of Hallucination in Large Foundation Models + + +
+ Hallucination in a foundation model (FM) refers to the generation of content +that strays from factual reality or includes fabricated information. This +survey paper provides an extensive overview of recent efforts that aim to +identify, elucidate, and tackle the problem of hallucination, with a particular +focus on ``Large'' Foundation Models (LFMs). The paper classifies various types +of hallucination phenomena that are specific to LFMs and establishes evaluation +criteria for assessing the extent of hallucination. It also examines existing +strategies for mitigating hallucination in LFMs and discusses potential +directions for future research in this area. Essentially, the paper offers a +comprehensive examination of the challenges and solutions related to +hallucination in LFMs. + +
+
+
+
+
+ + ☆ SAGE: Structured Attribute Value Generation for Billion-Scale Product + Catalogs + + +
+ We introduce SAGE, a generative LLM for inferring attribute values for +products across world-wide e-Commerce catalogs. We introduce a novel +formulation of the attribute-value prediction problem as a Seq2Seq +summarization task, across languages, product types and target attributes. Our +novel modeling approach lifts the restriction of predicting attribute values +within a pre-specified set of choices, as well as the requirement that the +sought attribute values need to be explicitly mentioned in the text. SAGE can +infer attribute values even when such values are mentioned implicitly using +periphrastic language, or not at all, as is the case for common-sense defaults. +Additionally, SAGE is capable of predicting whether an attribute is +inapplicable for the product at hand, or non-obtainable from the available +information. SAGE is the first method able to tackle all aspects of the +attribute-value-prediction task as they arise in practical settings in +e-Commerce catalogs. A comprehensive set of experiments demonstrates the +effectiveness of the proposed approach, as well as its superiority against +state-of-the-art competing alternatives. Moreover, our experiments highlight +SAGE's ability to tackle the task of predicting attribute values in a zero-shot +setting, thereby opening up opportunities for significantly reducing the +overall number of labeled examples required for training. + +
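+ A minimal way to picture the Seq2Seq formulation of attribute-value prediction (using an off-the-shelf T5 checkpoint and an invented prompt format as stand-ins; this is not the SAGE model or its prompts) is:
+ from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
+
+ tok = AutoTokenizer.from_pretrained("t5-small")
+ model = AutoModelForSeq2SeqLM.from_pretrained("t5-small")
+
+ product = "Stainless steel water bottle, keeps drinks cold for 24 hours, 750 ml"
+ prompt = f"extract attribute value | attribute: capacity | product: {product}"
+
+ ids = tok(prompt, return_tensors="pt").input_ids
+ out = model.generate(ids, max_new_tokens=8)
+ print(tok.decode(out[0], skip_special_tokens=True))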
+
+ comment: (17 pages) +
+
+
+
+
+ + ☆ Distributionally-Informed Recommender System Evaluation + + +
+ Current practice for evaluating recommender systems typically focuses on +point estimates of user-oriented effectiveness metrics or business metrics, +sometimes combined with additional metrics for considerations such as diversity +and novelty. In this paper, we argue for the need for researchers and +practitioners to attend more closely to various distributions that arise from a +recommender system (or other information access system) and the sources of +uncertainty that lead to these distributions. One immediate implication of our +argument is that both researchers and practitioners must report and examine +more thoroughly the distribution of utility between and within different +stakeholder groups. However, distributions of various forms arise in many more +aspects of the recommender systems experimental process, and distributional +thinking has substantial ramifications for how we design, evaluate, and present +recommender systems evaluation and research results. Leveraging and emphasizing +distributions in the evaluation of recommender systems is a necessary step to +ensure that the systems provide appropriate and equitably-distributed benefit +to the people they affect. + +
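+ As a small, hypothetical example of the kind of distributional reporting argued for above (column names and numbers are invented), one could summarize the per-group distribution of a utility metric instead of a single mean:
+ import pandas as pd
+
+ df = pd.DataFrame({
+     "user_group": ["A", "A", "A", "B", "B", "B"],
+     "ndcg":       [0.61, 0.42, 0.55, 0.23, 0.35, 0.71],
+ })
+ summary = df.groupby("user_group")["ndcg"].describe(percentiles=[0.1, 0.5, 0.9])
+ print(summary[["mean", "std", "10%", "50%", "90%"]])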
+
+ comment: Accepted to ACM Transactions on Recommender Systems +
+
+
+
+
+ + ☆ Hierarchical Multi-Task Learning Framework for Session-based + Recommendations RecSys 2023 + + +
+ While session-based recommender systems (SBRSs) have shown superior +recommendation performance, multi-task learning (MTL) has been adopted by SBRSs +to enhance their prediction accuracy and generalizability further. Hierarchical +MTL (H-MTL) sets a hierarchical structure between prediction tasks and feeds +outputs from auxiliary tasks to main tasks. This hierarchy leads to richer +input features for main tasks and higher interpretability of predictions, +compared to existing MTL frameworks. However, the H-MTL framework has not been +investigated in SBRSs yet. In this paper, we propose HierSRec which +incorporates the H-MTL architecture into SBRSs. HierSRec encodes a given +session with a metadata-aware Transformer and performs next-category prediction +(i.e., auxiliary task) with the session encoding. Next, HierSRec conducts +next-item prediction (i.e., main task) with the category prediction result and +session encoding. For scalable inference, HierSRec creates a compact set of +candidate items (e.g., 4% of total items) per test example using the category +prediction. Experiments show that HierSRec outperforms existing SBRSs as per +next-item prediction accuracy on two session-based recommendation datasets. The +accuracy of HierSRec measured with the carefully-curated candidate items aligns +with the accuracy of HierSRec calculated with all items, which validates the +usefulness of our candidate generation scheme via H-MTL. + +
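+ The hierarchy between the auxiliary and main task can be sketched with a toy pair of heads in which the next-category prediction is fed into the next-item head (dimensions and names are illustrative, not HierSRec's implementation):
+ import torch
+ import torch.nn as nn
+
+ class HierarchicalHeads(nn.Module):
+     def __init__(self, enc_dim, n_categories, n_items):
+         super().__init__()
+         self.cat_head = nn.Linear(enc_dim, n_categories)              # auxiliary task
+         self.item_head = nn.Linear(enc_dim + n_categories, n_items)   # main task
+
+     def forward(self, session_enc):
+         cat_logits = self.cat_head(session_enc)
+         cat_probs = torch.softmax(cat_logits, dim=-1)
+         item_logits = self.item_head(torch.cat([session_enc, cat_probs], dim=-1))
+         return cat_logits, item_logits   # both heads are supervised during training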
+
+ comment: Accepted at the 6th Workshop on Online Recommender Systems and User + Modeling @ ACM RecSys 2023 +
+
+
+
+
+ + ♻ ☆ A Review of Modern Fashion Recommender Systems + + +
+ The textile and apparel industries have grown tremendously over the last few +years. Customers no longer have to visit many stores, stand in long queues, or +try on garments in dressing rooms as millions of products are now available in +online catalogs. However, given the plethora of options available, an effective +recommendation system is necessary to properly sort, order, and communicate +relevant product material or information to users. Effective fashion RS can +have a noticeable impact on billions of customers' shopping experiences and +increase sales and revenues on the provider side. The goal of this survey is to +provide a review of recommender systems that operate in the specific vertical +domain of garment and fashion products. We have identified the most pressing +challenges in fashion RS research and created a taxonomy that categorizes the +literature according to the objective they are trying to accomplish (e.g., item +or outfit recommendation, size recommendation, explainability, among others) +and type of side-information (users, items, context). We have also identified +the most important evaluation goals and perspectives (outfit generation, outfit +recommendation, pairing recommendation, and fill-in-the-blank outfit +compatibility prediction) and the most commonly used datasets and evaluation +metrics. + +
+
+ comment: 38 pages, 2 figures +
+
+
+
+
+ + ♻ ☆ EulerNet: Adaptive Feature Interaction Learning via Euler's Formula for + CTR Prediction SIGIR'23 + + +
+ Learning effective high-order feature interactions is crucial in the CTR +prediction task. However, it is very time-consuming to calculate high-order +feature interactions with massive features in online e-commerce platforms. Most +existing methods manually design a maximal order and further filter out the +useless interactions from them. Although they reduce the high computational +costs caused by the exponential growth of high-order feature combinations, they +still suffer from the degradation of model capability due to the suboptimal +learning of the restricted feature orders. Maintaining model +capability while keeping it efficient is a technical challenge that has +not been adequately addressed. To address this issue, we propose an adaptive +feature interaction learning model, named EulerNet, in which the feature +interactions are learned in a complex vector space by conducting space mapping +according to Euler's formula. EulerNet converts the exponential powers of +feature interactions into simple linear combinations of the modulus and phase +of the complex features, making it possible to adaptively learn the high-order +feature interactions in an efficient way. Furthermore, EulerNet incorporates +the implicit and explicit feature interactions into a unified architecture, +which achieves mutual enhancement and largely boosts the model +capabilities. Such a network can be fully learned from data, with no need for a +pre-designed form or order of feature interactions. Extensive experiments +conducted on three public datasets have demonstrated the effectiveness and +efficiency of our approach. Our code is available at: +https://github.com/RUCAIBox/EulerNet. + +
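+ The Euler's-formula trick can be made concrete with a short sketch: writing each complex feature as r_j * exp(i * theta_j), a product of powers becomes linear combinations of log-moduli and phases (shapes and names below are illustrative, not EulerNet's exact layer):
+ import torch
+
+ def euler_interaction(r, theta, alpha, eps=1e-6):
+     # r, theta: (batch, num_fields) modulus and phase; alpha: (num_fields,) interaction orders
+     log_modulus = (alpha * torch.log(r.clamp_min(eps))).sum(dim=1)  # sum_j alpha_j * log r_j
+     phase = (alpha * theta).sum(dim=1)                              # sum_j alpha_j * theta_j
+     modulus = torch.exp(log_modulus)
+     return modulus * torch.cos(phase), modulus * torch.sin(phase)   # real and imaginary parts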
+
+ comment: 10 pages, 7 figures, accepted for publication in SIGIR'23 +
+
+
+
+
+ + ♻ ☆ A Diffusion model for POI recommendation + + +
+ Next Point-of-Interest (POI) recommendation is a critical task in +location-based services that aim to provide personalized suggestions for the +user's next destination. Previous works on POI recommendation have focused +on modeling the user's spatial preference. However, existing works that +leverage spatial information are only based on the aggregation of users' +previously visited positions, which discourages the model from recommending POIs +in novel areas. This trait of position-based methods will harm the model's +performance in many situations. Additionally, incorporating sequential +information into the user's spatial preference remains a challenge. In this +paper, we propose Diff-POI: a Diffusion-based model that samples the user's +spatial preference for the next POI recommendation. Inspired by the wide +application of diffusion algorithms in sampling from distributions, Diff-POI +encodes the user's visiting sequence and spatial character with two +tailor-designed graph encoding modules, followed by a diffusion-based sampling +strategy to explore the user's spatial visiting trends. We leverage the +diffusion process and its reversed form to sample from the posterior +distribution and optimize the corresponding score function. We design a joint +training and inference framework to optimize and evaluate the proposed +Diff-POI. Extensive experiments on four real-world POI recommendation datasets +demonstrate the superiority of our Diff-POI over state-of-the-art baseline +methods. Further ablation and parameter studies on Diff-POI reveal the +functionality and effectiveness of the proposed diffusion-based sampling +strategy for addressing the limitations of existing methods. + +
+
+ comment: Accepted by ACM Transactions on Information Systems (TOIS 2023) +
+
+
+
+
+ + ♻ ☆ Pre-train, Prompt and Recommendation: A Comprehensive Survey of Language + Modelling Paradigm Adaptations in Recommender Systems ACL + + +
+ The emergence of Pre-trained Language Models (PLMs) has achieved tremendous +success in the field of Natural Language Processing (NLP) by learning universal +representations on large corpora in a self-supervised manner. The pre-trained +models and the learned representations can be beneficial to a series of +downstream NLP tasks. This training paradigm has recently been adapted to the +recommendation domain and is considered a promising approach by both academia +and industry. In this paper, we systematically investigate how to extract and +transfer knowledge from pre-trained models learned by different PLM-related +training paradigms to improve recommendation performance from various +perspectives, such as generality, sparsity, efficiency and effectiveness. +Specifically, we propose a comprehensive taxonomy to divide existing PLM-based +recommender systems w.r.t. their training strategies and objectives. Then, we +analyze and summarize the connection between PLM-based training paradigms and +different input data types for recommender systems. Finally, we elaborate on +open issues and future research directions in this vibrant field. + +
+
+ comment: Accepted for publication at Transactions of the Association for + Computational Linguistics (TACL) in September 2023 +
+
+
+
+
+
+
+
+ + Machine Learning 157 + +
+
+
+ + ☆ LEAP Hand: Low-Cost, Efficient, and Anthropomorphic Hand for Robot + Learning + + +
+ Dexterous manipulation has been a long-standing challenge in robotics. While +machine learning techniques have shown some promise, results have largely been +currently limited to simulation. This can be mostly attributed to the lack of +suitable hardware. In this paper, we present LEAP Hand, a low-cost dexterous +and anthropomorphic hand for machine learning research. In contrast to previous +hands, LEAP Hand has a novel kinematic structure that allows maximal dexterity +regardless of finger pose. LEAP Hand is low-cost and can be assembled in 4 +hours at a cost of 2000 USD from readily available parts. It is capable of +consistently exerting large torques over long durations of time. We show that +LEAP Hand can be used to perform several manipulation tasks in the real world +-- from visual teleoperation to learning from passive video data and sim2real. +LEAP Hand significantly outperforms its closest competitor Allegro Hand in all +our experiments while being 1/8th of the cost. We release detailed assembly +instructions, the Sim2Real pipeline and a development platform with useful APIs +on our website at https://leap-hand.github.io/ + +
+
+ comment: Website at https://leap-hand.github.io/ +
+
+
+
+
+ + ☆ Unveiling the potential of large language models in generating semantic + and cross-language clones SC + + +
+ Semantic and Cross-language code clone generation may be useful for code +reuse, code comprehension, refactoring and benchmarking. OpenAI's GPT model has +potential in such clone generation as GPT is used for text generation. When +developers copy/paste code from Stack Overflow (SO) or within a system, there +might be inconsistent changes leading to unexpected behaviours. Similarly, if +someone possesses a code snippet in a particular programming language but seeks +equivalent functionality in a different language, a semantic cross-language +code clone generation approach could provide valuable assistance. In this study, +using SemanticCloneBench as a vehicle, we evaluated how well the GPT-3 model +could help generate semantic and cross-language clone variants for a given +fragment. We compiled a diverse set of code fragments and assessed GPT-3's +performance in generating code variants. Through extensive experimentation and +analysis, in which 9 judges spent 158 hours on validation, we investigate the +model's ability to produce accurate and semantically correct variants. Our +findings shed light on GPT-3's strengths in code generation, offering insights +into the potential applications and challenges of using advanced language +models in software development. Our quantitative analysis yields compelling +results. In the realm of semantic clones, GPT-3 attains an impressive accuracy +of 62.14% and a 0.55 BLEU score, achieved through few-shot prompt engineering. +Furthermore, the model shines in transcending linguistic confines, boasting an +exceptional 91.25% accuracy in generating cross-language clones. + +
+
+ comment: Accepted in IWSC +
+
+
+
+
+ + ☆ On Computationally Efficient Learning of Exponential Family + Distributions + + +
+ We consider the classical problem of learning, with arbitrary accuracy, the natural parameters of a $k$-parameter truncated \textit{minimal} exponential family from i.i.d. samples in a computationally and statistically efficient manner. We focus on the setting where the support as well as the natural parameters are appropriately bounded. While the traditional maximum likelihood estimator for this class of exponential families is consistent, asymptotically normal, and asymptotically efficient, evaluating it is computationally hard. In this work, we propose a novel loss function and a computationally efficient estimator that is consistent as well as asymptotically normal under mild conditions. We show that, at the population level, our method can be viewed as the maximum likelihood estimation of a re-parameterized distribution belonging to the same class of exponential family. Further, we show that our estimator can be interpreted as a solution to minimizing a particular Bregman score as well as an instance of minimizing the \textit{surrogate} likelihood. We also provide finite sample guarantees to achieve an error (in $\ell_2$-norm) of $\alpha$ in the parameter estimation with sample complexity $O({\sf poly}(k)/\alpha^2)$. Our method achieves the order-optimal sample complexity of $O({\sf log}(k)/\alpha^2)$ when tailored for node-wise-sparse Markov random fields. Finally, we demonstrate the performance of our estimator via numerical experiments.
+
+ comment: An earlier version of this work arXiv:2110.15397 was presented at the + Neural Information Processing Systems Conference in December 2021 titled "A + Computationally Efficient Method for Learning Exponential Family + Distributions" +
+
+
+
+
+ + ☆ Ensemble Mask Networks + + +
+ Can an $\mathbb{R}^n\rightarrow \mathbb{R}^n$ feedforward network learn matrix-vector multiplication? This study introduces two mechanisms: flexible masking to take matrix inputs, and a unique network pruning scheme that respects the mask's dependency structure. Such networks can approximate fixed operations such as matrix-vector multiplication $\phi(A,x) \rightarrow Ax$, which motivates the introduced mechanisms, with applications towards litmus-testing dependencies or interaction order in graph-based models.
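As a rough illustration of the task posed above (not of the paper's masking or pruning mechanisms), the sketch below trains a plain feedforward network to approximate $\phi(A,x) \rightarrow Ax$ from random samples; the network size, optimizer and training schedule are arbitrary choices.

import torch
import torch.nn as nn

n = 4                             # operate on R^n
net = nn.Sequential(              # input: [vec(A), x] of size n*n + n
    nn.Linear(n * n + n, 256),
    nn.ReLU(),
    nn.Linear(256, n),            # output: estimate of A @ x
)
opt = torch.optim.Adam(net.parameters(), lr=1e-3)

for step in range(2000):
    A = torch.randn(128, n, n)
    x = torch.randn(128, n)
    target = torch.einsum("bij,bj->bi", A, x)          # exact matrix-vector product
    pred = net(torch.cat([A.flatten(1), x], dim=1))    # network's approximation
    loss = nn.functional.mse_loss(pred, target)
    opt.zero_grad()
    loss.backward()
    opt.step()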
+
+
+
+
+ + ☆ InstaFlow: One Step is Enough for High-Quality Diffusion-Based + Text-to-Image Generation + + +
+ Diffusion models have revolutionized text-to-image generation with their exceptional quality and creativity. However, their multi-step sampling process is known to be slow, often requiring tens of inference steps to obtain satisfactory results. Previous attempts to improve the sampling speed and reduce computational costs through distillation have been unsuccessful in achieving a functional one-step model. In this paper, we explore a recent method called Rectified Flow, which, thus far, has only been applied to small datasets. The core of Rectified Flow lies in its \emph{reflow} procedure, which straightens the trajectories of probability flows, refines the coupling between noises and images, and facilitates the distillation process with student models. We propose a novel text-conditioned pipeline to turn Stable Diffusion (SD) into an ultra-fast one-step model, in which we find reflow plays a critical role in improving the assignment between noise and images. Leveraging our new pipeline, we create, to the best of our knowledge, the first one-step diffusion-based text-to-image generator with SD-level image quality, achieving an FID (Frechet Inception Distance) of $23.3$ on MS COCO 2017-5k, surpassing the previous state-of-the-art technique, progressive distillation, by a significant margin ($37.2$ $\rightarrow$ $23.3$ in FID). By utilizing an expanded network with 1.7B parameters, we further improve the FID to $22.4$. We call our one-step models \emph{InstaFlow}. On MS COCO 2014-30k, InstaFlow yields an FID of $13.1$ in just $0.09$ seconds, the best in the $\leq 0.1$ second regime, outperforming the recent StyleGAN-T ($13.9$ in $0.1$ second). Notably, the training of InstaFlow only costs 199 A100 GPU days. Project page:~\url{https://github.com/gnobitab/InstaFlow}.
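As a hedged sketch of the velocity-matching idea behind the reflow procedure mentioned above (a generic Rectified Flow training step, not the paper's text-conditioned pipeline), one can pair noise and data samples and regress a velocity network onto the straight-line direction between them; the `Velocity` module and the toy tensors below are placeholders.

import torch
import torch.nn as nn

class Velocity(nn.Module):
    # toy velocity field v(x_t, t); a real model would be a (text-conditioned) U-Net
    def __init__(self, dim):
        super().__init__()
        self.net = nn.Sequential(nn.Linear(dim + 1, 64), nn.SiLU(), nn.Linear(64, dim))
    def forward(self, x, t):
        return self.net(torch.cat([x, t], dim=1))

def reflow_step(v_net, z0, z1, opt):
    t = torch.rand(z0.shape[0], 1)                  # one time per pair, in [0, 1]
    x_t = (1 - t) * z0 + t * z1                     # point on the straight path
    loss = nn.functional.mse_loss(v_net(x_t, t), z1 - z0)   # match the constant velocity
    opt.zero_grad(); loss.backward(); opt.step()
    return loss.item()

v_net = Velocity(dim=8)
opt = torch.optim.Adam(v_net.parameters(), lr=1e-3)
z0, z1 = torch.randn(32, 8), torch.randn(32, 8)     # stand-ins for paired noise/images
reflow_step(v_net, z0, z1, opt)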
+
+
+
+
+ + ☆ Using Reed-Muller Codes for Classification with Rejection and Recovery + + +
+ When deploying classifiers in the real world, users expect them to respond to +inputs appropriately. However, traditional classifiers are not equipped to +handle inputs which lie far from the distribution they were trained on. +Malicious actors can exploit this defect by making adversarial perturbations +designed to cause the classifier to give an incorrect output. +Classification-with-rejection methods attempt to solve this problem by allowing +networks to refuse to classify an input in which they have low confidence. This +works well for strongly adversarial examples, but also leads to the rejection +of weakly perturbed images, which intuitively could be correctly classified. To +address these issues, we propose Reed-Muller Aggregation Networks (RMAggNet), a +classifier inspired by Reed-Muller error-correction codes which can correct and +reject inputs. This paper shows that RMAggNet can minimise incorrectness while +maintaining good correctness over multiple adversarial attacks at different +perturbation budgets by leveraging the ability to correct errors in the +classification process. This provides an alternative +classification-with-rejection method which can reduce the amount of additional +processing in situations where a small number of incorrect classifications are +permissible. + +
+
+ comment: 38 pages, 7 figures +
+
+
+
+
+ + ☆ Generalized Regret Analysis of Thompson Sampling using Fractional + Posteriors + + +
+ Thompson sampling (TS) is one of the most popular and earliest algorithms to +solve stochastic multi-armed bandit problems. We consider a variant of TS, +named $\alpha$-TS, where we use a fractional or $\alpha$-posterior +($\alpha\in(0,1)$) instead of the standard posterior distribution. To compute +an $\alpha$-posterior, the likelihood in the definition of the standard +posterior is tempered with a factor $\alpha$. For $\alpha$-TS we obtain both +instance-dependent $\mathcal{O}\left(\sum_{k \neq i^*} +\Delta_k\left(\frac{\log(T)}{C(\alpha)\Delta_k^2} + \frac{1}{2} \right)\right)$ +and instance-independent $\mathcal{O}(\sqrt{KT\log K})$ frequentist regret +bounds under very mild conditions on the prior and reward distributions, where +$\Delta_k$ is the gap between the true mean rewards of the $k^{th}$ and the +best arms, and $C(\alpha)$ is a known constant. Both the sub-Gaussian and +exponential family models satisfy our general conditions on the reward +distribution. Our conditions on the prior distribution just require its density +to be positive, continuous, and bounded. We also establish another +instance-dependent regret upper bound that matches (up to constants) to that of +improved UCB [Auer and Ortner, 2010]. Our regret analysis carefully combines +recent theoretical developments in the non-asymptotic concentration analysis +and Bernstein-von Mises type results for the $\alpha$-posterior distribution. +Moreover, our analysis does not require additional structural properties such +as closed-form posteriors or conjugate priors. + +
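For intuition, here is a minimal sketch of how the $\alpha$-posterior changes Thompson sampling in the Bernoulli/Beta case: tempering the likelihood by $\alpha$ simply scales the posterior counts. The arm means, horizon and $\alpha$ below are illustrative values, not taken from the paper.

import numpy as np

rng = np.random.default_rng(0)
true_means = np.array([0.3, 0.5, 0.7])     # unknown Bernoulli arm means (illustrative)
K, T, alpha = len(true_means), 10_000, 0.8
a0, b0 = 1.0, 1.0                          # Beta(1, 1) prior
S, F = np.zeros(K), np.zeros(K)            # successes / failures per arm

for t in range(T):
    # alpha-posterior: the likelihood is tempered by alpha, so the counts are scaled
    theta = rng.beta(a0 + alpha * S, b0 + alpha * F)
    k = int(np.argmax(theta))
    reward = rng.random() < true_means[k]
    S[k] += reward
    F[k] += 1 - reward

print("pulls per arm:", S + F)             # most pulls should go to the best arm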
+
+
+
+
+ + ☆ Band-gap regression with architecture-optimized message-passing neural + networks + + +
+ Graph-based neural networks and, specifically, message-passing neural +networks (MPNNs) have shown great potential in predicting physical properties +of solids. In this work, we train an MPNN to first classify materials through +density functional theory data from the AFLOW database as being metallic or +semiconducting/insulating. We then perform a neural-architecture search to +explore the model architecture and hyperparameter space of MPNNs to predict the +band gaps of the materials identified as non-metals. The parameters in the +search include the number of message-passing steps, latent size, and +activation-function, among others. The top-performing models from the search +are pooled into an ensemble that significantly outperforms existing models from +the literature. Uncertainty quantification is evaluated with Monte-Carlo +Dropout and ensembling, with the ensemble method proving superior. The domain +of applicability of the ensemble model is analyzed with respect to the crystal +systems, the inclusion of a Hubbard parameter in the density functional +calculations, and the atomic species building up the materials. + +
+
+
+
+
+ + ☆ Learning Minimalistic Tsetlin Machine Clauses with Markov + Boundary-Guided Pruning + + +
+ A set of variables is the Markov blanket of a random variable if it contains all the information needed for predicting the variable. If the blanket cannot be reduced without losing useful information, it is called a Markov boundary. Identifying the Markov boundary of a random variable is advantageous because all variables outside the boundary are superfluous. Hence, the Markov boundary provides an optimal feature set. However, learning the Markov boundary from data is challenging for two reasons. If one or more variables are removed from the Markov boundary, variables outside the boundary may start providing information. Conversely, variables within the boundary may stop providing information. The true role of each candidate variable only manifests itself once the Markov boundary has been identified. In this paper, we propose a new Tsetlin Machine (TM) feedback scheme that supplements Type I and Type II feedback. The scheme introduces a novel Finite State Automaton - a Context-Specific Independence Automaton. The automaton learns which features are outside the Markov boundary of the target, allowing them to be pruned from the TM during learning. We investigate the new scheme empirically, showing how it is capable of exploiting context-specific independence to find Markov boundaries. Further, we provide a theoretical analysis of convergence. Our approach thus connects the field of Bayesian networks (BN) with TMs, potentially opening up synergies when it comes to inference and learning, including TM-produced Bayesian knowledge bases and TM-based Bayesian inference.
+
+ comment: Accepted to ISTM2023, 8 pages, 6 figures +
+
+
+
+
+ + ☆ Semantic and Articulated Pedestrian Sensing Onboard a Moving Vehicle + + +
+ It is difficult to perform 3D reconstruction from on-vehicle gathered video due to the large forward motion of the vehicle. Even object detection and human sensing models perform significantly worse on onboard videos than on standard benchmarks, because objects often appear far away from the camera, image quality is frequently degraded by motion blur, and occlusions are common. This has led to the popularisation of traffic data-specific benchmarks. Recently, Light Detection And Ranging (LiDAR) sensors have become popular for directly estimating depths without the need to perform 3D reconstruction. However, LiDAR-based methods still lag behind image-based methods in articulated human detection at a distance. We hypothesize that benchmarks targeted at articulated human sensing from LiDAR data could bring about increased research in human sensing and prediction in traffic and could lead to improved traffic safety for pedestrians.
+
+
+
+
+ + ☆ Modeling Supply and Demand in Public Transportation Systems + + +
+ The Harrisonburg Department of Public Transportation (HDPT) aims to leverage +their data to improve the efficiency and effectiveness of their operations. We +construct two supply and demand models that help the department identify gaps +in their service. The models take many variables into account, including the +way that the HDPT reports to the federal government and the areas with the most +vulnerable populations in Harrisonburg City. We employ data analysis and +machine learning techniques to make our predictions. + +
+
+ comment: 28 pages, 2022 REU project at James Madison University +
+
+
+
+
+ + ☆ Transferability analysis of data-driven additive manufacturing + knowledge: a case study between powder bed fusion and directed energy + deposition + + +
+ Data-driven research in Additive Manufacturing (AM) has gained significant success in recent years. This has led to a plethora of scientific literature. The knowledge in these works consists of AM and Artificial Intelligence (AI) contexts that have not been mined and formalized in an integrated way. Moreover, no tools or guidelines exist to support data-driven knowledge transfer from one context to another. As a result, data-driven solutions using specific AI techniques are being developed and validated only for specific AM process technologies. There is a potential to exploit the inherent similarities across various AM technologies and adapt the existing solutions from one process or problem to another using AI, such as Transfer Learning. We propose a three-step knowledge transferability analysis framework in AM to support data-driven AM knowledge transfer. As a prerequisite to transferability analysis, AM knowledge is featurized into identified knowledge components. The framework consists of pre-transfer, transfer, and post-transfer steps to accomplish knowledge transfer. A case study is conducted between flagship metal AM processes. Laser Powder Bed Fusion (LPBF) is the source of knowledge, motivated by its relative maturity in applying AI compared to Directed Energy Deposition (DED), which drives the need for knowledge transfer as the less explored target process. We show successful transfer at different levels of the data-driven solution, including data representation, model architecture, and model parameters. The pipeline of AM knowledge transfer can be automated in the future to allow efficient cross-context or cross-process knowledge exchange.
+
+ comment: 11 pages, 7 figures. This paper has been accepted to be published in + the proceedings of IDETC-CIE 2023 +
+
+
+
+
+ + ☆ ELRA: Exponential learning rate adaption gradient descent optimization + method + + +
+ We present a novel, fast (exponential rate adaption), ab initio (hyper-parameter-free) gradient-based optimizer. The main idea of the method is to adapt the learning rate $\alpha$ by situational awareness, mainly striving for orthogonal neighboring gradients. The method has a high success and fast convergence rate and does not rely on hand-tuned parameters, giving it greater universality. It can be applied to problems of any dimension n and scales only linearly (of order O(n)) with the dimension of the problem. It optimizes convex and non-convex continuous landscapes that provide some kind of gradient. In contrast to the Ada-family (AdaGrad, AdaMax, AdaDelta, Adam, etc.) the method is rotation invariant: optimization path and performance are independent of coordinate choices. The impressive performance is demonstrated by extensive experiments on the MNIST benchmark data-set against state-of-the-art optimizers. We name this new class of optimizers after its core idea: Exponential Learning Rate Adaption - ELRA. We present it in two variants, c2min and p2min, with slightly different control. The authors strongly believe that ELRA will open a completely new research direction for gradient descent optimization.
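The following is only one plausible reading of the stated core idea (adapting the learning rate from the angle between neighboring gradients) on a toy quadratic; it is not the authors' c2min or p2min update rule, and the exponential scaling factor is an assumption made for illustration.

import numpy as np

def grad(w):                               # toy objective f(w) = ||w||^2
    return 2.0 * w

w = np.array([3.0, -2.0])
lr, prev_g = 0.1, None
for _ in range(100):
    g = grad(w)
    if prev_g is not None:
        cos = g @ prev_g / (np.linalg.norm(g) * np.linalg.norm(prev_g) + 1e-12)
        lr *= np.exp(cos)                  # aligned gradients -> grow lr, opposed -> shrink
    w = w - lr * g
    prev_g = g
print(w)                                    # approaches the minimum at the origin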
+
+ comment: 9 pages, 11 figures +
+
+
+
+
+ + ☆ ssVERDICT: Self-Supervised VERDICT-MRI for Enhanced Prostate Tumour + Characterisation + + +
+ MRI is increasingly being used in the diagnosis of prostate cancer (PCa), +with diffusion MRI (dMRI) playing an integral role. When combined with +computational models, dMRI can estimate microstructural information such as +cell size. Conventionally, such models are fit with a nonlinear least squares +(NLLS) curve fitting approach, associated with a high computational cost. +Supervised deep neural networks (DNNs) are an efficient alternative, however +their performance is significantly affected by the underlying distribution of +the synthetic training data. Self-supervised learning is an attractive +alternative, where instead of using a separate training dataset, the network +learns the features of the input data itself. This approach has only been +applied to fitting of trivial dMRI models thus far. Here, we introduce a +self-supervised DNN to estimate the parameters of the VERDICT (Vascular, +Extracellular and Restricted DIffusion for Cytometry in Tumours) model for +prostate. We demonstrate, for the first time, fitting of a complex +three-compartment biophysical model with machine learning without the +requirement of explicit training labels. We compare the estimation performance +to baseline NLLS and supervised DNN methods, observing improvement in +estimation accuracy and reduction in bias with respect to ground truth values. +Our approach also achieves a higher confidence level for discrimination between +cancerous and benign prostate tissue in comparison to the other methods on a +dataset of 20 PCa patients, indicating potential for accurate tumour +characterisation. + +
+
+ comment: 12 pages, 5 figures. This work has been submitted to the IEEE for + possible publication. Copyright may be transferred without notice, after + which this version may no longer be accessible +
+
+
+
+
+ + ☆ Toward Discretization-Consistent Closure Schemes for Large Eddy + Simulation Using Reinforcement Learning + + +
+ We propose a novel method for developing discretization-consistent closure schemes for implicitly filtered Large Eddy Simulation (LES). In implicitly filtered LES, the induced filter kernel, and thus the closure terms, are determined by the properties of the grid and the discretization operator, leading to additional computational subgrid terms that are generally unknown in a priori analysis. Therefore, the task of adapting the coefficients of LES closure models is formulated as a Markov decision process and solved in an a posteriori manner with Reinforcement Learning (RL). This allows the model to be adjusted to the actual discretization, since the optimization also incorporates the interaction between the discretization and the model itself. This optimization framework is applied to both explicit and implicit closure models. An element-local eddy viscosity model is optimized as the explicit model. For the implicit modeling, RL is applied to identify an optimal blending strategy for a hybrid discontinuous Galerkin (DG) and finite volume scheme. All newly derived models achieve accurate and consistent results, either matching or outperforming classical state-of-the-art models for different discretizations and resolutions. Moreover, the explicit model is demonstrated to adapt its distribution of viscosity within the DG elements to the inhomogeneous discretization properties of the operator. In the implicit case, the optimized hybrid scheme proves to be a viable modeling ansatz that could initiate a new class of high order schemes for compressible turbulence. Overall, the results demonstrate that the proposed RL optimization can provide discretization-consistent closures that could reduce the uncertainty in implicitly filtered LES.
+
+ comment: 24 pages, 14 figures +
+
+
+
+
+ + ☆ Speciality vs Generality: An Empirical Study on Catastrophic Forgetting + in Fine-tuning Foundation Models + + +
+ Foundation models, including Vision Language Models (VLMs) and Large Language Models (LLMs), possess the $generality$ to handle diverse distributions and tasks, which stems from their extensive pre-training datasets. The fine-tuning of foundation models is a common practice to enhance task performance or align the model's behavior with human expectations, allowing them to gain $speciality$. However, the small datasets used for fine-tuning may not adequately cover the diverse distributions and tasks encountered during pre-training. Consequently, the pursuit of speciality during fine-tuning can lead to a loss of {generality} in the model, which is related to catastrophic forgetting (CF) in deep learning. In this study, we demonstrate this phenomenon in both VLMs and LLMs. For instance, fine-tuning VLMs like CLIP on ImageNet results in a loss of generality in handling diverse distributions, and fine-tuning LLMs like Galactica in the medical domain leads to a loss in following instructions and common sense. To address the trade-off between speciality and generality, we investigate multiple regularization methods from continual learning; the weight averaging method (Wise-FT) from out-of-distribution (OOD) generalization, which interpolates parameters between pre-trained and fine-tuned models; and parameter-efficient fine-tuning methods like Low-Rank Adaptation (LoRA). Our findings show that both the continual learning and Wise-FT methods effectively mitigate the loss of generality, with Wise-FT exhibiting the strongest performance in balancing speciality and generality.
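For reference, the weight averaging mentioned above (Wise-FT) amounts to a simple interpolation of checkpoints. A minimal sketch, with the checkpoint filenames being hypothetical placeholders:

import torch

def wise_ft(pretrained_state, finetuned_state, alpha=0.5):
    """Return (1 - alpha) * pretrained + alpha * finetuned for every parameter tensor.
    Non-float buffers (e.g. batch-norm counters) may need to be copied instead."""
    return {
        k: (1 - alpha) * pretrained_state[k] + alpha * finetuned_state[k]
        for k in pretrained_state
    }

# usage (hypothetical checkpoint files):
# interpolated = wise_ft(torch.load("clip_pretrained.pt"),
#                        torch.load("clip_finetuned.pt"), alpha=0.5)
# model.load_state_dict(interpolated)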
+
+ comment: 30 Pages +
+
+
+
+
+ + ☆ Enhancing Multi-modal Cooperation via Fine-grained Modality Valuation + + +
+ One primary topic of multi-modal learning is to jointly incorporate heterogeneous information from different modalities. However, most models often suffer from unsatisfactory multi-modal cooperation and cannot jointly utilize all modalities well. Some methods have been proposed to identify and enhance the worst-learnt modality, but they rarely provide a fine-grained, theoretically supported observation of multi-modal cooperation at the sample level. Hence, it is essential to reasonably observe and improve the fine-grained cooperation between modalities, especially when facing realistic scenarios where the modality discrepancy could vary across different samples. To this end, we introduce a fine-grained modality valuation metric to evaluate the contribution of each modality at the sample level. Via modality valuation, we observe that multi-modal models tend to rely on one specific modality, leaving the other modalities low-contributing. We further analyze this issue and improve cooperation between modalities by enhancing the discriminative ability of low-contributing modalities in a targeted manner. Overall, our method reasonably observes the fine-grained uni-modal contribution at the sample level and achieves considerable improvement on different multi-modal models.
+
+ comment: 7 pages +
+
+
+
+
+ + ☆ Rethinking Evaluation Metric for Probability Estimation Models Using + Esports Data + + +
+ Probability estimation models play an important role in various fields, such as weather forecasting, recommendation systems, and sports analysis. Among the models estimating probabilities, it is difficult to evaluate which one gives reliable probabilities since the ground-truth probabilities are not available. The win probability estimation model for esports, which calculates the win probability under a certain game state, is also one of the fields being actively studied in probability estimation. However, most previous works evaluated their models using accuracy, a metric that can only measure discrimination performance. In this work, we first investigate the Brier score and the Expected Calibration Error (ECE) as replacements for accuracy as the performance evaluation metric for win probability estimation models in the esports field. Based on this analysis, we propose a novel metric called the Balance score, which is a simple yet effective metric in terms of six desirable properties that a probability estimation metric should have. Under general conditions, we also find that the Balance score can be an effective approximation of the true expected calibration error, which ECE only imperfectly approximates via binning. Extensive evaluations using simulation studies and real game snapshot data demonstrate the promising potential of adopting the proposed metric not only for win probability estimation models in esports but also for evaluating general probability estimation models.
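For context, the two baseline metrics discussed above can be computed as follows for a binary win-probability model (the proposed Balance score itself is not reproduced here); the binning scheme and the toy data are illustrative.

import numpy as np

def brier_score(p, y):
    """Mean squared error between predicted win probability p and outcome y in {0, 1}."""
    return float(np.mean((p - y) ** 2))

def expected_calibration_error(p, y, n_bins=10):
    """Binned ECE: |empirical win rate - mean predicted probability| per bin, weighted by bin size."""
    bins = np.minimum((p * n_bins).astype(int), n_bins - 1)
    ece = 0.0
    for b in range(n_bins):
        mask = bins == b
        if mask.any():
            ece += mask.mean() * abs(y[mask].mean() - p[mask].mean())
    return float(ece)

p = np.array([0.9, 0.8, 0.3, 0.6])       # predicted win probabilities
y = np.array([1, 1, 0, 0])               # observed outcomes
print(brier_score(p, y), expected_calibration_error(p, y, n_bins=5))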
+
+ comment: 7 pages +
+
+
+
+
+ + ☆ Consistency and adaptivity are complementary targets for the validation + of variance-based uncertainty quantification metrics in machine learning + regression tasks + + +
+ Reliable uncertainty quantification (UQ) in machine learning (ML) regression tasks is becoming the focus of many studies in materials and chemical science. It is now well understood that average calibration is insufficient, and most studies implement additional methods testing the conditional calibration with respect to uncertainty, i.e. consistency. Consistency is assessed mostly by so-called reliability diagrams. There is, however, another target beyond average calibration: conditional calibration with respect to input features, i.e. adaptivity. In practice, adaptivity is the main concern of the final users of an ML-UQ method, who seek reliable predictions and uncertainties for any point in feature space. This article aims to show that consistency and adaptivity are complementary validation targets, and that good consistency does not imply good adaptivity. Adapted validation methods are proposed and illustrated on a representative example.
+
+ comment: arXiv admin note: text overlap with arXiv:2303.07170 +
+
+
+
+
+ + ☆ Risk-Aware Reinforcement Learning through Optimal Transport Theory + + +
+ In the dynamic and uncertain environments where reinforcement learning (RL) +operates, risk management becomes a crucial factor in ensuring reliable +decision-making. Traditional RL approaches, while effective in reward +optimization, often overlook the landscape of potential risks. In response, +this paper pioneers the integration of Optimal Transport (OT) theory with RL to +create a risk-aware framework. Our approach modifies the objective function, +ensuring that the resulting policy not only maximizes expected rewards but also +respects risk constraints dictated by OT distances between state visitation +distributions and the desired risk profiles. By leveraging the mathematical +precision of OT, we offer a formulation that elevates risk considerations +alongside conventional RL objectives. Our contributions are substantiated with +a series of theorems, mapping the relationships between risk distributions, +optimal value functions, and policy behaviors. Through the lens of OT, this +work illuminates a promising direction for RL, ensuring a balanced fusion of +reward pursuit and risk awareness. + +
+
+
+
+
+ + ☆ The first step is the hardest: Pitfalls of Representing and Tokenizing + Temporal Data for Large Language Models + + +
+ Large Language Models (LLMs) have demonstrated remarkable generalization +across diverse tasks, leading individuals to increasingly use them as personal +assistants and universal computing engines. Nevertheless, a notable obstacle +emerges when feeding numerical/temporal data into these models, such as data +sourced from wearables or electronic health records. LLMs employ tokenizers in +their input that break down text into smaller units. However, tokenizers are +not designed to represent numerical values and might struggle to understand +repetitive patterns and context, treating consecutive values as separate tokens +and disregarding their temporal relationships. Here, we discuss recent works +that employ LLMs for human-centric tasks such as in mobile health sensing and +present a case study showing that popular LLMs tokenize temporal data +incorrectly. To address that, we highlight potential solutions such as prompt +tuning with lightweight embedding layers as well as multimodal adapters, that +can help bridge this "modality gap". While the capability of language models to +generalize to other modalities with minimal or no finetuning is exciting, this +paper underscores the fact that their outputs cannot be meaningful if they +stumble over input nuances. + +
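A quick way to see the tokenization issue described above is to run a standard subword tokenizer over a numeric series; GPT-2 is used here purely as an example tokenizer, and the sample values are made up.

from transformers import AutoTokenizer   # requires the `transformers` package

tok = AutoTokenizer.from_pretrained("gpt2")
series = "72, 72, 73, 75, 120, 74"       # e.g. heart-rate samples from a wearable
print(tok.tokenize(series))
# Token boundaries follow subword frequency rather than numeric structure, so
# numerically adjacent readings can map to quite different token sequences and
# the temporal pattern of the series is not preserved in the token stream.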
+
+ comment: Accepted at the Generative AI for Pervasive Computing Symposium + (GenAI4PC) at UbiComp 2023 +
+
+
+
+
+ + ☆ A Consistent and Scalable Algorithm for Best Subset Selection in Single + Index Models + + +
+ Analysis of high-dimensional data has led to increased interest in both +single index models (SIMs) and best subset selection. SIMs provide an +interpretable and flexible modeling framework for high-dimensional data, while +best subset selection aims to find a sparse model from a large set of +predictors. However, best subset selection in high-dimensional models is known +to be computationally intractable. Existing methods tend to relax the +selection, but do not yield the best subset solution. In this paper, we +directly tackle the intractability by proposing the first provably scalable +algorithm for best subset selection in high-dimensional SIMs. Our algorithmic +solution enjoys the subset selection consistency and has the oracle property +with a high probability. The algorithm comprises a generalized information +criterion to determine the support size of the regression coefficients, +eliminating the model selection tuning. Moreover, our method does not assume an +error distribution or a specific link function and hence is flexible to apply. +Extensive simulation results demonstrate that our method is not only +computationally efficient but also able to exactly recover the best subset in +various settings (e.g., linear regression, Poisson regression, heteroscedastic +models). + +
+
+
+
+
+ + ☆ Long-term drought prediction using deep neural networks based on + geospatial weather data + + +
+ The accurate prediction of drought probability in specific regions is crucial for informed decision-making in agricultural practices. It is important to make predictions one year in advance, particularly for long-term decisions. However, forecasting this probability presents challenges due to the complex interplay of various factors within the region of interest and neighboring areas. In this study, we propose an end-to-end solution to address this issue based on various spatiotemporal neural networks. The models considered focus on predicting the drought intensity based on the Palmer Drought Severity Index (PDSI) for subregions of interest, leveraging intrinsic factors and insights from climate models to enhance drought predictions. Comparative evaluations demonstrate the superior accuracy of Convolutional LSTM (ConvLSTM) and transformer models compared to baseline gradient boosting and logistic regression solutions. These two models achieved impressive ROC AUC scores, ranging from 0.90 down to 0.70 for forecast horizons from one to six months, outperforming the baseline models. The transformer showed superiority for shorter horizons, while ConvLSTM did so for longer horizons. Thus, we recommend selecting the model accordingly for long-term drought forecasting. To ensure the broad applicability of the considered models, we conduct extensive validation across regions worldwide, considering different environmental conditions. We also run several ablation and sensitivity studies to challenge our findings and provide additional information on how to solve the problem.
+
+
+
+
+ + ☆ Optimization Guarantees of Unfolded ISTA and ADMM Networks With Smooth + Soft-Thresholding + + +
+ Solving linear inverse problems plays a crucial role in numerous +applications. Algorithm unfolding based, model-aware data-driven approaches +have gained significant attention for effectively addressing these problems. +Learned iterative soft-thresholding algorithm (LISTA) and alternating direction +method of multipliers compressive sensing network (ADMM-CSNet) are two widely +used such approaches, based on ISTA and ADMM algorithms, respectively. In this +work, we study optimization guarantees, i.e., achieving near-zero training loss +with the increase in the number of learning epochs, for finite-layer unfolded +networks such as LISTA and ADMM-CSNet with smooth soft-thresholding in an +over-parameterized (OP) regime. We achieve this by leveraging a modified +version of the Polyak-Lojasiewicz, denoted PL$^*$, condition. Satisfying the +PL$^*$ condition within a specific region of the loss landscape ensures the +existence of a global minimum and exponential convergence from initialization +using gradient descent based methods. Hence, we provide conditions, in terms of +the network width and the number of training samples, on these unfolded +networks for the PL$^*$ condition to hold. We achieve this by deriving the +Hessian spectral norm of these networks. Additionally, we show that the +threshold on the number of training samples increases with the increase in the +network width. Furthermore, we compare the threshold on training samples of +unfolded networks with that of a standard fully-connected feed-forward network +(FFNN) with smooth soft-thresholding non-linearity. We prove that unfolded +networks have a higher threshold value than FFNN. Consequently, one can expect +a better expected error for unfolded networks than FFNN. + +
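For readers unfamiliar with the architecture analyzed above, below is a compact unfolded-ISTA (LISTA-style) layer using a softplus-based smooth soft-thresholding; it is a generic re-implementation under our own assumptions, not the specific networks studied in the paper.

import torch
import torch.nn as nn
import torch.nn.functional as F

def smooth_soft_threshold(x, lam, beta=10.0):
    # softplus-based surrogate: tends to sign(x) * max(|x| - lam, 0) as beta -> infinity
    return (F.softplus(beta * (x - lam)) - F.softplus(beta * (-x - lam))) / beta

class LISTA(nn.Module):
    def __init__(self, m, n, n_layers=5):
        super().__init__()
        self.We = nn.Linear(m, n, bias=False)        # maps measurement y to signal space
        self.S = nn.Linear(n, n, bias=False)         # recurrent mixing of the estimate
        self.lam = nn.Parameter(torch.tensor(0.1))   # learned threshold
        self.n_layers = n_layers

    def forward(self, y):
        x = smooth_soft_threshold(self.We(y), self.lam)
        for _ in range(self.n_layers - 1):
            x = smooth_soft_threshold(self.We(y) + self.S(x), self.lam)
        return x

x_hat = LISTA(m=20, n=50)(torch.randn(8, 20))        # sparse-code estimates for a toy batch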
+
+
+
+
+ + ☆ Assessing the Generalization Gap of Learning-Based Speech Enhancement + Systems in Noisy and Reverberant Environments + + +
+ The acoustic variability of noisy and reverberant speech mixtures is influenced by multiple factors, such as the spectro-temporal characteristics of the target speaker and the interfering noise, the signal-to-noise ratio (SNR) and the room characteristics. This large variability poses a major challenge for learning-based speech enhancement systems, since a mismatch between the training and testing conditions can substantially reduce the performance of the system. Generalization to unseen conditions is typically assessed by testing the system with a new speech, noise or binaural room impulse response (BRIR) database different from the one used during training. However, the difficulty of the speech enhancement task can change across databases, which can substantially influence the results. The present study introduces a generalization assessment framework that uses a reference model trained on the test condition, such that it can be used as a proxy for the difficulty of the test condition. This makes it possible to disentangle the effect of the change in task difficulty from the effect of dealing with new data, and thus to define a new measure of generalization performance termed the generalization gap. The procedure is repeated in a cross-validation fashion by cycling through multiple speech, noise, and BRIR databases to accurately estimate the generalization gap. The proposed framework is applied to evaluate the generalization potential of a feedforward neural network (FFNN), Conv-TasNet, DCCRN and MANNER. We find that for all models, the performance degrades the most under speech mismatches, while good noise and room generalization can be achieved by training on multiple databases. Moreover, while recent models show higher performance in matched conditions, their performance substantially decreases in mismatched conditions and can become inferior to that of the FFNN-based system.
+
+ comment: This work has been submitted to the IEEE for possible publication. + Copyright may be transferred without notice, after which this version may no + longer be accessible +
+
+
+
+
+ + ☆ Efficient Memory Management for Large Language Model Serving with + PagedAttention SOSP 2023 + + +
+ High throughput serving of large language models (LLMs) requires batching +sufficiently many requests at a time. However, existing systems struggle +because the key-value cache (KV cache) memory for each request is huge and +grows and shrinks dynamically. When managed inefficiently, this memory can be +significantly wasted by fragmentation and redundant duplication, limiting the +batch size. To address this problem, we propose PagedAttention, an attention +algorithm inspired by the classical virtual memory and paging techniques in +operating systems. On top of it, we build vLLM, an LLM serving system that +achieves (1) near-zero waste in KV cache memory and (2) flexible sharing of KV +cache within and across requests to further reduce memory usage. Our +evaluations show that vLLM improves the throughput of popular LLMs by +2-4$\times$ with the same level of latency compared to the state-of-the-art +systems, such as FasterTransformer and Orca. The improvement is more pronounced +with longer sequences, larger models, and more complex decoding algorithms. +vLLM's source code is publicly available at +https://github.com/vllm-project/vllm + +
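The core bookkeeping idea can be pictured with a toy block allocator: KV-cache memory is handed out in fixed-size blocks tracked by a per-request block table, so a request only holds the blocks it has actually filled. This is a conceptual sketch of the paging analogy only, not vLLM's implementation (which also supports sharing blocks within and across requests).

BLOCK_SIZE = 16                               # tokens per KV-cache block

class BlockManager:
    def __init__(self, num_blocks):
        self.free = list(range(num_blocks))   # physical block ids not yet in use
        self.tables = {}                      # request id -> list of block ids
        self.lengths = {}                     # request id -> number of tokens stored

    def append_token(self, req_id):
        """Reserve space for one more token; allocate a new block only when needed."""
        n = self.lengths.get(req_id, 0)
        if n % BLOCK_SIZE == 0:               # current block is full (or first token)
            if not self.free:
                raise MemoryError("KV cache exhausted; request must be preempted")
            self.tables.setdefault(req_id, []).append(self.free.pop())
        self.lengths[req_id] = n + 1

    def release(self, req_id):
        self.free.extend(self.tables.pop(req_id, []))
        self.lengths.pop(req_id, None)

mgr = BlockManager(num_blocks=4)
for _ in range(20):
    mgr.append_token("req-0")                 # 20 tokens -> only 2 blocks of 16 allocated
print(mgr.tables["req-0"])
mgr.release("req-0")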
+
+ comment: SOSP 2023 +
+
+
+
+
+ + ☆ Elucidating the solution space of extended reverse-time SDE for + diffusion models + + +
+ Diffusion models (DMs) demonstrate potent image generation capabilities in +various generative modeling tasks. Nevertheless, their primary limitation lies +in slow sampling speed, requiring hundreds or thousands of sequential function +evaluations through large neural networks to generate high-quality images. +Sampling from DMs can be seen as solving corresponding stochastic differential +equations (SDEs) or ordinary differential equations (ODEs). In this work, we +formulate the sampling process as an extended reverse-time SDE (ER SDE), +unifying prior explorations into ODEs and SDEs. Leveraging the semi-linear +structure of ER SDE solutions, we offer exact solutions and arbitrarily +high-order approximate solutions for VP SDE and VE SDE, respectively. Based on +the solution space of the ER SDE, we yield mathematical insights elucidating +the superior performance of ODE solvers over SDE solvers in terms of fast +sampling. Additionally, we unveil that VP SDE solvers stand on par with their +VE SDE counterparts. Finally, we devise fast and training-free samplers, ER-SDE +Solvers, elevating the efficiency of stochastic samplers to unprecedented +levels. Experimental results demonstrate achieving 3.45 FID in 20 function +evaluations and 2.24 FID in 50 function evaluations on the ImageNet +64$\times$64 dataset. + +
+
+
+
+
+ + ☆ Certified Robust Models with Slack Control and Large Lipschitz Constants + + +
+ Despite recent success, state-of-the-art learning-based models remain highly vulnerable to input changes such as adversarial examples. In order to obtain certifiable robustness against such perturbations, recent work considers Lipschitz-based regularizers or constraints while at the same time increasing prediction margin. Unfortunately, this comes at the cost of significantly decreased accuracy. In this paper, we propose a Calibrated Lipschitz-Margin Loss (CLL) that addresses this issue and improves certified robustness by tackling two problems: Firstly, commonly used margin losses do not adjust the penalties to the shrinking output distribution caused by minimizing the Lipschitz constant $K$. Secondly, and most importantly, we observe that minimization of $K$ can lead to overly smooth decision functions. This limits the model's complexity and thus reduces accuracy. Our CLL addresses these issues by explicitly calibrating the loss w.r.t. margin and Lipschitz constant, thereby establishing full control over slack and improving robustness certificates even with larger Lipschitz constants. On CIFAR-10, CIFAR-100 and Tiny-ImageNet, our models consistently outperform losses that leave the constant unattended. On CIFAR-100 and Tiny-ImageNet, CLL improves upon state-of-the-art deterministic $L_2$ robust accuracies. In contrast to current trends, we unlock the potential of much smaller models without $K=1$ constraints.
+
+ comment: To be published at GCPR 2023 +
+
+
+
+
+ + ☆ Robust-MBDL: A Robust Multi-branch Deep Learning Based Model for + Remaining Useful Life Prediction and Operational Condition Identification of + Rotating Machines + + +
+ In this paper, a Robust Multi-branch Deep learning-based system for remaining useful life (RUL) prediction and operational condition (CO) identification of rotating machines is proposed. In particular, the proposed system comprises three main components: (1) an LSTM-Autoencoder to denoise the vibration data; (2) a feature extraction module to generate time-domain, frequency-domain, and time-frequency based features from the denoised data; (3) a novel and robust multi-branch deep learning network architecture to exploit the multiple features. The performance of our proposed system was evaluated and compared to state-of-the-art systems on the two benchmark datasets XJTU-SY and PRONOSTIA. The experimental results show that our proposed system outperforms the state-of-the-art systems and presents potential for real-life applications on bearing machines.
+
+
+
+
+ + ☆ Towards Reliable Domain Generalization: A New Dataset and Evaluations + + +
+ There are ubiquitous distribution shifts in the real world. However, deep neural networks (DNNs) are easily biased towards the training set, which causes severe performance degradation when they receive out-of-distribution data. Many methods have been studied to train models that generalize under various distribution shifts in the literature on domain generalization (DG). However, the recent DomainBed and WILDS benchmarks challenged the effectiveness of these methods. To address the problems in existing research, we propose a new domain generalization task for handwritten Chinese character recognition (HCCR) to enrich the application scenarios of DG method research. We evaluate eighteen DG methods on the proposed PaHCC (Printed and Handwritten Chinese Characters) dataset and show that the performance of existing methods on this dataset is still unsatisfactory. Besides, under a designed dynamic DG setting, we reveal more properties of DG methods and argue that the leave-one-domain-out protocol alone is unreliable. We advocate that researchers in the DG community refer to the dynamic performance of methods for a more comprehensive and reliable evaluation. Our dataset and evaluations bring new perspectives to the community for more substantial progress. We will make our dataset public upon publication of the article to facilitate the study of domain generalization.
+
+
+
+
+ + ☆ Accelerating Edge AI with Morpher: An Integrated Design, Compilation and + Simulation Framework for CGRAs + + +
+ Coarse-Grained Reconfigurable Arrays (CGRAs) hold great promise as +power-efficient edge accelerator, offering versatility beyond AI applications. +Morpher, an open-source, architecture-adaptive CGRA design framework, is +specifically designed to explore the vast design space of CGRAs. The +comprehensive ecosystem of Morpher includes a tailored compiler, simulator, +accelerator synthesis, and validation framework. This study provides an +overview of Morpher, highlighting its capabilities in automatically compiling +AI application kernels onto user-defined CGRA architectures and verifying their +functionality. Through the Morpher framework, the versatility of CGRAs is +harnessed to facilitate efficient compilation and verification of edge AI +applications, covering important kernels representative of a wide range of +embedded AI workloads. Morpher is available online at +https://github.com/ecolab-nus/morpher-v2. + +
+
+ comment: This work was accepted by the Workshop on Compilers, Deployment, and + Tooling for Edge AI (CODAI 2023), co-hosted at Embedded Systems Week on + September 21st, 2023 +
+
+
+
+
+ + ☆ AstroLLaMA: Towards Specialized Foundation Models in Astronomy AACL 2023 + + +
+ Large language models excel in many human-language tasks but often falter in highly specialized domains like scholarly astronomy. To bridge this gap, we introduce AstroLLaMA, a 7-billion-parameter model fine-tuned from LLaMA-2 using over 300,000 astronomy abstracts from arXiv. Optimized for traditional causal language modeling, AstroLLaMA achieves a 30% lower perplexity than Llama-2, showing marked domain adaptation. Our model generates more insightful and scientifically relevant text completions and embedding extractions than state-of-the-art foundation models despite having significantly fewer parameters. AstroLLaMA serves as a robust, domain-specific model with broad fine-tuning potential. Its public release aims to spur astronomy-focused research, including automatic paper summarization and conversational agent development.
+
+ comment: 6 pages, 3 figures, submitted to IJCNLP-AACL 2023. Comments are + welcome. The model can be found on Hugging Face - + https://huggingface.co/universeTBD/astrollama +
+
+
+
+
+ + ☆ A robust synthetic data generation framework for machine learning in + High-Resolution Transmission Electron Microscopy (HRTEM) + + +
+ Machine learning techniques are attractive options for developing +highly-accurate automated analysis tools for nanomaterials characterization, +including high-resolution transmission electron microscopy (HRTEM). However, +successfully implementing such machine learning tools can be difficult due to +the challenges in procuring sufficiently large, high-quality training datasets +from experiments. In this work, we introduce Construction Zone, a Python +package for rapidly generating complex nanoscale atomic structures, and develop +an end-to-end workflow for creating large simulated databases for training +neural networks. Construction Zone enables fast, systematic sampling of +realistic nanomaterial structures, and can be used as a random structure +generator for simulated databases, which is important for generating large, +diverse synthetic datasets. Using HRTEM imaging as an example, we train a +series of neural networks on various subsets of our simulated databases to +segment nanoparticles and holistically study the data curation process to +understand how various aspects of the curated simulated data -- including +simulation fidelity, the distribution of atomic structures, and the +distribution of imaging conditions -- affect model performance across several +experimental benchmarks. Using our results, we are able to achieve +state-of-the-art segmentation performance on experimental HRTEM images of +nanoparticles from several experimental benchmarks and, further, we discuss +robust strategies for consistently achieving high performance with machine +learning in experimental settings using purely synthetic data. + +
+
+
+
+
+ + ☆ Fidelity-Induced Interpretable Policy Extraction for Reinforcement + Learning + + +
+ Deep Reinforcement Learning (DRL) has achieved remarkable success in sequential decision-making problems. However, existing DRL agents make decisions in an opaque fashion, hindering the user from establishing trust and scrutinizing weaknesses of the agents. While recent research has developed Interpretable Policy Extraction (IPE) methods for explaining how an agent takes actions, their explanations are often inconsistent with the agent's behavior and thus frequently fail to explain it. To tackle this issue, we propose a novel method, Fidelity-Induced Policy Extraction (FIPE). Specifically, we start by analyzing the optimization mechanism of existing IPE methods, elaborating on the issue of ignoring consistency while increasing cumulative rewards. We then design a fidelity-induced mechanism by integrating a fidelity measurement into the reinforcement learning feedback. We conduct experiments in the complex control environment of StarCraft II, an arena typically avoided by current IPE methods. The experimental results demonstrate that FIPE outperforms the baselines in terms of interaction performance and consistency, while remaining easy to understand.
+
+ comment: 10 pages, 3 figures, 2 tables +
+
+
+
+
+ + ☆ A General Verification Framework for Dynamical and Control Models via + Certificate Synthesis + + +
+ An emerging branch of control theory specialises in certificate learning, +concerning the specification of a desired (possibly complex) system behaviour +for an autonomous or control model, which is then analytically verified by +means of a function-based proof. However, the synthesis of controllers abiding +by these complex requirements is in general a non-trivial task and may elude +the most expert control engineers. This results in a need for automatic +techniques that are able to design controllers and to analyse a wide range of +elaborate specifications. In this paper, we provide a general framework to +encode system specifications and define corresponding certificates, and we +present an automated approach to formally synthesise controllers and +certificates. Our approach contributes to the broad field of safe learning for +control, exploiting the flexibility of neural networks to provide candidate +control and certificate functions, whilst using SMT-solvers to offer a formal +guarantee of correctness. We test our framework by developing a prototype +software tool, and assess its efficacy at verification via control and +certificate synthesis over a large and varied suite of benchmarks. + +
+
+
+
+
+ + ☆ Measuring Catastrophic Forgetting in Cross-Lingual Transfer Paradigms: + Exploring Tuning Strategies + + +
+ The cross-lingual transfer is a promising technique to solve tasks in +less-resourced languages. In this empirical study, we compare two fine-tuning +approaches combined with zero-shot and full-shot learning approaches for large +language models in a cross-lingual setting. As fine-tuning strategies, we +compare parameter-efficient adapter methods with fine-tuning of all parameters. +As cross-lingual transfer strategies, we compare the intermediate-training +(\textit{IT}) that uses each language sequentially and cross-lingual validation +(\textit{CLV}) that uses a target language already in the validation phase of +fine-tuning. We assess the success of transfer and the extent of catastrophic +forgetting in a source language due to cross-lingual transfer, i.e., how much +previously acquired knowledge is lost when we learn new information in a +different language. The results on two different classification problems, hate +speech detection and product reviews, each containing datasets in several +languages, show that the \textit{IT} cross-lingual strategy outperforms +\textit{CLV} for the target language. Our findings indicate that, in the +majority of cases, the \textit{CLV} strategy demonstrates superior retention of +knowledge in the base language (English) compared to the \textit{IT} strategy, +when evaluating catastrophic forgetting in multiple cross-lingual transfers. + +
+
+
+
+
+ + ☆ Plasticity-Optimized Complementary Networks for Unsupervised Continual + Learning WACV2024 + + +
+ Continuous unsupervised representation learning (CURL) research has greatly +benefited from improvements in self-supervised learning (SSL) techniques. As a +result, existing CURL methods using SSL can learn high-quality representations +without any labels, but with a notable performance drop when learning on a +many-tasks data stream. We hypothesize that this is caused by the +regularization losses that are imposed to prevent forgetting, leading to a +suboptimal plasticity-stability trade-off: they either do not adapt fully to +the incoming data (low plasticity), or incur significant forgetting when +allowed to fully adapt to a new SSL pretext-task (low stability). In this work, +we propose to train an expert network that is relieved of the duty of keeping +the previous knowledge and can focus on performing optimally on the new tasks +(optimizing plasticity). In the second phase, we combine this new knowledge +with the previous network in an adaptation-retrospection phase to avoid +forgetting and initialize a new expert with the knowledge of the old network. +We perform several experiments showing that our proposed approach outperforms +other CURL exemplar-free methods in few- and many-task split settings. +Furthermore, we show how to adapt our approach to semi-supervised continual +learning (Semi-SCL) and show that we surpass the accuracy of other +exemplar-free Semi-SCL methods and reach the results of some others that use +exemplars. + +
+
+ comment: Accepted at WACV2024 +
+
+
+
+
+ + ☆ A Machine Learning Framework to Deconstruct the Primary Drivers for + Electricity Market Price Events + + +
+ Power grids are moving towards 100% renewable energy source bulk power grids, +and the overall dynamics of power system operations and electricity markets are +changing. The electricity markets are not only dispatching resources +economically but also taking into account various controllable actions like +renewable curtailment, transmission congestion mitigation, and energy storage +optimization to ensure grid reliability. As a result, price formations in +electricity markets have become quite complex. Traditional root cause analysis +and statistical approaches are rendered inapplicable to analyze and infer the +main drivers behind price formation in the modern grid and markets with +variable renewable energy (VRE). In this paper, we propose a machine +learning-based analysis framework to deconstruct the primary drivers for price +spike events in modern electricity markets with high renewable energy. The +outcomes can be utilized for various critical aspects of market design, +renewable dispatch and curtailment, operations, and cyber-security +applications. The framework can be applied to any ISO or market data; however, +in this paper, it is applied to open-source publicly available datasets from +California Independent System Operator (CAISO) and ISO New England (ISO-NE). + +
+
+ comment: Published in IEEE PES GM 2023 +
+
+
+
+
+ + ☆ Information Flow in Graph Neural Networks: A Clinical Triage Use Case + + +
+ Graph Neural Networks (GNNs) have gained popularity in healthcare and other +domains due to their ability to process multi-modal and multi-relational +graphs. However, efficient training of GNNs remains challenging, with several +open research questions. In this paper, we investigate how the flow of +embedding information within GNNs affects the prediction of links in Knowledge +Graphs (KGs). Specifically, we propose a mathematical model that decouples the +GNN connectivity from the connectivity of the graph data and evaluate the +performance of GNNs in a clinical triage use case. Our results demonstrate that +incorporating domain knowledge into the GNN connectivity leads to better +performance than using the same connectivity as the KG or allowing +unconstrained embedding propagation. Moreover, we show that negative edges play +a crucial role in achieving good predictions, and that using too many GNN +layers can degrade performance. + +
+
+
+
+
+ + ☆ A2V: A Semi-Supervised Domain Adaptation Framework for Brain Vessel + Segmentation via Two-Phase Training Angiography-to-Venography Translation + + +
+ We present a semi-supervised domain adaptation framework for brain vessel +segmentation from different image modalities. Existing state-of-the-art methods +focus on a single modality, despite the wide range of available cerebrovascular +imaging techniques. This can lead to significant distribution shifts that +negatively impact the generalization across modalities. By relying on annotated +angiographies and a limited number of annotated venographies, our framework +accomplishes image-to-image translation and semantic segmentation, leveraging a +disentangled and semantically rich latent space to represent heterogeneous data +and perform image-level adaptation from source to target domains. Moreover, we +reduce the typical complexity of cycle-based architectures and minimize the use +of adversarial training, which allows us to build an efficient and intuitive +model with stable training. We evaluate our method on magnetic resonance +angiographies and venographies. While achieving state-of-the-art performance in +the source domain, our method attains a Dice score coefficient in the target +domain that is only 8.9% lower, highlighting its promising potential for robust +cerebrovascular image segmentation across different modalities. + +
+
+
+
+
+ + ☆ Selection of contributing factors for predicting landslide + susceptibility using machine learning and deep learning models + + +
+ Landslides are a common natural disaster that can cause casualties, property safety threats and economic losses. Therefore, it is important to understand or predict the probability of landslide occurrence at potentially risky sites. A commonly used means is to carry out a landslide susceptibility assessment based on a landslide inventory and a set of landslide contributing factors. This can be readily achieved using machine learning (ML) models such as logistic regression (LR), support vector machine (SVM), random forest (RF), extreme gradient boosting (Xgboost), or deep learning (DL) models such as convolutional neural network (CNN) and long short-term memory (LSTM). As the input data for these models, landslide contributing factors have varying influences on landslide occurrence. Therefore, it is logically feasible to select more important contributing factors and eliminate less relevant ones, with the aim of increasing the prediction accuracy of these models. However, selecting more important factors is still a challenging task and there is no generally accepted method. Furthermore, the effects of factor selection using various methods on the prediction accuracy of ML and DL models are unclear. In this study, the impact of the selection of contributing factors on the accuracy of landslide susceptibility predictions using ML and DL models was investigated. Five methods for selecting contributing factors were considered for all the aforementioned ML and DL models: Information Gain Ratio (IGR), Recursive Feature Elimination (RFE), Particle Swarm Optimization (PSO), the Least Absolute Shrinkage and Selection Operator (LASSO) and Harris Hawk Optimization (HHO). In addition, autoencoder-based factor selection methods for DL models were also investigated. To assess their performances, an exhaustive approach was adopted,...
+
+ comment: Stochastic Environmental Research and Risk Assessment +
+
+
+
+
+ + ☆ Verifiable Fairness: Privacy-preserving Computation of Fairness for + Machine Learning Systems ESORICS'23 + + +
+ Fair machine learning is a thriving and vibrant research topic. In this +paper, we propose Fairness as a Service (FaaS), a secure, verifiable and +privacy-preserving protocol to compute and verify the fairness of any machine +learning (ML) model. In the design of FaaS, the data and outcomes are +represented through cryptograms to ensure privacy. Also, zero-knowledge proofs +guarantee the well-formedness of the cryptograms and underlying data. FaaS is +model-agnostic and can support various fairness metrics; hence, it can be used +as a service to audit the fairness of any ML model. Our solution requires no +trusted third party or private channels for the computation of the fairness +metric. The security guarantees and commitments are implemented in a way that +every step is securely transparent and verifiable from the start to the end of +the process. The cryptograms of all input data are publicly available for +everyone, e.g., auditors, social activists and experts, to verify the +correctness of the process. We implemented FaaS to investigate performance and +demonstrate the successful use of FaaS for a publicly available data set with +thousands of entries. +
+
+ comment: accepted in International Workshop on Private, Secure, and + Trustworthy AI (PriST-AI), ESORICS'23 workshop +
+
+
+
+
+ + ☆ How does representation impact in-context learning: A exploration on a + synthetic task + + +
+ In-context learning, i.e., learning from in-context samples, is an impressive +ability of Transformers. However, the mechanism driving in-context learning +is not yet fully understood. In this study, we investigate it from the +underexplored perspective of representation learning. The representation is +more complex in the in-context learning scenario, where it can be +impacted by both model weights and in-context samples. We refer to these two +conceptual aspects of representation as the in-weights component and the in-context +component, respectively. To study how the two components affect in-context +learning capabilities, we construct a novel synthetic task, making it possible +to devise two probes, an in-weights probe and an in-context probe, to evaluate the +two components, respectively. We demonstrate that the goodness of the in-context +component is highly related to the in-context learning performance, which +indicates the entanglement between in-context learning and representation +learning. Furthermore, we find that a good in-weights component can actually +benefit the learning of the in-context component, indicating that in-weights +learning should be the foundation of in-context learning. To further understand +the in-context learning mechanism and the importance of the in-weights +component, we prove by construction that a simple Transformer, which uses +pattern matching and a copy-paste mechanism to perform in-context learning, can +match the in-context learning performance of a more complex, best-tuned +Transformer under the assumption of a perfect in-weights component. In short, these +findings from the representation learning perspective shed light on new +approaches to improving in-context capacity. +
+
+
+
+
+ + ☆ A Perceptron-based Fine Approximation Technique for Linear Separation + + +
+ This paper presents a novel online learning method that aims at finding a +separator hyperplane between data points labelled as either positive or +negative. Since the weights and biases of artificial neurons can be directly +related to hyperplanes in high-dimensional spaces, the technique is applicable +to training perceptron-based binary classifiers in machine learning. For +large or imbalanced data sets, analytical or gradient-based solutions +can become prohibitive and impractical, whereas heuristics and approximation +techniques remain applicable. The proposed method is based on the Perceptron +algorithm; however, it tunes the neuron weights only to the extent necessary while +searching for the separator hyperplane. Thanks to an appropriate transformation of the +initial data set, neither the data labels nor the bias term need to be considered, +reducing separability to a one-class classification problem. The +presented method is proven to converge; empirical results show that it can be +more efficient than the Perceptron algorithm, especially when the size of the +data set exceeds the data dimensionality. +
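+ A rough NumPy sketch of the general setup (assumptions: the label/bias-free reduction is emulated by folding the +/-1 labels and a constant feature into the samples, and a plain perceptron update stands in for the paper's more economical tuning rule):
+import numpy as np
+
+def train_separator(X, y, epochs=100):
+    # Fold labels and bias in: a separator then only has to satisfy w . z > 0 for all z.
+    Z = np.hstack([X, np.ones((len(X), 1))]) * y[:, None]
+    w = np.zeros(Z.shape[1])
+    for _ in range(epochs):
+        updated = False
+        for z in Z:
+            if w @ z <= 0:      # touch w only when a sample is still misclassified
+                w += z
+                updated = True
+        if not updated:         # every sample on the positive side: converged
+            break
+    return w
+
+rng = np.random.default_rng(1)
+pos = rng.normal(loc=[4.0, 4.0], scale=0.7, size=(100, 2))
+neg = rng.normal(loc=[0.0, 0.0], scale=0.7, size=(100, 2))
+X = np.vstack([pos, neg])
+y = np.array([1.0] * 100 + [-1.0] * 100)
+print("weights (last entry acts as the bias):", train_separator(X, y))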
+
+ comment: 12 pages, 5 figures +
+
+
+
+
+ + ☆ BatMan-CLR: Making Few-shots Meta-Learners Resilient Against Label Noise + + +
+ The negative impact of label noise is well studied in classical supervised +learning yet remains an open research question in meta-learning. Meta-learners +aim to adapt to unseen learning tasks by learning a good initial model in +meta-training and subsequently fine-tuning it on new tasks during +meta-testing. In this paper, we present the first extensive analysis of the +impact of varying levels of label noise on the performance of state-of-the-art +meta-learners, specifically gradient-based $N$-way $K$-shot learners. We show +that the accuracy of Reptile, iMAML, and foMAML drops by up to 42% on the +Omniglot and CifarFS datasets when meta-training is affected by label noise. To +strengthen the resilience against label noise, we propose two sampling +techniques, namely manifold (Man) and batch manifold (BatMan), which transform +the noisy supervised learners into semi-supervised ones to increase the utility +of noisy labels. We first construct manifold samples of $N$-way +$2$-contrastive-shot tasks through augmentation, learning the embedding via a +contrastive loss in meta-training, and then perform classification through +zeroing on the embedding in meta-testing. We show that our approach can +effectively mitigate the impact of meta-training label noise. Even with 60% +wrong labels, BatMan and Man can limit the meta-testing accuracy drop to +2.5, 9.4, and 1.1 percentage points, respectively, with existing +meta-learners across the Omniglot, CifarFS, and MiniImagenet datasets. +
+
+ comment: 10 pages,3 figures +
+
+
+
+
+ + ☆ Normality Learning-based Graph Anomaly Detection via Multi-Scale + Contrastive Learning ACM MM 2023 + + +
+ Graph anomaly detection (GAD) has attracted increasing attention in machine +learning and data mining. Recent works have mainly focused on how to capture +richer information to improve the quality of node embeddings for GAD. Despite +their significant advances in detection performance, there is still a relative +dearth of research on the properties of the task. GAD aims to discern the +anomalies that deviate from most nodes. However, the model is prone to learn +the pattern of normal samples which make up the majority of samples. Meanwhile, +anomalies can be easily detected when their behaviors differ from normality. +Therefore, the performance can be further improved by enhancing the ability to +learn the normal pattern. To this end, we propose a normality learning-based +GAD framework via multi-scale contrastive learning networks (NLGAD for +abbreviation). Specifically, we first initialize the model with the contrastive +networks on different scales. To provide sufficient and reliable normal nodes +for normality learning, we design an effective hybrid strategy for normality +selection. Finally, the model is refined with the only input of reliable normal +nodes and learns a more accurate estimate of normality so that anomalous nodes +can be more easily distinguished. Eventually, extensive experiments on six +benchmark graph datasets demonstrate the effectiveness of our normality +learning-based scheme on GAD. Notably, the proposed algorithm improves the +detection performance (up to 5.89% AUC gain) compared with the state-of-the-art +methods. The source code is released at https://github.com/FelixDJC/NLGAD. + +
+
+ comment: 10 pages, 7 figures, accepted by ACM MM 2023 +
+
+
+
+
+ + ☆ Energy-Aware Federated Learning with Distributed User Sampling and + Multichannel ALOHA + + +
+ Distributed learning on edge devices has attracted increased attention with +the advent of federated learning (FL). Notably, edge devices often have limited +battery and heterogeneous energy availability, while multiple rounds are +required in FL for convergence, intensifying the need for energy efficiency. +Energy depletion may hinder the training process and the efficient utilization +of the trained model. To solve these problems, this letter considers the +integration of energy harvesting (EH) devices into a FL network with +multi-channel ALOHA, while proposing a method to ensure both low energy outage +probability and successful execution of future tasks. Numerical results +demonstrate the effectiveness of this method, particularly in critical setups +where the average energy income fails to cover the iteration cost. The method +outperforms a norm based solution in terms of convergence time and battery +level. + +
+
+
+
+
+ + ☆ Emergent Communication in Multi-Agent Reinforcement Learning for Future + Wireless Networks + + +
+ In different wireless network scenarios, multiple network entities need to +cooperate in order to achieve a common task with minimum delay and energy +consumption. Future wireless networks mandate exchanging high dimensional data +in dynamic and uncertain environments, therefore implementing communication +control tasks becomes challenging and highly complex. Multi-agent reinforcement +learning with emergent communication (EC-MARL) is a promising solution to +address high dimensional continuous control problems with partially observable +states in a cooperative fashion where agents build an emergent communication +protocol to solve complex tasks. This paper articulates the importance of +EC-MARL within the context of future 6G wireless networks, which imbues +autonomous decision-making capabilities into network entities to solve complex +tasks such as autonomous driving, robot navigation, flying base stations +network planning, and smart city applications. An overview of EC-MARL +algorithms and their design criteria are provided while presenting use cases +and research opportunities on this emerging topic. + +
+
+
+
+
+ + ☆ Interpolation, Approximation and Controllability of Deep Neural Networks + + +
+ We investigate the expressive power of deep residual neural networks +idealized as continuous dynamical systems through control theory. Specifically, +we consider two properties that arise from supervised learning, namely +universal interpolation - the ability to match arbitrary input and target +training samples - and the closely related notion of universal approximation - +the ability to approximate input-target functional relationships via flow maps. +Under the assumption of affine invariance of the control family, we give a +characterisation of universal interpolation, showing that it holds for +essentially any architecture with non-linearity. Furthermore, we elucidate the +relationship between universal interpolation and universal approximation in the +context of general control systems, showing that the two properties cannot be +deduced from each other. At the same time, we identify conditions on the +control family and the target function that ensures the equivalence of the two +notions. + +
+
+
+
+
+ + ☆ ATTA: Anomaly-aware Test-Time Adaptation for Out-of-Distribution + Detection in Segmentation + + +
+ Recent advancements in dense out-of-distribution (OOD) detection have +primarily focused on scenarios where the training and testing datasets share a +similar domain, with the assumption that no domain shift exists between them. +However, in real-world situations, domain shift often exists and significantly +affects the accuracy of existing OOD detection models. In +this work, we propose a dual-level OOD detection framework to handle domain +shift and semantic shift jointly. The first level distinguishes whether domain +shift exists in the image by leveraging global low-level features, while the +second level identifies pixels with semantic shift by utilizing dense +high-level feature maps. In this way, we can selectively adapt the model to +unseen domains as well as enhance the model's capacity in detecting novel classes. +We validate the efficacy of our proposed method on several OOD segmentation +benchmarks, including those with significant domain shifts and those without, +observing consistent performance improvements across various baseline models. +
+
+ comment: In submission +
+
+
+
+
+ + ☆ Learning Unbiased News Article Representations: A Knowledge-Infused + Approach + + +
+ Quantification of the political leaning of online news articles can aid in +understanding the dynamics of political ideology in social groups and in devising +measures to mitigate them. However, accurately predicting the political leaning of a +news article with machine learning models is a challenging task. This is because +(i) the political ideology of a news article is defined by several factors, and +(ii) existing learning models tend to absorb the +political bias of the news publisher during model training. Only a +limited number of methods study the political leaning of news articles, and they +do not account for this algorithmic political bias, which lowers the +generalization of machine learning models when predicting the political leaning of +news articles published by new, unseen publishers. In this work, we propose a +knowledge-infused deep learning model that utilizes relatively reliable +external data resources to learn unbiased representations of news articles +using their global and local contexts. We evaluate the proposed model by +setting up the data in such a way that news domains or news publishers in the test +set are completely unseen during the training phase. With this setup, we show +that the proposed model mitigates algorithmic political bias and outperforms +baseline methods in predicting the political leaning of news articles with up to +73% accuracy. +
+
+
+
+
+ + ☆ CleanUNet 2: A Hybrid Speech Denoising Model on Waveform and Spectrogram INTERSPEECH 2023 + + +
+ In this work, we present CleanUNet 2, a speech denoising model that combines +the advantages of waveform denoiser and spectrogram denoiser and achieves the +best of both worlds. CleanUNet 2 uses a two-stage framework inspired by popular +speech synthesis methods that consist of a waveform model and a spectrogram +model. Specifically, CleanUNet 2 builds upon CleanUNet, the state-of-the-art +waveform denoiser, and further boosts its performance by taking predicted +spectrograms from a spectrogram denoiser as the input. We demonstrate that +CleanUNet 2 outperforms previous methods in terms of various objective and +subjective evaluations. + +
+
+ comment: INTERSPEECH 2023 +
+
+
+
+
+ + ☆ Circuit Breaking: Removing Model Behaviors with Targeted Ablation + + +
+ Language models often exhibit behaviors that improve performance on a +pre-training objective but harm performance on downstream tasks. We propose a +novel approach to removing undesirable behaviors by ablating a small number of +causal pathways between model components, with the intention of disabling the +computational circuit responsible for the bad behavior. Given a small dataset +of inputs where the model behaves poorly, we learn to ablate a small number of +important causal pathways. In the setting of reducing GPT-2 toxic language +generation, we find ablating just 12 of the 11.6K causal edges mitigates toxic +generation with minimal degradation of performance on other inputs. + +
+
+
+
+
+ + ☆ Neural Network Layer Matrix Decomposition reveals Latent Manifold + Encoding and Memory Capacity + + +
+ We prove the converse of the universal approximation theorem, i.e. a neural +network (NN) encoding theorem which shows that for every stably converged NN of +continuous activation functions, its weight matrix actually encodes a +continuous function that approximates its training dataset to within a finite +margin of error over a bounded domain. We further show that using the +Eckart-Young theorem for truncated singular value decomposition of the weight +matrix for every NN layer, we can illuminate the nature of the latent space +manifold of the training dataset encoded and represented by every NN layer, and +the geometric nature of the mathematical operations performed by each NN layer. +Our results have implications for understanding how NNs break the curse of +dimensionality by harnessing memory capacity for expressivity, and that the two +are complementary. This Layer Matrix Decomposition (LMD) further suggests a +close relationship between eigen-decomposition of NN layers and the latest +advances in conceptualizations of Hopfield networks and Transformer NN models. + +
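+ For intuition only (not the paper's code), the Eckart-Young step described above can be sketched by truncating the SVD of a stand-in weight matrix and checking how much of the layer's action is retained; the random matrix is an assumption in place of trained weights.
+import numpy as np
+
+def truncated_layer(W, rank):
+    # Eckart-Young: the truncated SVD is the best rank-`rank` approximation of W.
+    U, s, Vt = np.linalg.svd(W, full_matrices=False)
+    W_r = U[:, :rank] @ np.diag(s[:rank]) @ Vt[:rank, :]
+    energy = (s[:rank] ** 2).sum() / (s ** 2).sum()   # Frobenius "energy" retained
+    return W_r, energy
+
+W = np.random.default_rng(2).normal(size=(256, 512))  # stand-in for a trained layer
+for r in (8, 32, 128):
+    W_r, kept = truncated_layer(W, r)
+    err = np.linalg.norm(W - W_r) / np.linalg.norm(W)
+    print(f"rank {r}: relative error {err:.3f}, energy kept {kept:.2f}")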
+
+
+
+
+ + ☆ Evaluating the Ebb and Flow: An In-depth Analysis of Question-Answering + Trends across Diverse Platforms + + +
+ Community Question Answering (CQA) platforms steadily gain popularity as they +provide users with fast responses to their queries. The swiftness of these +responses is contingent on a mixture of query-specific and user-related +elements. This paper scrutinizes these contributing factors within the context +of six highly popular CQA platforms, identified through their standout +answering speed. Our investigation reveals a correlation between the time taken +to yield the first response to a question and several variables: the metadata, +the formulation of the questions, and the level of interaction among users. +Additionally, by employing conventional machine learning models to analyze +these metadata and patterns of user interaction, we endeavor to predict which +queries will receive their initial responses promptly. + +
+
+ comment: Under review +
+
+
+
+
+ + ☆ GLAD: Content-aware Dynamic Graphs For Log Anomaly Detection + + +
+ Logs play a crucial role in system monitoring and debugging by recording +valuable system information, including events and states. Although various +methods have been proposed to detect anomalies in log sequences, they often +overlook the significance of considering relations among system components, +such as services and users, which can be identified from log contents. +Understanding these relations is vital for detecting anomalies and their +underlying causes. To address this issue, we introduce GLAD, a Graph-based Log +Anomaly Detection framework designed to detect relational anomalies in system +logs. GLAD incorporates log semantics, relational patterns, and sequential +patterns into a unified framework for anomaly detection. Specifically, GLAD +first introduces a field extraction module that utilizes prompt-based few-shot +learning to identify essential fields from log contents. Then GLAD constructs +dynamic log graphs for sliding windows by interconnecting extracted fields and +log events parsed from the log parser. These graphs represent events and fields +as nodes and their relations as edges. Subsequently, GLAD utilizes a +temporal-attentive graph edge anomaly detection model for identifying anomalous +relations in these dynamic log graphs. This model employs a Graph Neural +Network (GNN)-based encoder enhanced with transformers to capture content, +structural and temporal features. We evaluate our proposed method on three +datasets, and the results demonstrate the effectiveness of GLAD in detecting +anomalies indicated by varying relational patterns. + +
+
+ comment: Accepted by ICKG 2023 +
+
+
+
+
+ + ☆ Language Models as Black-Box Optimizers for Vision-Language Models + + +
+ Vision-language models (VLMs) pre-trained on web-scale datasets have +demonstrated remarkable capabilities across a variety of vision and multimodal +tasks. Currently, fine-tuning methods for VLMs mainly operate in a white-box +setting, requiring access to model parameters for backpropagation. However, +many VLMs rely on proprietary data and are not open-source, which restricts the +use of white-box approaches for fine-tuning. Given that popular private large +language models (LLMs) like ChatGPT still offer a language-based user +interface, we aim to develop a novel fine-tuning approach for VLMs through +natural language prompts, thereby avoiding the need to access model parameters, +feature embeddings, or output logits. In this setup, we propose employing +chat-based LLMs as black-box optimizers to search for the best text prompt on +the illustrative task of few-shot image classification using CLIP. +Specifically, we adopt an automatic "hill-climbing" procedure that converges on +an effective prompt by evaluating the accuracy of current prompts and asking +LLMs to refine them based on textual feedback, all within a conversational +process without human-in-the-loop. In a challenging 1-shot learning setup, our +simple approach surpasses the white-box continuous prompting method CoOp by an +average of 1.5% across 11 datasets including ImageNet. Our approach also +outperforms OpenAI's manually crafted prompts and is more efficient than other +black-box methods like iterative APE. Additionally, we highlight the advantage +of conversational feedback incorporating both positive and negative prompts, +suggesting that LLMs can utilize the implicit "gradient" direction in textual +feedback for a more efficient search. Lastly, we find that the text prompts +generated through our strategy are not only more interpretable but also +transfer well across different CLIP architectures in a black-box manner. + +
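+ The conversational "hill-climbing" loop described above might look roughly like the following skeleton; clip_accuracy and ask_llm are placeholders for a CLIP few-shot evaluator and a chat-LLM call (both assumptions, not a real API), and only the search logic is shown.
+from typing import Callable, List, Tuple
+
+def hill_climb_prompts(seed_prompts: List[str],
+                       clip_accuracy: Callable[[str], float],
+                       ask_llm: Callable[[List[Tuple[str, float]]], List[str]],
+                       rounds: int = 10, keep: int = 4) -> Tuple[str, float]:
+    pool = [(p, clip_accuracy(p)) for p in seed_prompts]
+    for _ in range(rounds):
+        pool.sort(key=lambda pa: pa[1], reverse=True)
+        pool = pool[:keep]                         # keep the best prompts found so far
+        candidates = ask_llm(pool)                 # LLM proposes refinements from the scores
+        pool += [(p, clip_accuracy(p)) for p in candidates]
+    return max(pool, key=lambda pa: pa[1])
+
+# Toy stand-ins so the loop can be exercised without any external service:
+demo_score = lambda p: len(set(p.lower().split()) & {"a", "photo", "of"}) / 3.0
+demo_llm = lambda pool: [pool[0][0] + " photo"]
+print(hill_climb_prompts(["a picture", "an image of a"], demo_score, demo_llm, rounds=3))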
+
+
+
+
+ + ☆ Frequency-Aware Masked Autoencoders for Multimodal Pretraining on + Biosignals + + +
+ Leveraging multimodal information from biosignals is vital for building a +comprehensive representation of people's physical and mental states. However, +multimodal biosignals often exhibit substantial distributional shifts between +pretraining and inference datasets, stemming from changes in task specification +or variations in modality compositions. To achieve effective pretraining in the +presence of potential distributional shifts, we propose a frequency-aware +masked autoencoder ($\texttt{bio}$FAME) that learns to parameterize the +representation of biosignals in the frequency space. $\texttt{bio}$FAME +incorporates a frequency-aware transformer, which leverages a fixed-size +Fourier-based operator for global token mixing, independent of the length and +sampling rate of inputs. To maintain the frequency components within each input +channel, we further employ a frequency-maintain pretraining strategy that +performs masked autoencoding in the latent space. The resulting architecture +effectively utilizes multimodal information during pretraining, and can be +seamlessly adapted to diverse tasks and modalities at test time, regardless of +input size and order. We evaluated our approach on a diverse set of transfer +experiments on unimodal time series, achieving an average of $\uparrow$5.5% +improvement in classification accuracy over the previous state-of-the-art. +Furthermore, we demonstrated that our architecture is robust in modality +mismatch scenarios, including unpredicted modality dropout or substitution, +proving its practical utility in real-world applications. Code will be +available soon. + +
+
+
+
+
+ + ☆ On Regularized Sparse Logistic Regression ICDM2023 + + +
+ Sparse logistic regression aims to perform classification and feature +selection simultaneously for high-dimensional data. Although many studies have +been done to solve $\ell_1$-regularized logistic regression, there is no +equivalently abundant literature about solving sparse logistic regression +associated with nonconvex penalties. In this paper, we propose to solve +$\ell_1$-regularized sparse logistic regression and some nonconvex +penalties-regularized sparse logistic regression, when the nonconvex penalties +satisfy some prerequisites, with similar optimization frameworks. In the +proposed optimization frameworks, we utilize different line search criteria to +guarantee good convergence performance for different regularization terms. +Empirical experiments on binary classification tasks with real-world datasets +demonstrate our proposed algorithms are capable of performing classification +and feature selection effectively with a lower computational cost. + +
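+ For the $\ell_1$ case mentioned above, a standard proximal-gradient (soft-thresholding) solver gives the flavor of such an optimization framework; the fixed step size below stands in for the paper's line-search criteria, and the data are synthetic.
+import numpy as np
+
+def soft_threshold(v, t):
+    return np.sign(v) * np.maximum(np.abs(v) - t, 0.0)
+
+def l1_logistic(X, y, lam=0.05, iters=500):
+    n, d = X.shape
+    step = 1.0 / (0.25 * np.linalg.norm(X, 2) ** 2 / n)   # 1/L for the logistic loss
+    w = np.zeros(d)
+    for _ in range(iters):
+        p = 1.0 / (1.0 + np.exp(-X @ w))
+        grad = X.T @ (p - y) / n
+        w = soft_threshold(w - step * grad, step * lam)    # proximal step = soft-thresholding
+    return w
+
+rng = np.random.default_rng(3)
+X = rng.normal(size=(300, 50))
+true_w = np.zeros(50)
+true_w[:5] = 2.0
+y = (rng.random(300) < 1.0 / (1.0 + np.exp(-X @ true_w))).astype(float)
+print("selected features:", np.flatnonzero(np.abs(l1_logistic(X, y)) > 1e-6))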
+
+ comment: Accepted to ICDM2023 +
+
+
+
+
+ + ☆ ACT: Empowering Decision Transformer with Dynamic Programming via + Advantage Conditioning + + +
+ Decision Transformer (DT), which employs expressive sequence modeling +techniques to perform action generation, has emerged as a promising approach to +offline policy optimization. However, DT generates actions conditioned on a +desired future return, which is known to bear some weaknesses such as the +susceptibility to environmental stochasticity. To overcome DT's weaknesses, we +propose to empower DT with dynamic programming. Our method comprises three +steps. First, we employ in-sample value iteration to obtain approximated value +functions, which involves dynamic programming over the MDP structure. Second, +we evaluate action quality in context with estimated advantages. We introduce +two types of advantage estimators, IAE and GAE, which are suitable for +different tasks. Third, we train an Advantage-Conditioned Transformer (ACT) to +generate actions conditioned on the estimated advantages. Finally, during +testing, ACT generates actions conditioned on a desired advantage. Our +evaluation results validate that, by leveraging the power of dynamic +programming, ACT demonstrates effective trajectory stitching and robust action +generation in spite of the environmental stochasticity, outperforming baseline +methods across various benchmarks. Additionally, we conduct an in-depth +analysis of ACT's various design choices through ablation studies. + +
+
+
+
+
+ + ☆ Adversarial Attacks Assessment of Salient Object Detection via Symbolic + Learning + + +
+ Machine learning is at the center of mainstream technology and outperforms +classical approaches to handcrafted feature design. Aside from its learning +process for artificial feature extraction, it has an end-to-end paradigm from +input to output, reaching outstandingly accurate results. However, security +concerns about its robustness to malicious and imperceptible perturbations have +drawn attention since its prediction can be changed entirely. Salient object +detection is a research area where deep convolutional neural networks have +proven effective but whose trustworthiness represents a significant issue +requiring analysis and solutions to hackers' attacks. Brain programming is a +kind of symbolic learning in the vein of good old-fashioned artificial +intelligence. This work provides evidence that symbolic learning robustness is +crucial in designing reliable visual attention systems since it can withstand +even the most intense perturbations. We test this evolutionary computation +methodology against several adversarial attacks and noise perturbations using +standard databases and a real-world problem of a shorebird called the Snowy +Plover portraying a visual attention task. We compare our methodology with five +different deep learning approaches, proving that they do not match the symbolic +paradigm regarding robustness. All neural networks suffer significant +performance losses, while brain programming stands its ground and remains +unaffected. Also, by studying the Snowy Plover, we remark on the importance of +security in surveillance activities regarding wildlife protection and +conservation. + +
+
+ comment: 14 pages, 8 figures, 6 tables, IEEE Transactions on Emerging Topics + in Computing, Accepted for publication +
+
+
+
+
+ + ☆ Hierarchical Conditional Semi-Paired Image-to-Image Translation For + Multi-Task Image Defect Correction On Shopping Websites ICIP 2023 + + +
+ On shopping websites, product images of low quality negatively affect +customer experience. Although there is plenty of work on detecting images with +different defects, few efforts have been dedicated to correcting those defects at +scale. A major challenge is that there are thousands of product types and each +has specific defects; therefore, building defect-specific models is not scalable. +In this paper, we propose a unified Image-to-Image (I2I) translation model to +correct multiple defects across different product types. Our model leverages an +attention mechanism to hierarchically incorporate high-level defect groups and +specific defect types to guide the network to focus on defect-related image +regions. Evaluated on eight public datasets, our model reduces the Frechet +Inception Distance (FID) by 24.6% on average compared with MoNCE, the +state-of-the-art I2I method. Unlike public data, another practical challenge on +shopping websites is that some paired images are of low quality. Therefore, we +design our model to be semi-paired by combining the L1 loss of paired data with +the cycle loss of unpaired data. Tested on a shopping website dataset to +correct three image defects, our model reduces FID by 63.2% on average +compared with WS-I2I, the state-of-the-art semi-paired I2I method. +
+
+ comment: 6 pages, 6 figures, 3 tables. To be published in ICIP 2023 +
+
+
+
+
+ + ☆ Generalized Attacks on Face Verification Systems + + +
+ Face verification (FV) using deep neural network models has made tremendous +progress in recent years, surpassing human accuracy and seeing deployment in +various applications such as border control and smartphone unlocking. However, +FV systems are vulnerable to Adversarial Attacks, which manipulate input images +to deceive these systems in ways usually unnoticeable to humans. This paper +provides an in-depth study of attacks on FV systems. We introduce the +DodgePersonation Attack that formulates the creation of face images that +impersonate a set of given identities while avoiding being identified as any of +the identities in a separate, disjoint set. A taxonomy is proposed to provide a +unified view of different types of Adversarial Attacks against FV systems, +including Dodging Attacks, Impersonation Attacks, and Master Face Attacks. +Finally, we propose the ''One Face to Rule Them All'' Attack which implements +the DodgePersonation Attack with state-of-the-art performance on a well-known +scenario (Master Face Attack) and which can also be used for the new scenarios +introduced in this paper. While the state-of-the-art Master Face Attack can +produce a set of 9 images to cover 43.82% of the identities in their test +database, with 9 images our attack can cover 57.27% to 58.5% of these +identities while giving the attacker the choice of the identity to use to +create the impersonation. Moreover, the 9 generated attack images appear +identical to a casual observer. +
+
+
+
+
+ + ☆ Bregman Graph Neural Network + + +
+ Numerous recent research on graph neural networks (GNNs) has focused on +formulating GNN architectures as an optimization problem with the smoothness +assumption. However, in node classification tasks, the smoothing effect induced +by GNNs tends to assimilate representations and over-homogenize labels of +connected nodes, leading to adverse effects such as over-smoothing and +misclassification. In this paper, we propose a novel bilevel optimization +framework for GNNs inspired by the notion of Bregman distance. We demonstrate +that the GNN layer proposed accordingly can effectively mitigate the +over-smoothing issue by introducing a mechanism reminiscent of the "skip +connection". We validate our theoretical results through comprehensive +empirical studies in which Bregman-enhanced GNNs outperform their original +counterparts in both homophilic and heterophilic graphs. Furthermore, our +experiments also show that Bregman GNNs can produce more robust learning +accuracy even when the number of layers is high, suggesting the effectiveness +of the proposed method in alleviating the over-smoothing issue. + +
+
+
+
+
+ + ☆ Adapt and Diffuse: Sample-adaptive Reconstruction via Latent Diffusion + Models + + +
+ Inverse problems arise in a multitude of applications, where the goal is to +recover a clean signal from noisy and possibly (non)linear observations. The +difficulty of a reconstruction problem depends on multiple factors, such as the +structure of the ground truth signal, the severity of the degradation, the +implicit bias of the reconstruction model and the complex interactions between +the above factors. This results in natural sample-by-sample variation in the +difficulty of a reconstruction task, which is often overlooked by contemporary +techniques. Recently, diffusion-based inverse problem solvers have established +new state-of-the-art in various reconstruction tasks. However, they have the +drawback of being computationally prohibitive. Our key observation in this +paper is that most existing solvers lack the ability to adapt their compute +power to the difficulty of the reconstruction task, resulting in long inference +times, subpar performance and wasteful resource allocation. We propose a novel +method that we call severity encoding, to estimate the degradation severity of +noisy, degraded signals in the latent space of an autoencoder. We show that the +estimated severity has strong correlation with the true corruption level and +can give useful hints at the difficulty of reconstruction problems on a +sample-by-sample basis. Furthermore, we propose a reconstruction method based +on latent diffusion models that leverages the predicted degradation severities +to fine-tune the reverse diffusion sampling trajectory and thus achieve +sample-adaptive inference times. We utilize latent diffusion posterior sampling +to maintain data consistency with observations. We perform experiments on both +linear and nonlinear inverse problems and demonstrate that our technique +achieves performance comparable to state-of-the-art diffusion-based techniques, +with significant improvements in computational efficiency. + +
+
+ comment: 14 pages, 6 figures, preliminary version +
+
+
+
+
+ + ☆ Quantum Data Center: Perspectives + + +
+ A quantum version of data centers might be significant in the quantum era. In +this paper, we introduce Quantum Data Center (QDC), a quantum version of +existing classical data centers, with a specific emphasis on combining Quantum +Random Access Memory (QRAM) and quantum networks. We argue that QDC will +provide significant benefits to customers in terms of efficiency, security, and +precision, and will be helpful for quantum computing, communication, and +sensing. We investigate potential scientific and business opportunities along +this novel research direction through hardware realization and possible +specific applications. We show the possible impacts of QDCs in business and +science, especially the machine learning and big data industries. + +
+
+ comment: 9 pages, many figures. This is a perspective paper introducing the + ideas and impacts of quantum data centers in arXiv:2207.14336 +
+
+
+
+
+ + ☆ $G$-Mapper: Learning a Cover in the Mapper Construction + + +
+ The Mapper algorithm is a visualization technique in topological data +analysis (TDA) that outputs a graph reflecting the structure of a given +dataset. The Mapper algorithm requires tuning several parameters in order to +generate a "nice" Mapper graph. The paper focuses on selecting the cover +parameter. We present an algorithm that optimizes the cover of a Mapper graph +by splitting a cover repeatedly according to a statistical test for normality. +Our algorithm is based on $G$-means clustering which searches for the optimal +number of clusters in $k$-means by conducting iteratively the Anderson-Darling +test. Our splitting procedure employs a Gaussian mixture model in order to +choose carefully the cover based on the distribution of a given data. +Experiments for synthetic and real-world datasets demonstrate that our +algorithm generates covers so that the Mapper graphs retain the essence of the +datasets. + +
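+ A toy version of the splitting decision described above (a G-means-style check, not the authors' implementation) could look like this: a cover element is split only if its points, projected onto the direction separating two candidate sub-clusters, fail an Anderson-Darling normality test.
+import numpy as np
+from scipy.stats import anderson
+from sklearn.cluster import KMeans
+
+def should_split(points, level_index=2):        # index 2 ~ the 5% level in scipy's table
+    km = KMeans(n_clusters=2, n_init=10, random_state=0).fit(points)
+    direction = km.cluster_centers_[1] - km.cluster_centers_[0]
+    proj = points @ direction / np.linalg.norm(direction)
+    result = anderson(proj, dist="norm")
+    return result.statistic > result.critical_values[level_index]
+
+rng = np.random.default_rng(6)
+one_blob = rng.normal(size=(300, 2))
+two_blobs = np.vstack([rng.normal(size=(150, 2)), rng.normal(loc=4.0, size=(150, 2))])
+print(should_split(one_blob), should_split(two_blobs))   # expected: False True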
+
+
+
+
+ + ☆ Epistemic Modeling Uncertainty of Rapid Neural Network Ensembles for + Adaptive Learning + + +
+ Emulator embedded neural networks, which are a type of physics informed +neural network, leverage multi-fidelity data sources for efficient design +exploration of aerospace engineering systems. Multiple realizations of the +neural network models are trained with different random initializations. The +ensemble of model realizations is used to assess epistemic modeling uncertainty +caused due to lack of training samples. This uncertainty estimation is crucial +information for successful goal-oriented adaptive learning in an aerospace +system design exploration. However, the costs of training the ensemble models +often become prohibitive and pose a computational challenge, especially when +the models are not trained in parallel during adaptive learning. In this work, +a new type of emulator embedded neural network is presented using the rapid +neural network paradigm. Unlike the conventional neural network training that +optimizes the weights and biases of all the network layers by using +gradient-based backpropagation, rapid neural network training adjusts only the +last layer connection weights by applying a linear regression technique. It is +found that the proposed emulator embedded neural network trains +near-instantaneously, typically without loss of prediction accuracy. The +proposed method is demonstrated on multiple analytical examples, as well as an +aerospace flight parameter study of a generic hypersonic vehicle. + +
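+ A generic stand-in for the rapid-training idea summarized above (an extreme-learning-machine-style sketch, not the authors' emulator-embedded network): the hidden layer stays at its random initialization and only the last-layer weights are fit by ridge regression, so no backpropagation is run.
+import numpy as np
+
+def rapid_train(X, y, hidden=256, ridge=1e-3, seed=0):
+    rng = np.random.default_rng(seed)
+    W = rng.normal(scale=1.0 / np.sqrt(X.shape[1]), size=(X.shape[1], hidden))
+    b = rng.normal(size=hidden)
+    H = np.tanh(X @ W + b)                       # frozen random hidden features
+    beta = np.linalg.solve(H.T @ H + ridge * np.eye(hidden), H.T @ y)
+    return lambda Xn: np.tanh(Xn @ W + b) @ beta
+
+rng = np.random.default_rng(7)
+X = rng.uniform(-1.0, 1.0, size=(400, 3))
+y = np.sin(3.0 * X[:, 0]) + X[:, 1] ** 2
+model = rapid_train(X, y)
+print("train RMSE:", np.sqrt(np.mean((model(X) - y) ** 2)))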
+
+
+
+
+ + ☆ A Sequentially Fair Mechanism for Multiple Sensitive Attributes + + +
+ In the standard use case of Algorithmic Fairness, the goal is to eliminate +the relationship between a sensitive variable and a corresponding score. In +recent years, the scientific community has developed a host of +definitions and tools to solve this task, which work well in many practical +applications. However, the applicability and effectiveness of these tools and +definitions become less straightforward in the case of multiple sensitive +attributes. To tackle this issue, we propose a sequential framework, which +allows fairness to be achieved progressively across a set of sensitive features. We +accomplish this by leveraging multi-marginal Wasserstein barycenters, which +extend the standard notion of Strong Demographic Parity to the case with +multiple sensitive characteristics. This method also provides a closed-form +solution for the optimal, sequentially fair predictor, permitting a clear +interpretation of inter-sensitive feature correlations. Our approach seamlessly +extends to approximate fairness, enveloping a framework accommodating the +trade-off between risk and unfairness. This extension permits a targeted +prioritization of fairness improvements for a specific attribute within a set +of sensitive attributes, allowing for case-specific adaptation. A data-driven +estimation procedure for the derived solution is developed, and comprehensive +numerical experiments are conducted on both synthetic and real datasets. Our +empirical findings decisively underscore the practical efficacy of our +post-processing approach in fostering fair decision-making. +
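+ A heavily simplified one-dimensional sketch of the sequential repair idea (an illustration under stated assumptions, not the paper's multi-marginal construction): for each sensitive attribute in turn, every group's scores are mapped onto a barycenter quantile function, here taken as the group-size-weighted average of the group quantile functions.
+import numpy as np
+
+def repair_one_attribute(scores, groups, grid_size=101):
+    grid = np.linspace(0.0, 1.0, grid_size)
+    uniq, counts = np.unique(groups, return_counts=True)
+    weights = counts / counts.sum()
+    # 1-D Wasserstein barycenter: weighted average of the group quantile functions.
+    bary_q = sum(w * np.quantile(scores[groups == g], grid) for g, w in zip(uniq, weights))
+    out = np.empty_like(scores, dtype=float)
+    for g in uniq:
+        idx = groups == g
+        ranks = (np.argsort(np.argsort(scores[idx])) + 0.5) / idx.sum()
+        out[idx] = np.interp(ranks, grid, bary_q)   # push each group onto the barycenter
+    return out
+
+rng = np.random.default_rng(5)
+gender, age = rng.integers(0, 2, 1000), rng.integers(0, 3, 1000)
+scores = rng.normal(size=1000) + 0.8 * gender - 0.4 * age    # inject bias w.r.t. both
+for attr in (gender, age):                                    # repair sequentially
+    scores = repair_one_attribute(scores, attr)
+print("mean gap across gender:", abs(scores[gender == 0].mean() - scores[gender == 1].mean()))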
+
+
+
+
+ + ☆ Accelerating Deep Neural Networks via Semi-Structured Activation + Sparsity + + +
+ The demand for efficient processing of deep neural networks (DNNs) on +embedded devices is a significant challenge limiting their deployment. +Exploiting sparsity in the network's feature maps is one of the ways to reduce +its inference latency. It is known that unstructured sparsity results in lower +accuracy degradation with respect to structured sparsity but the former needs +extensive inference engine changes to get latency benefits. To tackle this +challenge, we propose a solution to induce semi-structured activation sparsity +exploitable through minor runtime modifications. To attain high speedup levels +at inference time, we design a sparse training procedure with awareness of the +final position of the activations while computing the General Matrix +Multiplication (GEMM). We extensively evaluate the proposed solution across +various models for image classification and object detection tasks. Remarkably, +our approach yields a speed improvement of $1.25 \times$ with a minimal +accuracy drop of $1.1\%$ for the ResNet18 model on the ImageNet dataset. +Furthermore, when combined with a state-of-the-art structured pruning method, +the resulting models provide a good latency-accuracy trade-off, outperforming +models that solely employ structured pruning techniques. + +
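+ As a toy illustration of the semi-structured pattern discussed above (the 2-out-of-4 group size and keep ratio are arbitrary choices, not the configuration used in the paper), activations can be pruned group-wise before the GEMM:
+import numpy as np
+
+def prune_activations(a, group=4, keep=2):
+    a = a.reshape(-1, group).copy()
+    cutoff = np.sort(np.abs(a), axis=1)[:, -keep][:, None]   # per-group magnitude threshold
+    a[np.abs(a) < cutoff] = 0.0                               # keep `keep` entries per group
+    return a.reshape(-1)
+
+rng = np.random.default_rng(8)
+activations = rng.normal(size=64)
+weights = rng.normal(size=(64, 16))
+sparse_a = prune_activations(activations)
+print("kept fraction:", np.mean(sparse_a != 0.0))             # ~0.5 with 2-out-of-4
+print("output perturbation:", np.linalg.norm(sparse_a @ weights - activations @ weights))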
+
+ comment: Code is available at http://github.com/Deeplite/activ-sparse +
+
+
+
+
+ + ☆ On the Contraction Coefficient of the Schrödinger Bridge for + Stochastic Linear Systems + + +
+ Schr\"{o}dinger bridge is a stochastic optimal control problem to steer a +given initial state density to another, subject to controlled diffusion and +deadline constraints. A popular method to numerically solve the Schr\"{o}dinger +bridge problems, in both classical and in the linear system settings, is via +contractive fixed point recursions. These recursions can be seen as dynamic +versions of the well-known Sinkhorn iterations, and under mild assumptions, +they solve the so-called Schr\"{o}dinger systems with guaranteed linear +convergence. In this work, we study a priori estimates for the contraction +coefficients associated with the convergence of respective Schr\"{o}dinger +systems. We provide new geometric and control-theoretic interpretations for the +same. Building on these newfound interpretations, we point out the possibility +of improved computation for the worst-case contraction coefficients of linear +SBPs by preconditioning the endpoint support sets. + +
+
+
+
+
+ + ☆ RT-LM: Uncertainty-Aware Resource Management for Real-Time Inference of + Language Models + + +
+ Recent advancements in language models (LMs) have attracted substantial +attention for their capability to generate human-like responses. Though +exhibiting a promising future for various applications such as conversational AI, +these LMs face deployment challenges on various devices due to their extreme +computational cost and unpredictable inference latency. Such varied inference +latency, identified as a consequence of uncertainty intrinsic to the nature of +language, can lead to computational inefficiency and degrade the overall +performance of LMs, especially under high-traffic workloads. Unfortunately, the +range of these uncertainty sources is extensive, complicating the +prediction of latency and the effects emanating from such uncertainties. To +understand and mitigate the impact of uncertainty on real-time +response-demanding systems, we take the first step to comprehend, quantify and +optimize these uncertainty-induced latency performance variations in LMs. +Specifically, we present RT-LM, an uncertainty-aware resource management +ecosystem for real-time inference of LMs. RT-LM innovatively quantifies how +specific input uncertainties adversely affect latency, often leading to an +increased output length. Exploiting these insights, we devise a lightweight yet +effective method to dynamically correlate input text uncertainties with output +length at runtime. Utilizing this quantification as a latency heuristic, we +integrate the uncertainty information into a system-level scheduler which +explores several uncertainty-induced optimization opportunities, including +uncertainty-aware prioritization, dynamic consolidation, and strategic CPU +offloading. Quantitative experiments across five state-of-the-art LMs on two +hardware platforms demonstrate that RT-LM can significantly reduce the average +response time and improve throughput while incurring a rather small runtime +overhead. +
+
+ comment: Accepted by RTSS 2023 +
+
+
+
+
+ + ☆ Unsupervised Learning of Nanoindentation Data to Infer Microstructural + Details of Complex Materials + + +
+ In this study, Cu-Cr composites were studied by nanoindentation. Arrays of +indents were placed over large areas of the samples resulting in datasets +consisting of several hundred measurements of Young's modulus and hardness at +varying indentation depths. The unsupervised learning technique, Gaussian +mixture model, was employed to analyze the data, which helped to determine the +number of "mechanical phases" and the respective mechanical properties. +Additionally, a cross-validation approach was introduced to infer whether the +data quantity was adequate and to suggest the amount of data required for +reliable predictions -- one of the often encountered but difficult to resolve +issues in machine learning of materials science problems. + +
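+ The core of the analysis described above can be sketched with scikit-learn: fit Gaussian mixture models with different numbers of components to (modulus, hardness) pairs and pick the number of "mechanical phases" by an information criterion; the synthetic two-phase data below merely stand in for real indentation maps.
+import numpy as np
+from sklearn.mixture import GaussianMixture
+
+rng = np.random.default_rng(9)
+phase_a = rng.normal(loc=[130.0, 1.5], scale=[8.0, 0.15], size=(300, 2))   # softer phase
+phase_b = rng.normal(loc=[280.0, 3.0], scale=[15.0, 0.30], size=(200, 2))  # harder phase
+data = np.vstack([phase_a, phase_b])        # columns: Young's modulus (GPa), hardness (GPa)
+
+bic = {k: GaussianMixture(n_components=k, random_state=0).fit(data).bic(data)
+       for k in range(1, 6)}
+best_k = min(bic, key=bic.get)
+gmm = GaussianMixture(n_components=best_k, random_state=0).fit(data)
+print("mechanical phases found:", best_k)
+print("phase means (E, H):", np.round(gmm.means_, 1))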
+
+
+
+
+ + ☆ Harmonic-NAS: Hardware-Aware Multimodal Neural Architecture Search on + Resource-constrained Devices ACML 2023 + + +
+ The recent surge of interest surrounding Multimodal Neural Networks (MM-NN) +is attributed to their ability to effectively process and integrate information +from diverse data sources. In MM-NN, features are extracted and fused from +multiple modalities using adequate unimodal backbones and specific fusion +networks. Although this helps strengthen the multimodal information +representation, designing such networks is labor-intensive. It requires tuning +the architectural parameters of the unimodal backbones, choosing the fusing +point, and selecting the operations for fusion. Furthermore, multimodality AI +is emerging as a cutting-edge option in Internet of Things (IoT) systems where +inference latency and energy consumption are critical metrics in addition to +accuracy. In this paper, we propose Harmonic-NAS, a framework for the joint +optimization of unimodal backbones and multimodal fusion networks with hardware +awareness on resource-constrained devices. Harmonic-NAS involves a two-tier +optimization approach for the unimodal backbone architectures and fusion +strategy and operators. By incorporating the hardware dimension into the +optimization, evaluation results on various devices and multimodal datasets +have demonstrated the superiority of Harmonic-NAS over state-of-the-art +approaches achieving up to 10.9% accuracy improvement, 1.91x latency reduction, +and 2.14x energy efficiency gain. + +
+
+ comment: Accepted to the 15th Asian Conference on Machine Learning (ACML 2023) +
+
+
+
+
+ + ☆ Hybrid Algorithm Selection and Hyperparameter Tuning on Distributed + Machine Learning Resources: A Hierarchical Agent-based Approach + + +
+ Algorithm selection and hyperparameter tuning are critical steps in both +academic and applied machine learning. On the other hand, these steps are +becoming ever increasingly delicate due to the extensive rise in the number, +diversity, and distributedness of machine learning resources. Multi-agent +systems, when applied to the design of machine learning platforms, bring about +several distinctive characteristics such as scalability, flexibility, and +robustness, just to name a few. This paper proposes a fully automatic and +collaborative agent-based mechanism for selecting distributedly organized +machine learning algorithms and simultaneously tuning their hyperparameters. +Our method builds upon an existing agent-based hierarchical machine-learning +platform and augments its query structure to support the aforementioned +functionalities without being limited to specific learning, selection, and +tuning mechanisms. We have conducted theoretical assessments, formal +verification, and analytical study to demonstrate the correctness, resource +utilization, and computational efficiency of our technique. According to the +results, our solution is totally correct and exhibits linear time and space +complexity in relation to the size of available resources. To provide concrete +examples of how the proposed methodologies can effectively adapt and perform +across a range of algorithmic options and datasets, we have also conducted a +series of experiments using a system comprised of 24 algorithms and 9 datasets. + +
+
+
+
+
+ + ☆ Reasoning with Latent Diffusion in Offline Reinforcement Learning + + +
+ Offline reinforcement learning (RL) holds promise as a means to learn +high-reward policies from a static dataset, without the need for further +environment interactions. However, a key challenge in offline RL lies in +effectively stitching portions of suboptimal trajectories from the static +dataset while avoiding extrapolation errors arising due to a lack of support in +the dataset. Existing approaches use conservative methods that are tricky to +tune and struggle with multi-modal data (as we show) or rely on noisy Monte +Carlo return-to-go samples for reward conditioning. In this work, we propose a +novel approach that leverages the expressiveness of latent diffusion to model +in-support trajectory sequences as compressed latent skills. This facilitates +learning a Q-function while avoiding extrapolation error via +batch-constraining. The latent space is also expressive and gracefully copes +with multi-modal data. We show that the learned temporally-abstract latent +space encodes richer task-specific information for offline RL tasks as compared +to raw state-actions. This improves credit assignment and facilitates faster +reward propagation during Q-learning. Our method demonstrates state-of-the-art +performance on the D4RL benchmarks, particularly excelling in long-horizon, +sparse-reward tasks. + +
+
+
+
+
+ + ☆ Rank2Tell: A Multimodal Driving Dataset for Joint Importance Ranking and + Reasoning + + +
+ The widespread adoption of commercial autonomous vehicles (AVs) and advanced +driver assistance systems (ADAS) may largely depend on their acceptance by +society, for which their perceived trustworthiness and interpretability to +riders are crucial. In general, this task is challenging because modern +autonomous systems software relies heavily on black-box artificial intelligence +models. Towards this goal, this paper introduces a novel dataset, Rank2Tell, a +multi-modal ego-centric dataset for Ranking the importance level and Telling +the reason for the importance. Using various closed and open-ended visual +question answering formats, the dataset provides dense annotations of semantic, +spatial, temporal, and relational attributes of various important objects in +complex traffic scenarios. The dense annotations and unique attributes of the +dataset make it a valuable resource for researchers working on visual scene +understanding and related fields. Further, we introduce a joint model for +importance level ranking and natural language caption generation to benchmark +our dataset and demonstrate performance with quantitative evaluations. +
+
+
+
+
+ + ☆ Convergence of Gradient-based MAML in LQR + + +
+ The main objective of this research paper is to investigate the local +convergence characteristics of Model-agnostic Meta-learning (MAML) when applied +to linear system quadratic optimal control (LQR). MAML and its variations have +become popular techniques for quickly adapting to new tasks by leveraging +previous learning knowledge in areas like regression, classification, and +reinforcement learning. However, its theoretical guarantees remain unknown due +to non-convexity and its structure, making it even more challenging to ensure +stability in the dynamic system setting. This study focuses on exploring MAML +in the LQR setting, providing its local convergence guarantees while +maintaining the stability of the dynamical system. The paper also presents +simple numerical results to demonstrate the convergence properties of MAML in +LQR tasks. + +
+
+
+
+
+ + ☆ Explainable Graph Neural Network for Alzheimer's Disease And Related + Dementias Risk Prediction + + +
+ Alzheimer's disease and related dementias (ADRD) ranks as the sixth leading +cause of death in the US, underlining the importance of accurate ADRD risk +prediction. Recent advancements in ADRD risk prediction have primarily +relied on imaging analysis, yet not all patients undergo medical imaging before +an ADRD diagnosis. Merging machine learning with claims data can reveal +additional risk factors and uncover interconnections among diverse medical +codes. Our goal is to utilize Graph Neural Networks (GNNs) with claims data for +ADRD risk prediction. Addressing the lack of human-interpretable reasons behind +these predictions, we introduce an innovative method to evaluate relationship +importance and its influence on ADRD risk prediction, ensuring comprehensive +interpretation. + We employed a Variationally Regularized Encoder-decoder Graph Neural Network +(VGNN) for estimating ADRD likelihood. We created three scenarios to assess the +model's efficiency, using Random Forest and Light Gradient Boosting Machine as +baselines. We further used our relation importance method to clarify the key +relationships for ADRD risk prediction. VGNN surpassed the baseline models by +10% in the area under the receiver operating characteristic curve. The integration of +the GNN model and relation importance interpretation could potentially play an +essential role in providing valuable insight into factors that may contribute +to or delay ADRD progression. + Employing a GNN approach with claims data enhances ADRD risk prediction and +provides insights into the impact of interconnected medical code relationships. +This methodology not only enables ADRD risk modeling but also shows potential +for other image analysis predictions using claims data. +
+
+
+
+
+ + ☆ Electron Energy Regression in the CMS High-Granularity Calorimeter + Prototype + + +
+ We present a new publicly available dataset that contains simulated data of a +novel calorimeter to be installed at the CERN Large Hadron Collider. This +detector will have more than six million channels, with each channel capable of +position, ionisation and precision time measurement. Reconstructing these +events in an efficient way poses an immense challenge which is being addressed +with the latest machine learning techniques. As part of this development, a +large prototype with 12,000 channels was built and exposed to a beam of high-energy +electrons. Using machine learning methods, we have reconstructed +the energy of the incident electrons, which is known to some precision, from the +energies of the three-dimensional hits. By releasing this data publicly we hope to +encourage experts in the application of machine learning to develop efficient +and accurate image reconstruction of these electrons. +
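+ As a toy illustration of the regression task only (the released dataset's actual format, detector response and ML baselines are not reproduced here), one could calibrate a summed-hit-energy estimate against the known incident energy:
+import numpy as np
+from sklearn.linear_model import LinearRegression
+
+rng = np.random.default_rng(10)
+true_energy = rng.uniform(20.0, 300.0, size=1000)                  # GeV, hypothetical range
+hit_sum = 0.82 * true_energy + rng.normal(scale=2.0, size=1000)    # toy detector response
+
+calib = LinearRegression().fit(hit_sum.reshape(-1, 1), true_energy)
+pred = calib.predict(hit_sum.reshape(-1, 1))
+print("relative energy resolution:", np.std((pred - true_energy) / true_energy))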
+
+ comment: 7 pages, 6 figures +
+
+
+
+
+ + ☆ Promises of Deep Kernel Learning for Control Synthesis + + +
+ Deep Kernel Learning (DKL) combines the representational power of neural +networks with the uncertainty quantification of Gaussian Processes. Hence, it +is potentially a promising tool to learn and control complex dynamical systems. +In this work, we develop a scalable abstraction-based framework that enables +the use of DKL for control synthesis of stochastic dynamical systems against +complex specifications. Specifically, we consider temporal logic specifications +and create an end-to-end framework that uses DKL to learn an unknown system +from data and formally abstracts the DKL model into an Interval Markov Decision +Process (IMDP) to perform control synthesis with correctness guarantees. +Furthermore, we identify a deep architecture that enables accurate learning and +efficient abstraction computation. The effectiveness of our approach is +illustrated on various benchmarks, including a 5-D nonlinear stochastic system, +showing how control synthesis with DKL can substantially outperform +state-of-the-art competitive methods. + +
+
+ comment: 9 pages, 4 figures, 3 tables +
+
+
+
+
+ + ☆ Commands as AI Conversations + + +
+ Developers and data scientists often struggle to write command-line inputs, +even though graphical interfaces or tools like ChatGPT can assist. The +solution? "ai-cli," an open-source system inspired by GitHub Copilot that +converts natural language prompts into executable commands for various Linux +command-line tools. By tapping into OpenAI's API, which allows interaction +through JSON HTTP requests, "ai-cli" transforms user queries into actionable +command-line instructions. However, integrating AI assistance across multiple +command-line tools, especially in open source settings, can be complex. +Historically, operating systems could mediate, but individual tool +functionality and the lack of a unified approach have made centralized +integration challenging. The "ai-cli" tool, by bridging this gap through +dynamic loading and linking with each program's Readline library API, makes +command-line interfaces smarter and more user-friendly, opening avenues for +further enhancement and cross-platform applicability. + +
+
+ comment: 5 pages +
+
+
+
+
+ + ☆ Distributionally Robust Transfer Learning + + +
+ Many existing transfer learning methods rely on leveraging information from +source data that closely resembles the target data. However, this approach +often overlooks valuable knowledge that may be present in different yet +potentially related auxiliary samples. When dealing with a limited amount of +target data and a diverse range of source models, our paper introduces a novel +approach, Distributionally Robust Optimization for Transfer Learning +(TransDRO), that breaks free from strict similarity constraints. TransDRO is +designed to optimize the most adversarial loss within an uncertainty set, +defined as a collection of target populations generated as a convex combination +of source distributions that guarantee excellent prediction performances for +the target data. TransDRO effectively bridges the realms of transfer learning +and distributional robustness prediction models. We establish the +identifiability of TransDRO and its interpretation as a weighted average of +source models closest to the baseline model. We also show that TransDRO +achieves a faster convergence rate than the model fitted with the target data. +Our comprehensive numerical studies and analysis of multi-institutional +electronic health records data using TransDRO further substantiate the +robustness and accuracy of TransDRO, highlighting its potential as a powerful +tool in transfer learning applications. + +
+
+
+
+
+ + ☆ Hierarchical Multi-Task Learning Framework for Session-based + Recommendations RecSys 2023 + + +
+ While session-based recommender systems (SBRSs) have shown superior +recommendation performance, multi-task learning (MTL) has been adopted by SBRSs +to enhance their prediction accuracy and generalizability further. Hierarchical +MTL (H-MTL) sets a hierarchical structure between prediction tasks and feeds +outputs from auxiliary tasks to main tasks. This hierarchy leads to richer +input features for main tasks and higher interpretability of predictions, +compared to existing MTL frameworks. However, the H-MTL framework has not been +investigated in SBRSs yet. In this paper, we propose HierSRec which +incorporates the H-MTL architecture into SBRSs. HierSRec encodes a given +session with a metadata-aware Transformer and performs next-category prediction +(i.e., auxiliary task) with the session encoding. Next, HierSRec conducts +next-item prediction (i.e., main task) with the category prediction result and +session encoding. For scalable inference, HierSRec creates a compact set of +candidate items (e.g., 4% of total items) per test example using the category +prediction. Experiments show that HierSRec outperforms existing SBRSs as per +next-item prediction accuracy on two session-based recommendation datasets. The +accuracy of HierSRec measured with the carefully-curated candidate items aligns +with the accuracy of HierSRec calculated with all items, which validates the +usefulness of our candidate generation scheme via H-MTL. + +
+
+ comment: Accepted at the 6th Workshop on Online Recommender Systems and User + Modeling @ ACM RecSys 2023 +
+
+
+
+
+ + ☆ Exploring the Benefits of Differentially Private Pre-training and + Parameter-Efficient Fine-tuning for Table Transformers ICASSP 2024 + + +
+ For machine learning with tabular data, Table Transformer (TabTransformer) is +a state-of-the-art neural network model, while Differential Privacy (DP) is an +essential component to ensure data privacy. In this paper, we explore the +benefits of combining these two aspects together in the scenario of transfer +learning -- differentially private pre-training and fine-tuning of +TabTransformers with a variety of parameter-efficient fine-tuning (PEFT) +methods, including Adapter, LoRA, and Prompt Tuning. Our extensive experiments +on the ACSIncome dataset show that these PEFT methods outperform traditional +approaches in terms of the accuracy of the downstream task and the number of +trainable parameters, thus achieving an improved trade-off among parameter +efficiency, privacy, and accuracy. Our code is available at +github.com/IBM/DP-TabTransformer. + +
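For intuition, here is a hedged sketch of one of the PEFT methods mentioned above, a LoRA-style low-rank adapter wrapped around a frozen linear layer. The rank, scaling factor, and layer sizes are placeholders, and the DP-SGD machinery (per-sample gradient clipping and noise addition) that provides the privacy guarantees is omitted.

```python
import torch

class LoRALinear(torch.nn.Module):
    """A frozen pre-trained linear layer plus a trainable low-rank update:
    y = W x + (alpha / r) * B A x, with only A and B receiving gradients."""
    def __init__(self, base: torch.nn.Linear, r: int = 8, alpha: float = 16.0):
        super().__init__()
        self.base = base
        for p in self.base.parameters():
            p.requires_grad = False                       # freeze pre-trained weights
        self.A = torch.nn.Parameter(0.01 * torch.randn(r, base.in_features))
        self.B = torch.nn.Parameter(torch.zeros(base.out_features, r))
        self.scale = alpha / r

    def forward(self, x):
        return self.base(x) + self.scale * (x @ self.A.T @ self.B.T)

layer = LoRALinear(torch.nn.Linear(32, 64))
trainable = sum(p.numel() for p in layer.parameters() if p.requires_grad)
print(f"trainable parameters: {trainable}")               # only A and B are trainable
```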
+
+ comment: submitted to ICASSP 2024 +
+
+
+
+
+ + ☆ A Q-learning Approach for Adherence-Aware Recommendations + + +
+ In many real-world scenarios involving high-stakes and safety implications, a +human decision-maker (HDM) may receive recommendations from an artificial +intelligence while holding the ultimate responsibility of making decisions. In +this letter, we develop an "adherence-aware Q-learning" algorithm to address +this problem. The algorithm learns the "adherence level" that captures the +frequency with which an HDM follows the recommended actions and derives the +best recommendation policy in real time. We prove the convergence of the +proposed Q-learning algorithm to the optimal value and evaluate its performance +across various scenarios. + +
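The abstract does not spell out the update rule, so the following tabular sketch is only one plausible reading of an "adherence-aware" target: the bootstrapped next-state value mixes the recommended greedy action with the human decision-maker's own choice, weighted by the current adherence estimate. All symbols and constants below are illustrative assumptions, not the paper's algorithm.

```python
import numpy as np

def adherence_aware_q_update(Q, s, a_rec, r, s_next, hdm_action, adherence,
                             alpha=0.1, gamma=0.95):
    """One illustrative tabular update: the next-state value mixes the value of
    following the recommendation with the value of the human decision-maker's
    own action, weighted by the estimated adherence level."""
    v_follow = np.max(Q[s_next])          # HDM follows the recommended (greedy) action
    v_ignore = Q[s_next, hdm_action]      # HDM acts on their own
    target = r + gamma * (adherence * v_follow + (1.0 - adherence) * v_ignore)
    Q[s, a_rec] += alpha * (target - Q[s, a_rec])
    return Q

Q = np.zeros((5, 2))                      # toy problem: 5 states, 2 actions
Q = adherence_aware_q_update(Q, s=0, a_rec=1, r=1.0, s_next=2,
                             hdm_action=0, adherence=0.7)
```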
+
+
+
+
+ + ☆ Leveraging Large Language Models and Weak Supervision for Social Media + data annotation: an evaluation using COVID-19 self-reported vaccination + tweets + + +
+ The COVID-19 pandemic has presented significant challenges to the healthcare +industry and society as a whole. With the rapid development of COVID-19 +vaccines, social media platforms have become a popular medium for discussions +on vaccine-related topics. Identifying vaccine-related tweets and analyzing +them can provide valuable insights for public health researchers and +policymakers. However, manual annotation of a large number of tweets is +time-consuming and expensive. In this study, we evaluate the usage of Large +Language Models, in this case GPT-4 (March 23 version), and weak supervision, +to identify COVID-19 vaccine-related tweets, with the purpose of comparing +performance against human annotators. We leveraged a manually curated +gold-standard dataset and used GPT-4 to provide labels without any additional +fine-tuning or instructing, in a single-shot mode (no additional prompting). + 
+
+
+
+
+ + ☆ A Distributed Data-Parallel PyTorch Implementation of the Distributed + Shampoo Optimizer for Training Neural Networks At-Scale + + +
+ Shampoo is an online and stochastic optimization algorithm belonging to the +AdaGrad family of methods for training neural networks. It constructs a +block-diagonal preconditioner where each block consists of a coarse Kronecker +product approximation to full-matrix AdaGrad for each parameter of the neural +network. In this work, we provide a complete description of the algorithm as +well as the performance optimizations that our implementation leverages to +train deep networks at-scale in PyTorch. Our implementation enables fast +multi-GPU distributed data-parallel training by distributing the memory and +computation associated with blocks of each parameter via PyTorch's DTensor data +structure and performing an AllGather primitive on the computed search +directions at each iteration. This major performance enhancement enables us to +achieve at most a 10% performance reduction in per-step wall-clock time +compared against standard diagonal-scaling-based adaptive gradient methods. We +validate our implementation by performing an ablation study on training +ImageNet ResNet50, demonstrating Shampoo's superiority over standard training +recipes with minimal hyperparameter tuning. + +
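A single-matrix, single-process sketch of the Kronecker-factored preconditioning at the core of Shampoo is given below; the distributed DTensor sharding and AllGather steps described above, as well as grafting, update intervals, and the exact epsilon handling of the real implementation, are omitted, and the constants are placeholders.

```python
import torch

def shampoo_step(param, grad, L, R, lr=1e-2, eps=1e-6):
    """One Shampoo-style update for a 2-D parameter: accumulate the two Kronecker
    factors and precondition the gradient with their inverse fourth roots."""
    L = L + grad @ grad.T                 # left factor, shape (m, m)
    R = R + grad.T @ grad                 # right factor, shape (n, n)

    def inv_fourth_root(M):
        vals, vecs = torch.linalg.eigh(M)
        return vecs @ torch.diag(vals.clamp_min(eps) ** -0.25) @ vecs.T

    search_dir = inv_fourth_root(L) @ grad @ inv_fourth_root(R)
    return param - lr * search_dir, L, R

m, n = 4, 3
W, G = torch.randn(m, n), torch.randn(m, n)
L, R = 1e-6 * torch.eye(m), 1e-6 * torch.eye(n)
W, L, R = shampoo_step(W, G, L, R)
```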
+
+ comment: 38 pages, 8 figures, 5 tables +
+
+
+
+
+ + ☆ Learning topological operations on meshes with application to block + decomposition of polygons + + +
+ We present a learning based framework for mesh quality improvement on +unstructured triangular and quadrilateral meshes. Our model learns to improve +mesh quality according to a prescribed objective function purely via self-play +reinforcement learning with no prior heuristics. The actions performed on the +mesh are standard local and global element operations. The goal is to minimize +the deviation of the node degrees from their ideal values, which in the case of +interior vertices leads to a minimization of irregular nodes. + +
+
+ comment: Submitted to Computer-Aided Design Journal. Presented at 17th US + National Conference on Computational Mechanics, Albuquerque, NM +
+
+
+
+
+ + ☆ Flows for Flows: Morphing one Dataset into another with Maximum + Likelihood Estimation + + +
+ Many components of data analysis in high energy physics and beyond require +morphing one dataset into another. This is commonly solved via reweighting, but +there are many advantages of preserving weights and shifting the data points +instead. Normalizing flows are machine learning models with impressive +precision on a variety of particle physics tasks. Naively, normalizing flows +cannot be used for morphing because they require knowledge of the probability +density of the starting dataset. In most cases in particle physics, we can +generate more examples, but we do not know densities explicitly. We propose a +protocol called flows for flows for training normalizing flows to morph one +dataset into another even if the underlying probability density of neither +dataset is known explicitly. This enables a morphing strategy trained with +maximum likelihood estimation, a setup that has been shown to be highly +effective in related tasks. We study variations on this protocol to explore how +far the data points are moved to statistically match the two datasets. +Furthermore, we show how to condition the learned flows on particular features +in order to create a morphing function for every value of the conditioning +feature. For illustration, we demonstrate flows for flows for toy examples as +well as a collider physics example involving dijet events + +
+
+ comment: 15 pages, 17 figures. This work is a merger of arXiv:2211.02487 and + arXiv:2212.06155 +
+
+
+
+
+ + ☆ Unveiling the potential of large language models in generating semantic + and cross-language clones SC + + +
+ Semantic and Cross-language code clone generation may be useful for code +reuse, code comprehension, refactoring and benchmarking. OpenAI's GPT model has +potential in such clone generation as GPT is used for text generation. When +developers copy/paste code from Stack Overflow (SO) or within a system, there +might be inconsistent changes leading to unexpected behaviours. Similarly, if +someone possesses a code snippet in a particular programming language but seeks +equivalent functionality in a different language, a semantic cross-language +code clone generation approach could provide valuable assistance. In this +study, using SemanticCloneBench as a vehicle, we evaluated how well the GPT-3 +model could help generate semantic and cross-language clone variants for a +given fragment. We have compiled a diverse set of code fragments and assessed +GPT-3's performance in generating code variants. Through extensive +experimentation and analysis, where 9 judges spent 158 hours to validate, we +investigate the model's ability to produce accurate and semantically correct +variants. Our findings shed light on GPT-3's strengths in code generation, +offering insights into the potential applications and challenges of using +advanced language models in software development. Our quantitative analysis +yields compelling results. In the realm of semantic clones, GPT-3 attains an +impressive accuracy of 62.14% and 0.55 BLEU score, achieved through few-shot +prompt engineering. Furthermore, the model shines in transcending linguistic +confines, boasting an exceptional 91.25% accuracy in generating cross-language +clones. + 
+
+ comment: Accepted in IWSC +
+
+
+
+
+ + ♻ ☆ Measuring Self-Supervised Representation Quality for Downstream + Classification using Discriminative Features + + +
+ Self-supervised learning (SSL) has shown impressive results in downstream +classification tasks. However, there is limited work in understanding their +failure modes and interpreting their learned representations. In this paper, we +study the representation space of state-of-the-art self-supervised models +including SimCLR, SwaV, MoCo, BYOL, DINO, SimSiam, VICReg and Barlow Twins. +Without the use of class label information, we discover discriminative features +that correspond to unique physical attributes in images, present mostly in +correctly-classified representations. Using these features, we can compress the +representation space by up to 40% without significantly affecting linear +classification performance. We then propose Self-Supervised Representation +Quality Score (or Q-Score), an unsupervised score that can reliably predict if +a given sample is likely to be mis-classified during linear evaluation, +achieving AUPRC of 91.45 on ImageNet-100 and 78.78 on ImageNet-1K. Q-Score can +also be used as a regularization term on pre-trained encoders to remedy +low-quality representations. Fine-tuning with Q-Score regularization can boost +the linear probing accuracy of SSL models by up to 5.8% on ImageNet-100 and +3.7% on ImageNet-1K compared to their baselines. Finally, using gradient +heatmaps and Salient ImageNet masks, we define a metric to quantify the +interpretability of each representation. We show that discriminative features +are strongly correlated to core attributes and that enhancing these features +through Q-Score regularization makes SSL representations more interpretable. + 
+
+
+
+
+ + ♻ ☆ Brand New K-FACs: Speeding up K-FAC with Online Decomposition Updates + + +
+ K-FAC (arXiv:1503.05671, arXiv:1602.01407) is a tractable implementation of +Natural Gradient (NG) for Deep Learning (DL), whose bottleneck is computing the +inverses of the so-called "Kronecker-Factors" (K-factors). RS-KFAC +(arXiv:2206.15397) is a K-FAC improvement which provides a cheap way of +estimating the K-factors' inverses. + In this paper, we exploit the exponential-average construction paradigm of +the K-factors, and use online numerical linear algebra techniques to propose an +even cheaper (but less accurate) way of estimating the K-factors' inverses. In +particular, we propose a K-factor inverse update which scales linearly in layer +size. We also propose an inverse application procedure which scales linearly as +well (the one of K-FAC scales cubically and the one of RS-KFAC scales +quadratically). Overall, our proposed algorithm gives an approximate K-FAC +implementation whose preconditioning part scales linearly in layer size +(compared to cubic for K-FAC and quadratic for RS-KFAC). Importantly however, +this update is only applicable in some circumstances (typically for all FC +layers), unlike the RS-KFAC approach (arXiv:2206.15397). + Numerical results show RS-KFAC's inversion error can be reduced with minimal +CPU overhead by adding our proposed update to it. Based on the proposed +procedure, a correction to it, and RS-KFAC, we propose three practical +algorithms for optimizing generic Deep Neural Nets. Numerical results show that +two of these outperform RS-KFAC for any target test accuracy on CIFAR10 +classification with a slightly modified version of VGG16_bn. Our proposed +algorithms achieve 91% test accuracy faster than SENG (the state-of-the-art +implementation of empirical NG for DL; arXiv:2006.05924) but underperform it +for higher test accuracies. + 
+
+ comment: Version 2 (new numerical experiments coming soon, in V3) +
+
+
+
+
+ + ♻ ☆ PyTorch FSDP: Experiences on Scaling Fully Sharded Data Parallel + + +
+ It is widely acknowledged that large models have the potential to deliver +superior performance across a broad range of domains. Despite the remarkable +progress made in the field of machine learning systems research, which has +enabled the development and exploration of large models, such abilities remain +confined to a small group of advanced users and industry leaders, resulting in +an implicit technical barrier for the wider community to access and leverage +these technologies. In this paper, we introduce PyTorch Fully Sharded Data +Parallel (FSDP) as an industry-grade solution for large model training. FSDP +has been closely co-designed with several key PyTorch core components including +Tensor implementation, dispatcher system, and CUDA memory caching allocator, to +provide non-intrusive user experiences and high training efficiency. +Additionally, FSDP natively incorporates a range of techniques and settings to +optimize resource utilization across a variety of hardware configurations. The +experimental results demonstrate that FSDP is capable of achieving comparable +performance to Distributed Data Parallel while providing support for +significantly larger models with near-linear scalability in terms of TFLOPS. + +
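For readers unfamiliar with the API, a minimal single-node usage sketch of PyTorch FSDP follows; the toy model, optimizer, and launch assumptions (e.g. being started via torchrun so that the process-group environment variables are set) are illustrative, and options such as wrapping policies and mixed precision are left out.

```python
import torch
import torch.distributed as dist
from torch.distributed.fsdp import FullyShardedDataParallel as FSDP

def main():
    # Assumes a launch via torchrun, which sets RANK / WORLD_SIZE / MASTER_ADDR etc.
    dist.init_process_group("nccl")
    torch.cuda.set_device(dist.get_rank() % torch.cuda.device_count())

    model = torch.nn.Sequential(
        torch.nn.Linear(1024, 4096), torch.nn.ReLU(), torch.nn.Linear(4096, 10)
    ).cuda()
    model = FSDP(model)     # parameters, gradients and optimizer state become sharded
    optim = torch.optim.AdamW(model.parameters(), lr=1e-3)  # create after wrapping

    x = torch.randn(8, 1024, device="cuda")
    y = torch.randint(0, 10, (8,), device="cuda")
    loss = torch.nn.functional.cross_entropy(model(x), y)
    loss.backward()
    optim.step()

if __name__ == "__main__":
    main()
```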
+
+
+
+
+ + ♻ ☆ GTAdam: Gradient Tracking with Adaptive Momentum for Distributed Online + Optimization + + +
+ This paper deals with a network of computing agents aiming to solve an online +optimization problem in a distributed fashion, i.e., by means of local +computation and communication, without any central coordinator. We propose the +gradient tracking with adaptive momentum estimation (GTAdam) distributed +algorithm, which combines a gradient tracking mechanism with first and second +order momentum estimates of the gradient. The algorithm is analyzed in the +online setting for strongly convex cost functions with Lipschitz continuous +gradients. We provide an upper bound for the dynamic regret given by a term +related to the initial conditions and another term related to the temporal +variations of the objective functions. Moreover, a linear convergence rate is +guaranteed in the static setup. The algorithm is tested on a time-varying +classification problem, on a (moving) target localization problem, and in a +stochastic optimization setup from image classification. In these numerical +experiments from multi-agent learning, GTAdam outperforms state-of-the-art +distributed optimization methods. + +
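Based only on the description above (gradient tracking plus Adam-style first- and second-order moment estimates), a hedged per-agent sketch might look as follows; the exact ordering of the consensus, tracking, and adaptive steps, and all constants, are assumptions rather than the paper's pseudocode.

```python
import numpy as np

def gtadam_agent_step(x, y, m, v, grad_new, grad_old, neighbor_x, neighbor_y,
                      weights, t, lr=0.05, b1=0.9, b2=0.999, eps=1e-8):
    """One illustrative per-agent step: consensus averaging, a gradient-tracking
    update of the average-gradient estimate y, then an Adam-style step using y.
    neighbor_x / neighbor_y stack the agent's own and its neighbours' variables,
    and weights is the corresponding row of the (doubly stochastic) mixing matrix."""
    y = weights @ neighbor_y + grad_new - grad_old        # track the network-wide gradient
    m = b1 * m + (1 - b1) * y                             # first-moment estimate
    v = b2 * v + (1 - b2) * y ** 2                        # second-moment estimate
    m_hat, v_hat = m / (1 - b1 ** t), v / (1 - b2 ** t)
    x = weights @ neighbor_x - lr * m_hat / (np.sqrt(v_hat) + eps)
    return x, y, m, v

# Toy usage for one agent with two neighbours on a 3-dimensional problem.
d = 3
x, y, m, v = (np.zeros(d) for _ in range(4))
weights = np.array([0.4, 0.3, 0.3])
neighbor_x = np.stack([x, np.ones(d), -np.ones(d)])
neighbor_y = np.stack([y, np.zeros(d), np.zeros(d)])
x, y, m, v = gtadam_agent_step(x, y, m, v, np.ones(d), np.zeros(d),
                               neighbor_x, neighbor_y, weights, t=1)
```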
+
+
+
+
+ + ♻ ☆ Robust Markov Decision Processes without Model Estimation + + +
+ Robust Markov Decision Processes (MDPs) are receiving much attention in +learning a robust policy which is less sensitive to environment changes. There +are an increasing number of works analyzing sample-efficiency of robust MDPs. +However, there are two major barriers to applying robust MDPs in practice. +First, most works study robust MDPs in a model-based regime, where the +transition probability needs to be estimated and requires a large amount of +memories $\mathcal{O}(|\mathcal{S}|^2|\mathcal{A}|)$. Second, prior work +typically assumes a strong oracle to obtain the optimal solution as an +intermediate step to solve robust MDPs. However, in practice, such an oracle +does not exist usually. To remove the oracle, we transform the original robust +MDPs into an alternative form, which allows us to use stochastic gradient +methods to solve the robust MDPs. Moreover, we prove the alternative form still +plays a similar role as the original form. With this new formulation, we devise +a sample-efficient algorithm to solve the robust MDPs in a model-free regime, +which does not require an oracle and trades off a lower storage requirement +$\mathcal{O}(|\mathcal{S}||\mathcal{A}|)$ with being able to generate samples +from a generative model or Markovian chain. Finally, we validate our +theoretical findings via numerical experiments, showing the efficiency with the +alternative form of robust MDPs. + +
+
+
+
+
+ + ♻ ☆ Plant Disease Detection using Region-Based Convolutional Neural Network + + +
+ Agriculture plays an important role in the food and economy of Bangladesh. +The rapid growth of population over the years also has increased the demand for +food production. One of the major reasons behind low crop production is the +prevalence of numerous bacterial, viral and fungal plant diseases. Early detection of plant +diseases and proper usage of pesticides and fertilizers are vital for +preventing the diseases and boosting the yield. Most farmers use +generalized pesticides and fertilizers across entire fields without +specifically knowing the condition of the plants. Thus the production cost +often increases, and sometimes this even becomes detrimental to +the yield. Deep Learning models have been found to be very effective at automatically +detecting plant diseases from images of plants, thereby reducing the need for +human specialists. This paper aims at building a lightweight deep learning +model for predicting leaf disease in tomato plants. By modifying the +region-based convolutional neural network, we design an efficient and effective +model that demonstrates satisfactory empirical performance on a benchmark +dataset. Our proposed model can easily be deployed in a larger system where +drones take images of leaves and these images are fed into our model to +assess the plants' health condition. + 
+
+ comment: 23 pages +
+
+
+
+
+ + ♻ ☆ Tradeoff of generalization error in unsupervised learning + + +
+ Finding the optimal model complexity that minimizes the generalization error +(GE) is a key issue of machine learning. For the conventional supervised +learning, this task typically involves the bias-variance tradeoff: lowering the +bias by making the model more complex entails an increase in the variance. +Meanwhile, little has been studied about whether the same tradeoff exists for +unsupervised learning. In this study, we propose that unsupervised learning +generally exhibits a two-component tradeoff of the GE, namely the model error +and the data error -- using a more complex model reduces the model error at the +cost of the data error, with the data error playing a more significant role for +a smaller training dataset. This is corroborated by training the restricted +Boltzmann machine to generate the configurations of the two-dimensional Ising +model at a given temperature and the totally asymmetric simple exclusion +process with given entry and exit rates. Our results also indicate that the +optimal model tends to be more complex when the data to be learned are more +complex. + +
+
+ comment: 15 pages, 7 figures +
+
+
+
+
+ + ♻ ☆ ProbVLM: Probabilistic Adapter for Frozen Vision-Language Models ICCV 2023 + + +
+ Large-scale vision-language models (VLMs) like CLIP successfully find +correspondences between images and text. Through the standard deterministic +mapping process, an image or a text sample is mapped to a single vector in the +embedding space. This is problematic: as multiple samples (images or text) can +abstract the same concept in the physical world, deterministic embeddings do +not reflect the inherent ambiguity in the embedding space. We propose ProbVLM, +a probabilistic adapter that estimates probability distributions for the +embeddings of pre-trained VLMs via inter/intra-modal alignment in a post-hoc +manner without needing large-scale datasets or computing. On four challenging +datasets, i.e., COCO, Flickr, CUB, and Oxford-flowers, we estimate the +multi-modal embedding uncertainties for two VLMs, i.e., CLIP and BLIP, quantify +the calibration of embedding uncertainties in retrieval tasks and show that +ProbVLM outperforms other methods. Furthermore, we propose active learning and +model selection as two real-world downstream tasks for VLMs and show that the +estimated uncertainty aids both tasks. Lastly, we present a novel technique for +visualizing the embedding distributions using a large-scale pre-trained latent +diffusion model. Code is available at https://github.com/ExplainableML/ProbVLM. + +
+
+ comment: ICCV 2023 +
+
+
+
+
+ + ♻ ☆ Stability to Deformations of Manifold Filters and Manifold Neural + Networks + + +
+ The paper defines and studies manifold (M) convolutional filters and neural +networks (NNs). Manifold filters and MNNs are defined in terms of the +Laplace-Beltrami operator exponential and are such that graph (G) +filters and neural networks (NNs) are recovered as discrete approximations when +the manifold is sampled. These filters admit a spectral representation which is +a generalization of both the spectral representation of graph filters and the +frequency response of standard convolutional filters in continuous time. The +main technical contribution of the paper is to analyze the stability of +manifold filters and MNNs to smooth deformations of the manifold. This analysis +generalizes known stability properties of graph filters and GNNs and it is also +a generalization of known stability properties of standard convolutional +filters and neural networks in continuous time. The most important observation +that follows from this analysis is that manifold filters, same as graph filters +and standard continuous time filters, have difficulty discriminating high +frequency components in the presence of deformations. This is a challenge that +can be ameliorated with the use of manifold, graph, or continuous time neural +networks. The most important practical consequence of this analysis is to shed +light on the behavior of graph filters and GNNs in large scale graphs. + 
+
+ comment: 19 pages; 6 figures +
+
+
+
+
+ + ♻ ☆ Flooding with Absorption: An Efficient Protocol for Heterogeneous + Bandits over Complex Networks + + +
+ Multi-armed bandits are extensively used to model sequential decision-making, +making them ubiquitous in many real-life applications such as online +recommender systems and wireless networking. We consider a multi-agent setting +where each agent solves their own bandit instance endowed with a different set +of arms. Their goal is to minimize their group regret while collaborating via +some communication protocol over a given network. Previous literature on this +problem only considered arm heterogeneity and networked agents separately. In +this work, we introduce a setting that encompasses both features. For this +novel setting, we first provide a rigorous regret analysis for a standard +flooding protocol combined with the classic UCB policy. Then, to mitigate the +issue of high communication costs incurred by flooding in complex networks, we +propose a new protocol called Flooding with Absorption (FwA). We provide a +theoretical analysis of the resulting regret bound and discuss the advantages +of using FwA over flooding. Lastly, we experimentally verify on various +scenarios, including dynamic networks, that FwA leads to significantly lower +communication costs despite minimal regret performance loss compared to other +network protocols. + +
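The regret analysis above builds on each agent running the classic UCB policy; as a reminder of that building block, a plain single-agent UCB1 sketch is given below (the flooding and absorption message passing itself is not shown, and the toy Bernoulli bandit is an illustrative assumption).

```python
import math
import random

def ucb1(pull, n_arms, horizon):
    """Plain UCB1: play every arm once, then pick the arm maximising the
    empirical mean plus the bonus sqrt(2 ln t / n_i)."""
    counts, means = [0] * n_arms, [0.0] * n_arms
    for t in range(1, horizon + 1):
        if t <= n_arms:
            arm = t - 1
        else:
            arm = max(range(n_arms),
                      key=lambda i: means[i] + math.sqrt(2 * math.log(t) / counts[i]))
        reward = pull(arm)
        counts[arm] += 1
        means[arm] += (reward - means[arm]) / counts[arm]
    return means, counts

# Toy Bernoulli bandit with three arms.
probs = [0.2, 0.5, 0.7]
means, counts = ucb1(lambda a: float(random.random() < probs[a]),
                     n_arms=3, horizon=2000)
```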
+
+ comment: 26 pages, 7 figures; second revision +
+
+
+
+
+ + ♻ ☆ Navigating Out-of-Distribution Electricity Load Forecasting during + COVID-19: A Continual Learning Approach Leveraging Human Mobility + + +
+ In traditional deep learning algorithms, one of the key assumptions is that +the data distribution remains constant during both training and deployment. +However, this assumption becomes problematic when faced with +Out-of-Distribution periods, such as the COVID-19 lockdowns, where the data +distribution significantly deviates from what the model has seen during +training. This paper employs a two-fold strategy: utilizing continual learning +techniques to update models with new data and harnessing human mobility data +collected from privacy-preserving pedestrian counters located outside +buildings. In contrast to online learning, which suffers from 'catastrophic +forgetting' as newly acquired knowledge often erases prior information, +continual learning offers a holistic approach by preserving past insights while +integrating new data. This research applies FSNet, a powerful continual +learning algorithm, to real-world data from 13 building complexes in Melbourne, +Australia, a city which had the second longest total lockdown duration globally +during the pandemic. Results underscore the crucial role of continual learning +in accurate energy forecasting, particularly during Out-of-Distribution +periods. Secondary data such as mobility and temperature provided ancillary +support to the primary forecasting model. More importantly, while traditional +methods struggled to adapt during lockdowns, models featuring at least online +learning demonstrated resilience, with lockdown periods posing fewer challenges +once armed with adaptive learning techniques. This study contributes valuable +methodologies and insights to the ongoing effort to improve energy load +forecasting during future Out-of-Distribution periods. + +
+
+ comment: 10 pages, 2 figures, 5 tables, BuildSys '23 +
+
+
+
+
+ + ♻ ☆ RescueSpeech: A German Corpus for Speech Recognition in Search and + Rescue Domain + + +
+ Despite the recent advancements in speech recognition, there are still +difficulties in accurately transcribing conversational and emotional speech in +noisy and reverberant acoustic environments. This poses a particular challenge +in the search and rescue (SAR) domain, where transcribing conversations among +rescue team members is crucial to support real-time decision-making. The +scarcity of speech data and associated background noise in SAR scenarios make +it difficult to deploy robust speech recognition systems. To address this +issue, we have created and made publicly available a German speech dataset +called RescueSpeech. This dataset includes real speech recordings from +simulated rescue exercises. Additionally, we have released competitive training +recipes and pre-trained models. Our study highlights that the performance +attained by state-of-the-art methods in this challenging scenario is still far +from reaching an acceptable level. + +
+
+
+
+
+ + ♻ ☆ ROSCOE: A Suite of Metrics for Scoring Step-by-Step Reasoning + + +
+ Large language models show improved downstream task performance when prompted +to generate step-by-step reasoning to justify their final answers. These +reasoning steps greatly improve model interpretability and verification, but +objectively studying their correctness (independent of the final answer) is +difficult without reliable methods for automatic evaluation. We simply do not +know how often the stated reasoning steps actually support the final end task +predictions. In this work, we present ROSCOE, a suite of interpretable, +unsupervised automatic scores that improve and extend previous text generation +evaluation metrics. To evaluate ROSCOE against baseline metrics, we design a +typology of reasoning errors and collect synthetic and human evaluation scores +on commonly used reasoning datasets. In contrast with existing metrics, ROSCOE +can measure semantic consistency, logicality, informativeness, fluency, and +factuality - among other traits - by leveraging properties of step-by-step +rationales. We empirically verify the strength of our metrics on five human +annotated and six programmatically perturbed diagnostics datasets - covering a +diverse set of tasks that require reasoning skills and show that ROSCOE can +consistently outperform baseline metrics. + +
+
+
+
+
+ + ♻ ☆ Shape-conditioned 3D Molecule Generation via Equivariant Diffusion + Models + + +
+ Ligand-based drug design aims to identify novel drug candidates of similar +shapes with known active molecules. In this paper, we formulated an in silico +shape-conditioned molecule generation problem to generate 3D molecule +structures conditioned on the shape of a given molecule. To address this +problem, we developed a translation- and rotation-equivariant shape-guided +generative model ShapeMol. ShapeMol consists of an equivariant shape encoder +that maps molecular surface shapes into latent embeddings, and an equivariant +diffusion model that generates 3D molecules based on these embeddings. +Experimental results show that ShapeMol can generate novel, diverse, drug-like +molecules that retain 3D molecular shapes similar to the given shape condition. +These results demonstrate the potential of ShapeMol in designing drug +candidates of desired 3D shapes binding to protein target pockets. + +
+
+
+
+
+ + ♻ ☆ Fidelity of Interpretability Methods and Perturbation Artifacts in + Neural Networks + + +
+ Despite the excellent performance of deep neural networks (DNNs) in image +classification, detection, and prediction, characterizing how DNNs make a given +decision remains an open problem, resulting in a number of interpretability +methods. Post-hoc interpretability methods primarily aim to quantify the +importance of input features with respect to the class probabilities. However, +due to the lack of ground truth and the existence of interpretability methods +with diverse operating characteristics, evaluating these methods is a crucial +challenge. A popular approach to evaluate interpretability methods is to +perturb input features deemed important for a given prediction and observe the +decrease in accuracy. However, perturbation itself may introduce artifacts. We +propose a method for estimating the impact of such artifacts on the fidelity +estimation by utilizing model accuracy curves from perturbing input features +according to the Most Important First (MIF) and Least Important First (LIF) orders. +Using a ResNet-50 trained on ImageNet, we demonstrate the proposed +fidelity estimation for four popular post-hoc interpretability methods. + 
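A hedged sketch of the two perturbation orders used to obtain such accuracy curves is shown below; the attribution maps, the choice of zeroing pixels as the perturbation, and the model are placeholders, not the paper's exact protocol.

```python
import torch

@torch.no_grad()
def perturbation_curve(model, images, labels, attributions, fractions,
                       most_important_first=True):
    """Accuracy as a growing fraction of pixels is perturbed, either in
    Most-Important-First or Least-Important-First order of the attributions."""
    n_pixels = images[0].numel()
    order = attributions.view(len(images), -1).argsort(
        dim=1, descending=most_important_first)
    accuracies = []
    for frac in fractions:
        k = int(frac * n_pixels)
        flat = images.clone().view(len(images), -1)
        flat.scatter_(1, order[:, :k], 0.0)   # simple perturbation: zero selected pixels
        preds = model(flat.view_as(images)).argmax(dim=1)
        accuracies.append((preds == labels).float().mean().item())
    return accuracies

# Usage sketch (model and attribution maps come from elsewhere):
# mif = perturbation_curve(model, x, y, attr, [0.1, 0.2, 0.5], most_important_first=True)
# lif = perturbation_curve(model, x, y, attr, [0.1, 0.2, 0.5], most_important_first=False)
```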
+
+ comment: 11 pages, 5 figures +
+
+
+
+
+ + ♻ ☆ Graph Barlow Twins: A self-supervised representation learning framework + for graphs + + +
+ The self-supervised learning (SSL) paradigm is an essential exploration area, +which tries to eliminate the need for expensive data labeling. Despite the +great success of SSL methods in computer vision and natural language +processing, most of them employ contrastive learning objectives that require +negative samples, which are hard to define. This becomes even more challenging +in the case of graphs and is a bottleneck for achieving robust representations. +To overcome such limitations, we propose a framework for self-supervised graph +representation learning - Graph Barlow Twins, which utilizes a +cross-correlation-based loss function instead of negative samples. Moreover, it +does not rely on non-symmetric neural network architectures - in contrast to +the state-of-the-art self-supervised graph representation learning method BGRL. We +show that our method achieves results as competitive as the best +self-supervised and fully supervised methods while requiring fewer +hyperparameters and substantially shorter computation time (ca. 30 times faster +than BGRL). + 
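The cross-correlation-based objective referred to above is the Barlow Twins loss computed on embeddings of two augmented views; a minimal PyTorch sketch follows, with the graph encoder, the augmentations, and the trade-off weight lambda left as assumptions.

```python
import torch

def barlow_twins_loss(z1, z2, lambd=5e-3, eps=1e-9):
    """Cross-correlation loss: push the diagonal of C towards 1 (invariance) and
    the off-diagonal towards 0 (redundancy reduction). z1, z2 are (N, D)
    embeddings of two augmented views of the same nodes."""
    n, _ = z1.shape
    z1 = (z1 - z1.mean(0)) / (z1.std(0) + eps)
    z2 = (z2 - z2.mean(0)) / (z2.std(0) + eps)
    c = (z1.T @ z2) / n                                   # (D, D) cross-correlation matrix
    on_diag = (torch.diagonal(c) - 1).pow(2).sum()
    off_diag = (c - torch.diag(torch.diagonal(c))).pow(2).sum()
    return on_diag + lambd * off_diag

z1, z2 = torch.randn(256, 128), torch.randn(256, 128)    # stand-ins for encoder outputs
loss = barlow_twins_loss(z1, z2)
```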
+
+
+
+
+ + ♻ ☆ Fairness and robustness in anti-causal prediction + + +
+ Robustness to distribution shift and fairness have independently emerged as +two important desiderata required of modern machine learning models. While +these two desiderata seem related, the connection between them is often unclear +in practice. Here, we discuss these connections through a causal lens, focusing +on anti-causal prediction tasks, where the input to a classifier (e.g., an +image) is assumed to be generated as a function of the target label and the +protected attribute. By taking this perspective, we draw explicit connections +between a common fairness criterion - separation - and a common notion of +robustness - risk invariance. These connections provide new motivation for +applying the separation criterion in anticausal settings, and inform old +discussions regarding fairness-performance tradeoffs. In addition, our findings +suggest that robustness-motivated approaches can be used to enforce separation, +and that they often work better in practice than methods designed to directly +enforce separation. Using a medical dataset, we empirically validate our +findings on the task of detecting pneumonia from X-rays, in a setting where +differences in prevalence across sex groups motivates a fairness mitigation. +Our findings highlight the importance of considering causal structure when +choosing and enforcing fairness criteria. + +
+
+
+
+
+ + ♻ ☆ A prediction and behavioural analysis of machine learning methods for + modelling travel mode choice + + +
+ The emergence of a variety of Machine Learning (ML) approaches for travel +mode choice prediction poses an interesting question to transport modellers: +which models should be used for which applications? The answer to this question +goes beyond simple predictive performance, and is instead a balance of many +factors, including behavioural interpretability and explainability, +computational complexity, and data efficiency. There is a growing body of +research which attempts to compare the predictive performance of different ML +classifiers with classical random utility models. However, existing studies +typically analyse only the disaggregate predictive performance, ignoring other +aspects affecting model choice. Furthermore, many studies are affected by +technical limitations, such as the use of inappropriate validation schemes, +incorrect sampling for hierarchical data, lack of external validation, and the +exclusive use of discrete metrics. We address these limitations by conducting a +systematic comparison of different modelling approaches, across multiple +modelling problems, in terms of the key factors likely to affect model choice +(out-of-sample predictive performance, accuracy of predicted market shares, +extraction of behavioural indicators, and computational efficiency). We combine +several real world datasets with synthetic datasets, where the data generation +function is known. The results indicate that the models with the highest +disaggregate predictive performance (namely extreme gradient boosting and +random forests) provide poorer estimates of behavioural indicators and +aggregate mode shares, and are more expensive to estimate, than other models, +including deep neural networks and Multinomial Logit (MNL). It is further +observed that the MNL model performs robustly in a variety of situations, +though ML techniques can improve the estimates of behavioural indices such as +Willingness to Pay. + +
+
+ comment: 44 pages and 13 figures +
+
+
+
+
+ + ♻ ☆ PSO-Convolutional Neural Networks with Heterogeneous Learning Rate + + +
+ Convolutional Neural Networks (ConvNets or CNNs) have been candidly deployed +in the scope of computer vision and related fields. Nevertheless, the dynamics +of training of these neural networks lie still elusive: it is hard and +computationally expensive to train them. A myriad of architectures and training +strategies have been proposed to overcome this challenge and address several +problems in image processing such as speech, image and action recognition as +well as object detection. In this article, we propose a novel Particle Swarm +Optimization (PSO) based training for ConvNets. In such framework, the vector +of weights of each ConvNet is typically cast as the position of a particle in +phase space whereby PSO collaborative dynamics intertwines with Stochastic +Gradient Descent (SGD) in order to boost training performance and +generalization. Our approach goes as follows: i) [regular phase] each ConvNet +is trained independently via SGD; ii) [collaborative phase] ConvNets share +among themselves their current vector of weights (or particle-position) along +with their gradient estimates of the Loss function. Distinct step sizes are +coined by distinct ConvNets. By properly blending ConvNets with large (possibly +random) step-sizes along with more conservative ones, we propose an algorithm +with competitive performance with respect to other PSO-based approaches on +Cifar-10 and Cifar-100 (accuracy of 98.31% and 87.48%). These accuracy levels +are obtained by resorting to only four ConvNets -- such results are expected to +scale with the number of collaborative ConvNets accordingly. We make our source +codes available for download https://github.com/leonlha/PSO-ConvNet-Dynamics. + +
+
+ comment: 20 pages +
+
+
+
+
+ + ♻ ☆ BOLT: An Automated Deep Learning Framework for Training and Deploying + Large-Scale Search and Recommendation Models on Commodity CPU Hardware CIKM 2023 + + +
+ Efficient large-scale neural network training and inference on commodity CPU +hardware is of immense practical significance in democratizing deep learning +(DL) capabilities. Presently, the process of training massive models consisting +of hundreds of millions to billions of parameters requires the extensive use of +specialized hardware accelerators, such as GPUs, which are only accessible to a +limited number of institutions with considerable financial resources. Moreover, +there is often an alarming carbon footprint associated with training and +deploying these models. In this paper, we take a step towards addressing these +challenges by introducing BOLT, a sparse deep learning library for training +large-scale search and recommendation models on standard CPU hardware. BOLT +provides a flexible, high-level API for constructing models that will be +familiar to users of existing popular DL frameworks. By automatically tuning +specialized hyperparameters, BOLT also abstracts away the algorithmic details +of sparse network training. We evaluate BOLT on a number of information +retrieval tasks including product recommendations, text classification, graph +neural networks, and personalization. We find that our proposed system achieves +competitive performance with state-of-the-art techniques at a fraction of the +cost and energy consumption and an order-of-magnitude faster inference time. +BOLT has also been successfully deployed by multiple businesses to address +critical problems, and we highlight one customer case study in the field of +e-commerce. + +
+
+ comment: 6 pages, 5 tables, 3 figures. CIKM 2023 (Applied Research Track) +
+
+
+
+
+ + ♻ ☆ Hierarchical Optimization-Derived Learning + + +
+ In recent years, by utilizing optimization techniques to formulate the +propagation of deep model, a variety of so-called Optimization-Derived Learning +(ODL) approaches have been proposed to address diverse learning and vision +tasks. Although having achieved relatively satisfying practical performance, +there still exist fundamental issues in existing ODL methods. In particular, +current ODL methods tend to consider model construction and learning as two +separate phases, and thus fail to formulate their underlying coupling and +depending relationship. In this work, we first establish a new framework, named +Hierarchical ODL (HODL), to simultaneously investigate the intrinsic behaviors +of optimization-derived model construction and its corresponding learning +process. Then we rigorously prove the joint convergence of these two sub-tasks, +from the perspectives of both approximation quality and stationary analysis. To +our best knowledge, this is the first theoretical guarantee for these two +coupled ODL components: optimization and learning. We further demonstrate the +flexibility of our framework by applying HODL to challenging learning tasks, +which have not been properly addressed by existing ODL methods. Finally, we +conduct extensive experiments on both synthetic data and real applications in +vision and other learning tasks to verify the theoretical properties and +practical performance of HODL in various application scenarios. + +
+
+ comment: Accepted by IEEE TPAMI, 16 pages +
+
+
+
+
+ + ♻ ☆ Semantic-Guided Generative Image Augmentation Method with Diffusion + Models for Image Classification + + +
+ Existing image augmentation methods consist of two categories: +perturbation-based methods and generative methods. Perturbation-based methods +apply pre-defined perturbations to augment an original image, but only locally +vary the image, thus lacking image diversity. In contrast, generative methods +bring more image diversity in the augmented images but may not preserve +semantic consistency, thus incorrectly changing the essential semantics of the +original image. To balance image diversity and semantic consistency in +augmented images, we propose SGID, a Semantic-guided Generative Image +augmentation method with Diffusion models for image classification. +Specifically, SGID employs diffusion models to generate augmented images with +good image diversity. More importantly, SGID takes image labels and captions as +guidance to maintain semantic consistency between the augmented and original +images. Experimental results show that SGID outperforms the best augmentation +baseline by 1.72% on ResNet-50 (from scratch), 0.33% on ViT (ImageNet-21k), and +0.14% on CLIP-ViT (LAION-2B). Moreover, SGID can be combined with other image +augmentation baselines and further improves the overall performance. We +demonstrate the semantic consistency and image diversity of SGID through +quantitative human and automated evaluations, as well as qualitative case +studies. + +
+
+ comment: 17 pages, 13 figures, 8 tables +
+
+
+
+
+ + ♻ ☆ Breaking On-device Training Memory Wall: A Systematic Survey + + +
+ On-device training has become an increasingly popular approach to machine +learning, enabling models to be trained directly on mobile and edge devices. +However, a major challenge in this area is the limited memory available on +these devices, which can severely restrict the size and complexity of the +models that can be trained. In this systematic survey, we aim to explore the +current state-of-the-art techniques for breaking on-device training memory +walls, focusing on methods that can enable larger and more complex models to be +trained on resource-constrained devices. + Specifically, we first analyze the key factors that contribute to the +phenomenon of memory walls encountered during on-device training. Then, we +present a comprehensive literature review of on-device training, which +addresses the issue of memory limitations. Finally, we summarize on-device +training and highlight the open problems for future research. + By providing a comprehensive overview of these techniques and their +effectiveness in breaking memory walls, we hope to help researchers and +practitioners in this field navigate the rapidly evolving landscape of +on-device training. + +
+
+ comment: 8 pages, 3 figures +
+
+
+
+
+ + ♻ ☆ Extracting Diagnosis Pathways from Electronic Health Records Using Deep + Reinforcement Learning + + +
+ Clinical diagnosis guidelines aim at specifying the steps that may lead to a +diagnosis. Inspired by guidelines, we aim to learn the optimal sequence of +actions to perform in order to obtain a correct diagnosis from electronic +health records. We apply various deep reinforcement learning algorithms to this +task and experiment on a synthetic but realistic dataset to differentially +diagnose anemia and its subtypes and particularly evaluate the robustness of +various approaches to noise and missing data. Experimental results show that +the deep reinforcement learning algorithms show competitive performance +compared to the state-of-the-art methods with the added advantage that they +enable the progressive generation of a pathway to the suggested diagnosis, +which can both guide and explain the decision process. + +
+
+
+
+
+ + ♻ ☆ Safe Reinforcement Learning for Strategic Bidding of Virtual Power + Plants in Day-Ahead Markets + + +
+ This paper presents a novel safe reinforcement learning algorithm for +strategic bidding of Virtual Power Plants (VPPs) in day-ahead electricity +markets. The proposed algorithm utilizes the Deep Deterministic Policy Gradient +(DDPG) method to learn competitive bidding policies without requiring an +accurate market model. Furthermore, to account for the complex internal +physical constraints of VPPs we introduce two enhancements to the DDPG method. +Firstly, a projection-based safety shield that restricts the agent's actions to +the feasible space defined by the non-linear power flow equations and operating +constraints of distributed energy resources is derived. Secondly, a penalty for +the shield activation in the reward function that incentivizes the agent to +learn a safer policy is introduced. A case study based on the IEEE 13-bus +network demonstrates the effectiveness of the proposed approach in enabling the +agent to learn a highly competitive, safe strategic policy. + +
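To make the two DDPG modifications concrete, here is a hedged sketch that uses a simple box projection as a stand-in for the paper's power-flow-based safety shield and adds a shield-activation penalty to the reward; a gym-style environment with the classic four-tuple step API and the penalty weight are illustrative assumptions.

```python
import numpy as np

def project_to_feasible(action, lower, upper):
    """Stand-in safety shield: project the raw action onto a feasible box.
    (The paper instead projects onto a set defined by power-flow and DER constraints.)"""
    return np.clip(action, lower, upper)

def shielded_step(env, raw_action, lower, upper, penalty_weight=1.0):
    """Apply the shield, step the environment with the safe action, and shape the
    reward so the agent is discouraged from relying on the shield."""
    safe_action = project_to_feasible(raw_action, lower, upper)
    shield_activation = float(np.linalg.norm(safe_action - raw_action))
    obs, reward, done, info = env.step(safe_action)       # gym-style 4-tuple assumed
    shaped_reward = reward - penalty_weight * shield_activation
    return obs, shaped_reward, done, info
```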
+
+
+
+
+ + ♻ ☆ Large Language Models for Code: Security Hardening and Adversarial + Testing + + +
+ Large language models (large LMs) are increasingly trained on massive +codebases and used to generate code. However, LMs lack awareness of security +and are found to frequently produce unsafe code. This work studies the security +of LMs along two important axes: (i) security hardening, which aims to enhance +LMs' reliability in generating secure code, and (ii) adversarial testing, which +seeks to evaluate LMs' security at an adversarial standpoint. We address both +of these by formulating a new security task called controlled code generation. +The task is parametric and takes as input a binary property to guide the LM to +generate secure or unsafe code, while preserving the LM's capability of +generating functionally correct code. We propose a novel learning-based +approach called SVEN to solve this task. SVEN leverages property-specific +continuous vectors to guide program generation towards the given property, +without modifying the LM's weights. Our training procedure optimizes these +continuous vectors by enforcing specialized loss terms on different regions of +code, using a high-quality dataset carefully curated by us. Our extensive +evaluation shows that SVEN is highly effective in achieving strong security +control. For instance, a state-of-the-art CodeGen LM with 2.7B parameters +generates secure code for 59.1% of the time. When we employ SVEN to perform +security hardening (or adversarial testing) on this LM, the ratio is +significantly boosted to 92.3% (or degraded to 36.8%). Importantly, SVEN +closely matches the original LMs in functional correctness. + +
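The general mechanism of steering a frozen language model with trainable continuous vectors can be sketched as below; this is a generic prefix-style illustration, not SVEN's actual architecture, training objective, or region-specific loss terms, and the dimensions are placeholders.

```python
import torch

class PropertyPrefix(torch.nn.Module):
    """Trainable continuous vectors, one set per property (e.g. secure / unsafe),
    prepended to the token embeddings of a frozen language model."""
    def __init__(self, n_properties=2, prefix_len=8, hidden=768):
        super().__init__()
        self.prefix = torch.nn.Parameter(
            0.02 * torch.randn(n_properties, prefix_len, hidden))

    def forward(self, token_embeddings, prop_id):
        # token_embeddings: (batch, seq, hidden); the LM weights stay untouched.
        batch = token_embeddings.size(0)
        p = self.prefix[prop_id].unsqueeze(0).expand(batch, -1, -1)
        return torch.cat([p, token_embeddings], dim=1)

prefix = PropertyPrefix()
emb = torch.randn(4, 32, 768)            # embeddings produced by a frozen LM
steered = prefix(emb, prop_id=0)         # (4, 40, 768), fed into the transformer blocks
```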
+
+
+
+
+ + ♻ ☆ Task-Oriented Communication for Multi-Device Cooperative Edge Inference + + +
+ This paper investigates task-oriented communication for multi-device +cooperative edge inference, where a group of distributed low-end edge devices +transmit the extracted features of local samples to a powerful edge server for +inference. While cooperative edge inference can overcome the limited sensing +capability of a single device, it substantially increases the communication +overhead and may incur excessive latency. To enable low-latency cooperative +inference, we propose a learning-based communication scheme that optimizes +local feature extraction and distributed feature encoding in a task-oriented +manner, i.e., to remove data redundancy and transmit information that is +essential for the downstream inference task rather than reconstructing the data +samples at the edge server. Specifically, we leverage an information bottleneck +(IB) principle to extract the task-relevant feature at each edge device and +adopt a distributed information bottleneck (DIB) framework to formalize a +single-letter characterization of the optimal rate-relevance tradeoff for +distributed feature encoding. To admit flexible control of the communication +overhead, we extend the DIB framework to a distributed deterministic +information bottleneck (DDIB) objective that explicitly incorporates the +representational costs of the encoded features. As the IB-based objectives are +computationally prohibitive for high-dimensional data, we adopt variational +approximations to make the optimization problems tractable. To compensate the +potential performance loss due to the variational approximations, we also +develop a selective retransmission (SR) mechanism to identify the redundancy in +the encoded features of multiple edge devices to attain additional +communication overhead reduction. Extensive experiments evidence that the +proposed task-oriented communication scheme achieves a better rate-relevance +tradeoff than baseline methods. + +
+
+ comment: This paper was accepted to IEEE Transactions on Wireless + Communication +
+
+
+
+
+ + ♻ ☆ GPT Can Solve Mathematical Problems Without a Calculator + + +
+ Previous studies have typically assumed that large language models are unable +to accurately perform arithmetic operations, particularly multiplication of >8 +digits, and operations involving decimals and fractions, without the use of +calculator tools. This paper aims to challenge this misconception. With +sufficient training data, a 2 billion-parameter language model can accurately +perform multi-digit arithmetic operations with almost 100% accuracy without +data leakage, significantly surpassing GPT-4 (whose multi-digit multiplication +accuracy is only 4.3%). We also demonstrate that our MathGLM, fine-tuned from +GLM-10B on a dataset with additional multi-step arithmetic operations and math +problems described in text, achieves similar performance to GPT-4 on a +5,000-samples Chinese math problem test set. Our code and data are public at +https://github.com/THUDM/MathGLM. + +
+
+ comment: 26 pages, 14 figures +
+
+
+
+
+ + ♻ ☆ Knowledge Distillation-Empowered Digital Twin for Anomaly Detection + + +
+ Cyber-physical systems (CPSs), like train control and management systems +(TCMS), are becoming ubiquitous in critical infrastructures. As safety-critical +systems, ensuring their dependability during operation is crucial. Digital +twins (DTs) have been increasingly studied for this purpose owing to their +capability of runtime monitoring and warning, prediction and detection of +anomalies, etc. However, constructing a DT for anomaly detection in TCMS +necessitates sufficient training data and extracting both chronological and +context features with high quality. Hence, in this paper, we propose a novel +method named KDDT for TCMS anomaly detection. KDDT harnesses a language model +(LM) and a long short-term memory (LSTM) network to extract contexts and +chronological features, respectively. To enrich data volume, KDDT benefits from +out-of-domain data with knowledge distillation (KD). We evaluated KDDT with two +datasets from our industry partner Alstom and obtained the F1 scores of 0.931 +and 0.915, respectively, demonstrating the effectiveness of KDDT. We also +explored individual contributions of the DT model, LM, and KD to the overall +performance of KDDT, via a comprehensive empirical study, and observed average +F1 score improvements of 12.4%, 3%, and 6.05%, respectively. + +
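For reference, the standard knowledge-distillation loss that such a setup typically builds on is sketched below; the temperature, weighting, and two-class toy logits are assumptions, and KDDT's actual language-model and LSTM components are not shown.

```python
import torch
import torch.nn.functional as F

def distillation_loss(student_logits, teacher_logits, labels, T=2.0, alpha=0.5):
    """Blend the usual supervised loss with a KL term matching the student's
    softened predictions to the teacher's softened predictions."""
    hard = F.cross_entropy(student_logits, labels)
    soft = F.kl_div(
        F.log_softmax(student_logits / T, dim=-1),
        F.softmax(teacher_logits / T, dim=-1),
        reduction="batchmean",
    ) * (T * T)
    return alpha * hard + (1 - alpha) * soft

student_logits = torch.randn(16, 2, requires_grad=True)   # e.g. anomaly vs. normal
teacher_logits = torch.randn(16, 2)
labels = torch.randint(0, 2, (16,))
loss = distillation_loss(student_logits, teacher_logits, labels)
loss.backward()
```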
+
+
+
+
+ + ♻ ☆ Spatio-Temporal Contrastive Self-Supervised Learning for POI-level Crowd + Flow Inference KDD + + +
+ Accurate acquisition of crowd flow at Points of Interest (POIs) is pivotal +for effective traffic management, public service, and urban planning. Despite +this importance, due to the limitations of urban sensing techniques, the data +quality from most sources is inadequate for monitoring crowd flow at each POI. +This renders the inference of accurate crowd flow from low-quality data a +critical and challenging task. The complexity is heightened by three key +factors: 1) The scarcity and rarity of labeled data, 2) The intricate +spatio-temporal dependencies among POIs, and 3) The myriad correlations between +precise crowd flow and GPS reports. + To address these challenges, we recast the crowd flow inference problem as a +self-supervised attributed graph representation learning task and introduce a +novel Contrastive Self-learning framework for Spatio-Temporal data (CSST). Our +approach initiates with the construction of a spatial adjacency graph founded +on the POIs and their respective distances. We then employ a contrastive +learning technique to exploit large volumes of unlabeled spatio-temporal data. +We adopt a swapped prediction approach to anticipate the representation of the +target subgraph from similar instances. Following the pre-training phase, the +model is fine-tuned with accurate crowd flow data. Our experiments, conducted +on two real-world datasets, demonstrate that the CSST pre-trained on extensive +noisy data consistently outperforms models trained from scratch. + +
+
+ comment: 18 pages; submitted to TKDD; +
+
+
+
+
+ + ♻ ☆ Deep-OSG: Deep Learning of Operators in Semigroup + + +
+ This paper proposes a novel deep learning approach for learning operators in +semigroup, with applications to modeling unknown autonomous dynamical systems +using time series data collected at varied time lags. It is a sequel to the +previous flow map learning (FML) works [T. Qin, K. Wu, and D. Xiu, J. Comput. +Phys., 395:620--635, 2019], [K. Wu and D. Xiu, J. Comput. Phys., 408:109307, +2020], and [Z. Chen, V. Churchill, K. Wu, and D. Xiu, J. Comput. Phys., +449:110782, 2022], which focused on learning single evolution operator with a +fixed time step. This paper aims to learn a family of evolution operators with +variable time steps, which constitute a semigroup for an autonomous system. The +semigroup property is very crucial and links the system's evolutionary +behaviors across varying time scales, but it was not considered in the previous +works. We propose for the first time a framework of embedding the semigroup +property into the data-driven learning process, through a novel neural network +architecture and new loss functions. The framework is very feasible, can be +combined with any suitable neural networks, and is applicable to learning +general autonomous ODEs and PDEs. We present the rigorous error estimates and +variance analysis to understand the prediction accuracy and robustness of our +approach, showing the remarkable advantages of semigroup awareness in our +model. Moreover, our approach allows one to arbitrarily choose the time steps +for prediction and ensures that the predicted results are well self-matched and +consistent. Extensive numerical experiments demonstrate that embedding the +semigroup property notably reduces the data dependency of deep learning models +and greatly improves the accuracy, robustness, and stability for long-time +prediction. + +
+
+
+
+
+ + ♻ ☆ Computationally Efficient Reinforcement Learning: Targeted Exploration + leveraging Simple Rules + + +
+ Model-free Reinforcement Learning (RL) generally suffers from poor sample +complexity, mostly due to the need to exhaustively explore the state-action +space to find well-performing policies. On the other hand, we postulate that +expert knowledge of the system often allows us to design simple rules we expect +good policies to follow at all times. In this work, we hence propose a simple +yet effective modification of continuous actor-critic frameworks to incorporate +such rules and avoid regions of the state-action space that are known to be +suboptimal, thereby significantly accelerating the convergence of RL agents. +Concretely, we saturate the actions chosen by the agent if they do not comply +with our intuition and, critically, modify the gradient update step of the +policy to ensure the learning process is not affected by the saturation step. +On a room temperature control case study, it allows agents to converge to +well-performing policies up to 6-7x faster than classical agents without +computational overhead and while retaining good final performance. + +
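No code accompanies the abstract; below is a minimal sketch of one plausible way to implement "saturate the action but keep the gradient", using a straight-through clamp. The rule here is simply a box constraint on the action and the actor loss is a placeholder; the paper's exact rule set and gradient correction may differ.

import torch

def rule_saturate(action, low, high):
    # Clamp actions into the rule-compliant box [low, high] in the forward pass,
    # while letting gradients flow through the unsaturated action
    # (straight-through estimator). One plausible realization, not the paper's exact update.
    saturated = torch.clamp(action, low, high)
    return action + (saturated - action).detach()

# toy usage with a tiny deterministic policy
policy = torch.nn.Linear(4, 1)
state = torch.randn(8, 4)
raw_action = policy(state)
action = rule_saturate(raw_action, low=-1.0, high=1.0)  # what the environment would see
loss = (action ** 2).mean()          # placeholder actor loss
loss.backward()                      # gradients still reach `policy` despite the clamping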
+
+ comment: Accepted to CDC 2023 +
+
+
+
+
+ + ♻ ☆ MedShapeNet -- A Large-Scale Dataset of 3D Medical Shapes for Computer + Vision + + +
+ We present MedShapeNet, a large collection of anatomical shapes (e.g., bones,
+organs, vessels) and 3D surgical instrument models. Prior to the deep learning
+era, the broad application of statistical shape models (SSMs) in medical image
+analysis was evidence that shapes have been commonly used to describe medical
+data. Nowadays, however, state-of-the-art (SOTA) deep learning algorithms in
+medical imaging are predominantly voxel-based. In computer vision, on the
+contrary, shapes (including voxel occupancy grids, meshes, point clouds and
+implicit surface models) are preferred data representations in 3D, as seen from
+the numerous shape-related publications in premier vision conferences, such as
+the IEEE/CVF Conference on Computer Vision and Pattern Recognition (CVPR), as
+well as the increasing popularity of ShapeNet (about 51,300 models) and
+Princeton ModelNet (127,915 models) in computer vision research. MedShapeNet is
+created as an alternative to these commonly used shape benchmarks to facilitate
+the translation of data-driven vision algorithms to medical applications, and
+it extends the opportunities to adapt SOTA vision algorithms to solve critical
+medical problems. Besides, the majority of the medical shapes in MedShapeNet
+are modeled directly on the imaging data of real patients, and therefore it
+complements well existing shape benchmarks comprising computer-aided design
+(CAD) models. MedShapeNet currently includes more than 100,000 medical shapes,
+and provides annotations in the form of paired data. It is therefore also a
+freely available repository of 3D models for extended reality (virtual reality
+- VR, augmented reality - AR, mixed reality - MR) and medical 3D printing. This
+white paper describes in detail the motivations behind MedShapeNet, the shape
+acquisition procedures, the use cases, as well as the usage of the online shape
+search portal: https://medshapenet.ikim.nrw/
+
+
+ comment: 21 pages +
+
+
+
+
+ + ♻ ☆ StyleDomain: Efficient and Lightweight Parameterizations of StyleGAN for + One-shot and Few-shot Domain Adaptation ICCV 2023 + + +
+ Domain adaptation of GANs is a problem of fine-tuning GAN models pretrained
+on a large dataset (e.g. StyleGAN) to a specific domain with few samples (e.g.
+painting faces, sketches, etc.). While there are many methods that tackle this
+problem in different ways, there are still many important questions that remain
+unanswered. In this paper, we provide a systematic and in-depth analysis of the
+domain adaptation problem of GANs, focusing on the StyleGAN model. We perform a
+detailed exploration of the most important parts of StyleGAN that are
+responsible for adapting the generator to a new domain depending on the
+similarity between the source and target domains. As a result of this study, we
+propose new efficient and lightweight parameterizations of StyleGAN for domain
+adaptation. Particularly, we show that there exist directions in StyleSpace
+(StyleDomain directions) that are sufficient for adapting to similar domains.
+For dissimilar domains, we propose Affine+ and AffineLight+ parameterizations
+that allow us to outperform existing baselines in few-shot adaptation while
+having significantly fewer training parameters. Finally, we examine StyleDomain
+directions and discover their many surprising properties that we apply for
+domain mixing and cross-domain image morphing. Source code can be found at
+https://github.com/AIRI-Institute/StyleDomain.
+
+
+ comment: Accepted to ICCV 2023 +
+
+
+
+
+ + ♻ ☆ JL-lemma derived Optimal Projections for Discriminative Dictionary + Learning + + +
+ To overcome difficulties in classifying high-dimensional data with a large
+number of classes, we propose a novel approach called JLSPCADL. This paper uses
+the Johnson-Lindenstrauss (JL) Lemma to select the dimensionality of a
+transformed space in which a discriminative dictionary can be learned for
+signal classification. Rather than reducing dimensionality via random
+projections, as is often done with JL, we use a projection transformation
+matrix derived from Modified Supervised PC Analysis (M-SPCA) with the
+JL-prescribed dimension.
+ JLSPCADL provides a heuristic to deduce a suitable distortion level and the
+corresponding Suitable Description Length (SDL) of dictionary atoms, yielding
+an optimal feature space for better classification. Unlike state-of-the-art
+dimensionality reduction-based dictionary learning methods, a projection
+transformation matrix derived in a single step from M-SPCA provides maximum
+feature-label consistency of the transformed space while preserving the cluster
+structure of the original data. Even in the presence of confusing class pairs,
+the dictionary for the transformed space generates discriminative sparse
+coefficients with fewer training samples. Experimentation demonstrates that
+JLSPCADL scales well with an increasing number of classes and dimensionality.
+The improved label consistency of features due to M-SPCA helps classification.
+Further, the complexity of training a discriminative dictionary is
+significantly reduced by using SDL. Experimentation on OCR and face recognition
+datasets shows relatively better classification performance than other
+supervised dictionary learning algorithms.
+
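For concreteness, the JL-prescribed dimension mentioned above can be computed with the standard Johnson-Lindenstrauss bound. The small sketch below only illustrates that step; JLSPCADL then learns the projection with M-SPCA at this dimension rather than projecting at random, and the sample count and distortion level here are arbitrary.

import numpy as np

def jl_min_dim(n_samples, eps):
    # Smallest target dimension k that the JL lemma guarantees preserves all
    # pairwise distances among n_samples points within a (1 +/- eps) factor,
    # using the standard bound k >= 4 ln(n) / (eps^2/2 - eps^3/3).
    denom = (eps ** 2) / 2.0 - (eps ** 3) / 3.0
    return int(np.ceil(4.0 * np.log(n_samples) / denom))

print(jl_min_dim(n_samples=10000, eps=0.2))   # 2126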
+
+
+
+
+ + ♻ ☆ Biomedical image analysis competitions: The state of current + participation practice + + +
+ The number of international benchmarking competitions is steadily increasing
+in various fields of machine learning (ML) research and practice. So far,
+however, little is known about the common practice as well as bottlenecks faced
+by the community in tackling the research questions posed. To shed light on the
+status quo of algorithm development in the specific field of biomedical imaging
+analysis, we designed an international survey that was issued to all
+participants of challenges conducted in conjunction with the IEEE ISBI 2021 and
+MICCAI 2021 conferences (80 competitions in total). The survey covered
+participants' expertise and working environments, their chosen strategies, as
+well as algorithm characteristics. A median of 72% of challenge participants
+took part in the survey. According to our results, knowledge exchange was the
+primary incentive (70%) for participation, while the reception of prize money
+played only a minor role (16%). While a median of 80 working hours was spent on
+method development, a large portion of participants stated that they did not
+have enough time for method development (32%). 25% perceived the infrastructure
+to be a bottleneck. Overall, 94% of all solutions were deep learning-based. Of
+these, 84% were based on standard architectures. 43% of the respondents
+reported that the data samples (e.g., images) were too large to be processed at
+once. This was most commonly addressed by patch-based training (69%),
+downsampling (37%), and solving 3D analysis tasks as a series of 2D tasks.
+K-fold cross-validation on the training set was performed by only 37% of the
+participants and only 50% of the participants performed ensembling based on
+multiple identical models (61%) or heterogeneous models (39%). 48% of the
+respondents applied postprocessing steps.
+
+
+
+
+
+ + ♻ ☆ A Survey of Deep Graph Clustering: Taxonomy, Challenge, Application, and + Open Resource + + +
+ Graph clustering, which aims to divide the nodes of a graph into several
+distinct clusters, is a fundamental yet challenging task. Benefiting from the
+powerful representation capability of deep learning, deep graph clustering
+methods have achieved great success in recent years. However, corresponding
+survey papers are relatively scarce, and a summary of this field is urgently
+needed. Motivated by this, we conduct a comprehensive survey of deep graph
+clustering. Firstly, we introduce the formal definitions, evaluation protocols,
+and development of this field. Secondly, the taxonomy of deep graph clustering
+methods is presented based on four different criteria, including graph type,
+network architecture, learning paradigm, and clustering method. Thirdly, we
+carefully analyze the existing methods via extensive experiments and summarize
+the challenges and opportunities from five perspectives, including graph data
+quality, stability, scalability, discriminative capability, and unknown cluster
+number. Besides, the applications of deep graph clustering methods in six
+domains, including computer vision, natural language processing, recommendation
+systems, social network analysis, bioinformatics, and medical science, are
+presented. Last but not least, this paper provides open resource support,
+including 1) a collection
+(\url{https://github.com/yueliu1999/Awesome-Deep-Graph-Clustering}) of
+state-of-the-art deep graph clustering methods (papers, codes, and datasets)
+and 2) a unified framework
+(\url{https://github.com/Marigoldwu/A-Unified-Framework-for-Deep-Attribute-Graph-Clustering})
+of deep graph clustering. We hope this work can serve as a quick guide and help
+researchers overcome challenges in this vibrant field.
+
+
+ comment: 20 pages, 14 figures +
+
+
+
+
+ + ♻ ☆ Privacy-Preserving Constrained Domain Generalization for Medical Image + Classification + + +
+ Deep neural networks (DNN) have demonstrated unprecedented success for
+medical imaging applications. However, due to the issue of limited dataset
+availability and the strict legal and ethical requirements for patient privacy
+protection, the broad applications of medical imaging classification driven by
+DNN with large-scale training data have been largely hindered. For example,
+when training the DNN from one domain (e.g., with data only from one hospital),
+the generalization capability to another domain (e.g., data from another
+hospital) could be largely lacking. In this paper, we tackle this problem by
+developing a privacy-preserving constrained domain generalization method,
+aiming to improve the generalization capability under the privacy-preserving
+condition. In particular, we propose to improve the information aggregation
+process on the centralized server-side with a novel gradient alignment loss,
+expecting that the trained model can be better generalized to the "unseen" but
+related medical images. The rationale and effectiveness of our proposed method
+can be explained by connecting our proposed method with the Maximum Mean
+Discrepancy (MMD), which has been widely adopted as the distribution distance
+measurement. Experimental results on two challenging medical imaging
+classification tasks indicate that our method can achieve better cross-domain
+generalization capability compared to the state-of-the-art federated learning
+methods.
+
+
+
+
+
+ + ♻ ☆ Adapt Your Teacher: Improving Knowledge Distillation for Exemplar-free + Continual Learning + + +
+ In this work, we investigate exemplar-free class incremental learning (CIL) +with knowledge distillation (KD) as a regularization strategy, aiming to +prevent forgetting. KD-based methods are successfully used in CIL, but they +often struggle to regularize the model without access to exemplars of the +training data from previous tasks. Our analysis reveals that this issue +originates from substantial representation shifts in the teacher network when +dealing with out-of-distribution data. This causes large errors in the KD loss +component, leading to performance degradation in CIL models. Inspired by recent +test-time adaptation methods, we introduce Teacher Adaptation (TA), a method +that concurrently updates the teacher and the main models during incremental +training. Our method seamlessly integrates with KD-based CIL approaches and +allows for consistent enhancement of their performance across multiple +exemplar-free CIL benchmarks. + +
+
+
+
+
+ + ♻ ☆ TwinLiteNet: An Efficient and Lightweight Model for Driveable Area and + Lane Segmentation in Self-Driving Cars + + +
+ Semantic segmentation is a common task in autonomous driving to understand
+the surrounding environment. Driveable Area Segmentation and Lane Detection are
+particularly important for safe and efficient navigation on the road. However,
+conventional semantic segmentation models are computationally expensive and
+require high-end hardware, which is not feasible for embedded systems in
+autonomous vehicles. This paper proposes a lightweight model for driveable area
+and lane line segmentation. TwinLiteNet is designed to be computationally cheap
+yet achieves accurate and efficient segmentation results. We evaluate
+TwinLiteNet on the BDD100K dataset and compare it with modern models.
+Experimental results show that our TwinLiteNet performs similarly to existing
+approaches while requiring significantly fewer computational resources.
+Specifically, TwinLiteNet achieves a mIoU score of 91.3% for the Drivable Area
+task and 31.08% IoU for the Lane Detection task with only 0.4 million
+parameters, and achieves 415 FPS on an RTX A5000 GPU. Furthermore, TwinLiteNet
+can run in real time on embedded devices with limited computing power,
+achieving 60 FPS on a Jetson Xavier NX, which makes it an ideal solution for
+self-driving vehicles. Code is available at
+https://github.com/chequanghuy/TwinLiteNet.
+
+
+ comment: Accepted by MAPR 2023 +
+
+
+
+
+ + ♻ ☆ Pure Exploration in Bandits with Linear Constraints + + +
+ We address the problem of identifying the optimal policy with a fixed +confidence level in a multi-armed bandit setup, when \emph{the arms are subject +to linear constraints}. Unlike the standard best-arm identification problem +which is well studied, the optimal policy in this case may not be deterministic +and could mix between several arms. This changes the geometry of the problem +which we characterize via an information-theoretic lower bound. We introduce +two asymptotically optimal algorithms for this setting, one based on the +Track-and-Stop method and the other based on a game-theoretic approach. Both +these algorithms try to track an optimal allocation based on the lower bound +and computed by a weighted projection onto the boundary of a normal cone. +Finally, we provide empirical results that validate our bounds and visualize +how constraints change the hardness of the problem. + +
+
+ comment: EWRL16 +
+
+
+
+
+ + ♻ ☆ GEDI: A Graph-based End-to-end Data Imputation Framework + + +
+ Data imputation is an effective way to handle missing data, which is common
+in practical applications. In this study, we propose and test a novel data
+imputation process that achieves two important goals: (1) preserve the row-wise
+similarities among observations and column-wise contextual relationships among
+features in the feature matrix, and (2) tailor the imputation process to the
+specific downstream label prediction task. The proposed imputation process uses
+a Transformer network and graph structure learning to iteratively refine the
+contextual relationships among features and similarities among observations.
+Moreover, it uses a meta-learning framework to select features that are
+influential to the downstream prediction task of interest. We conduct
+experiments on large real-world data sets, and show that the proposed
+imputation process consistently improves imputation and label prediction
+performance over a variety of benchmark methods.
+
+
+
+
+
+ + ♻ ☆ A Call to Reflect on Evaluation Practices for Age Estimation: + Comparative Analysis of the State-of-the-Art and a Unified Benchmark + + +
+ Comparing different age estimation methods poses a challenge due to the +unreliability of published results stemming from inconsistencies in the +benchmarking process. Previous studies have reported continuous performance +improvements over the past decade using specialized methods; however, our +findings challenge these claims. This paper identifies two trivial, yet +persistent issues with the currently used evaluation protocol and describes how +to resolve them. We describe our evaluation protocol in detail and provide +specific examples of how the protocol should be used. We utilize the protocol +to offer an extensive comparative analysis for state-of-the-art facial age +estimation methods. Surprisingly, we find that the performance differences +between the methods are negligible compared to the effect of other factors, +such as facial alignment, facial coverage, image resolution, model +architecture, or the amount of data used for pretraining. We use the gained +insights to propose using FaRL as the backbone model and demonstrate its +efficiency. The results emphasize the importance of consistent data +preprocessing practices for reliable and meaningful comparisons. We make our +source code public at +https://github.com/paplhjak/Facial-Age-Estimation-Benchmark. + +
+
+ comment: Revised version +
+
+
+
+
+ + ♻ ☆ Multi-scale Wasserstein Shortest-path Graph Kernels for Graph + Classification + + +
+ Graph kernels are conventional methods for computing graph similarities.
+However, most of the R-convolution graph kernels face two challenges: 1) they
+cannot compare graphs at multiple different scales, and 2) they do not consider
+the distributions of substructures when computing the kernel matrix. These two
+challenges limit their performance. To mitigate the two challenges, we propose
+a novel graph kernel called the Multi-scale Wasserstein Shortest-Path graph
+kernel (MWSP), at the heart of which is the multi-scale shortest-path node
+feature map, each element of which denotes the number of occurrences of a
+shortest path around a node. A shortest path is represented by the
+concatenation of all the labels of the nodes in it. Since the shortest-path
+node feature map can only compare graphs at local scales, we incorporate into
+it the multiple different scales of the graph structure, which are captured by
+the truncated BFS trees of different depths rooted at each node in a graph. We
+use the Wasserstein distance to compute the similarity between the multi-scale
+shortest-path node feature maps of two graphs, considering the distributions of
+shortest paths. We empirically validate MWSP on various benchmark graph
+datasets and demonstrate that it achieves state-of-the-art performance on most
+datasets.
+
+
+ comment: 9 pages +
+
+
+
+
+ + ♻ ☆ Empirical and Instance-Dependent Estimation of Markov Chain and Mixing + Time + + +
+ We address the problem of estimating the mixing time of a Markov chain from a
+single trajectory of observations. Unlike most previous works, which employed
+Hilbert space methods to estimate spectral gaps, we opt for an approach based
+on contraction with respect to total variation. Specifically, we estimate the
+contraction coefficient introduced in Wolfer [2020], inspired by Dobrushin's.
+This quantity, unlike the spectral gap, controls the mixing time up to strong
+universal constants and remains applicable to non-reversible chains. We improve
+existing fully data-dependent confidence intervals around this contraction
+coefficient, which are both easier to compute and tighter than their spectral
+counterparts. Furthermore, we introduce a novel analysis beyond the worst-case
+scenario by leveraging additional information about the transition matrix. This
+allows us to derive instance-dependent rates for estimating the matrix with
+respect to the induced uniform norm, and some of its mixing properties.
+
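As a concrete illustration of the quantity being estimated, the sketch below computes the plug-in Dobrushin-style contraction coefficient (the largest total-variation distance between rows of the transition matrix) from a single observed trajectory. The paper's estimator and its confidence intervals are considerably more refined; the toy trajectory here is only for exercising the code.

import numpy as np

def dobrushin_coefficient(P):
    # Contraction coefficient of a row-stochastic matrix P:
    # the largest total-variation distance between any two rows.
    n = P.shape[0]
    return max(0.5 * np.abs(P[i] - P[j]).sum() for i in range(n) for j in range(n))

def empirical_transition_matrix(trajectory, n_states):
    # Plug-in estimate of P from a single observed state trajectory.
    counts = np.zeros((n_states, n_states))
    for s, t in zip(trajectory[:-1], trajectory[1:]):
        counts[s, t] += 1
    counts += 1e-12                      # guard against empty rows
    return counts / counts.sum(axis=1, keepdims=True)

traj = np.random.default_rng(0).integers(0, 3, size=5000)  # toy data, not a real chain
P_hat = empirical_transition_matrix(traj, n_states=3)
print(dobrushin_coefficient(P_hat))      # close to 0 here, i.e. very fast mixing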
+
+
+
+
+ + ♻ ☆ LLaSM: Large Language and Speech Model + + +
+ Multi-modal large language models have garnered significant interest
+recently. Most existing works, though, focus on vision-language multi-modal
+models that provide strong capabilities in following vision-and-language
+instructions. However, we claim that speech is also an important modality
+through which humans interact with the world. Hence, it is crucial for a
+general-purpose assistant to be able to follow multi-modal speech-and-language
+instructions. In this work, we propose Large Language and Speech Model (LLaSM).
+LLaSM is an end-to-end trained large multi-modal speech-language model with
+cross-modal conversational abilities, capable of following speech-and-language
+instructions. Our early experiments show that LLaSM demonstrates a more
+convenient and natural way for humans to interact with artificial intelligence.
+We also release a large speech-instruction-following dataset,
+LLaSM-Audio-Instructions. Code and demo are available at
+https://github.com/LinkSoul-AI/LLaSM and
+https://huggingface.co/spaces/LinkSoul/LLaSM. The LLaSM-Audio-Instructions
+dataset is available at
+https://huggingface.co/datasets/LinkSoul/LLaSM-Audio-Instructions.
+
+
+
+
+
+ + ♻ ☆ Multiplayer Bandit Learning, from Competition to Cooperation + + +
+ The stochastic multi-armed bandit model captures the tradeoff between +exploration and exploitation. We study the effects of competition and +cooperation on this tradeoff. Suppose there are $k$ arms and two players, Alice +and Bob. In every round, each player pulls an arm, receives the resulting +reward, and observes the choice of the other player but not their reward. +Alice's utility is $\Gamma_A + \lambda \Gamma_B$ (and similarly for Bob), where +$\Gamma_A$ is Alice's total reward and $\lambda \in [-1, 1]$ is a cooperation +parameter. At $\lambda = -1$ the players are competing in a zero-sum game, at +$\lambda = 1$, they are fully cooperating, and at $\lambda = 0$, they are +neutral: each player's utility is their own reward. The model is related to the +economics literature on strategic experimentation, where usually players +observe each other's rewards. + With discount factor $\beta$, the Gittins index reduces the one-player +problem to the comparison between a risky arm, with a prior $\mu$, and a +predictable arm, with success probability $p$. The value of $p$ where the +player is indifferent between the arms is the Gittins index $g = g(\mu,\beta) > +m$, where $m$ is the mean of the risky arm. + We show that competing players explore less than a single player: there is +$p^* \in (m, g)$ so that for all $p > p^*$, the players stay at the predictable +arm. However, the players are not myopic: they still explore for some $p > m$. +On the other hand, cooperating players explore more than a single player. We +also show that neutral players learn from each other, receiving strictly higher +total rewards than they would playing alone, for all $ p\in (p^*, g)$, where +$p^*$ is the threshold from the competing case. + Finally, we show that competing and neutral players eventually settle on the +same arm in every Nash equilibrium, while this can fail for cooperating +players. + +
+
+ comment: Improved version with a few corrections. 57 pages, 5 figures +
+
+
+
+
+ + ♻ ☆ No Train Still Gain. Unleash Mathematical Reasoning of Large Language + Models with Monte Carlo Tree Search Guided by Energy Function + + +
+ Large language models (LLMs) demonstrate impressive language understanding +and contextual learning abilities, making them suitable for natural language +processing (NLP) tasks and complex mathematical reasoning. However, when +applied to mathematical reasoning tasks, LLMs often struggle to generate +correct reasoning steps and answers despite having high probabilities for the +solutions. To overcome this limitation and enhance the mathematical reasoning +capabilities of fine-tuned LLMs without additional fine-tuning steps, we +propose a method that incorporates Monte Carlo Tree Search (MCTS) and a +lightweight energy function to rank decision steps and enable immediate +reaction and precise reasoning. Specifically, we re-formulate the fine-tuned +LLMs into a Residual-based Energy Model (Residual-EBM) and employ noise +contrastive estimation to estimate the energy function's parameters. We then +utilize MCTS with the energy function as a path verifier to search the output +space and evaluate the reasoning path. Through extensive experiments on two +mathematical reasoning benchmarks, GSM8k and AQUA-RAT, we demonstrate the +exceptional capabilities of our method, which significantly improves the pass@1 +metric of the fine-tuned model without requiring additional fine-tuning or +reinforcement learning with human feedback alignment. + +
+
+ comment: still in progress +
+
+
+
+
+ + ♻ ☆ Robot Parkour Learning + + +
+ Parkour is a grand challenge for legged locomotion that requires robots to +overcome various obstacles rapidly in complex environments. Existing methods +can generate either diverse but blind locomotion skills or vision-based but +specialized skills by using reference animal data or complex rewards. However, +autonomous parkour requires robots to learn generalizable skills that are both +vision-based and diverse to perceive and react to various scenarios. In this +work, we propose a system for learning a single end-to-end vision-based parkour +policy of diverse parkour skills using a simple reward without any reference +motion data. We develop a reinforcement learning method inspired by direct +collocation to generate parkour skills, including climbing over high obstacles, +leaping over large gaps, crawling beneath low barriers, squeezing through thin +slits, and running. We distill these skills into a single vision-based parkour +policy and transfer it to a quadrupedal robot using its egocentric depth +camera. We demonstrate that our system can empower two different low-cost +robots to autonomously select and execute appropriate parkour skills to +traverse challenging real-world environments. + +
+
+ comment: CoRL 2023 (Oral). Project website at https://robot-parkour.github.io +
+
+
+
+
+ + ♻ ☆ Multi-Modality Multi-Loss Fusion Network + + +
+ In this work we investigate the optimal selection and fusion of features +across multiple modalities and combine these in a neural network to improve +emotion detection. We compare different fusion methods and examine the impact +of multi-loss training within the multi-modality fusion network, identifying +useful findings relating to subnet performance. Our best model achieves +state-of-the-art performance for three datasets (CMU-MOSI, CMU-MOSEI and +CH-SIMS), and outperforms the other methods in most metrics. We have found that +training on multimodal features improves single modality testing and designing +fusion methods based on dataset annotation schema enhances model performance. +These results suggest a roadmap towards an optimized feature selection and +fusion approach for enhancing emotion detection in neural networks. + +
+
+ comment: First two authors contributed equally to the paper +
+
+
+
+
+ + ♻ ☆ Diffusion on the Probability Simplex + + +
+ Diffusion models learn to reverse the progressive noising of a data
+distribution to create a generative model. However, the desired continuous
+nature of the noising process can be at odds with discrete data. To deal with
+this tension between continuous and discrete objects, we propose a method of
+performing diffusion on the probability simplex. Using the probability simplex
+naturally creates an interpretation where points correspond to categorical
+probability distributions. Our method uses the softmax function applied to an
+Ornstein-Uhlenbeck process, a well-known stochastic differential equation. We
+find that our methodology also naturally extends to include diffusion on the
+unit cube, which has applications for bounded image generation.
+
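A tiny numerical sketch of the construction described above: simulate an Ornstein-Uhlenbeck process in an unconstrained space and push it through softmax so every intermediate state lies on the probability simplex. The drift, noise scale, and step size are arbitrary illustrative choices, and only the forward noising direction is shown.

import numpy as np

def softmax(x):
    z = x - x.max(axis=-1, keepdims=True)
    e = np.exp(z)
    return e / e.sum(axis=-1, keepdims=True)

def ou_forward_noising(x0, theta=1.0, sigma=0.5, dt=0.01, n_steps=200, seed=0):
    # Euler-Maruyama simulation of an Ornstein-Uhlenbeck process in logit space;
    # applying softmax keeps every intermediate state on the simplex.
    rng = np.random.default_rng(seed)
    x = x0.copy()
    path = [softmax(x)]
    for _ in range(n_steps):
        x += -theta * x * dt + sigma * np.sqrt(dt) * rng.standard_normal(x.shape)
        path.append(softmax(x))
    return np.stack(path)

# a near-one-hot "categorical" data point noised toward the centre of the simplex
x0 = np.log(np.array([0.97, 0.01, 0.01, 0.01]))
print(ou_forward_noising(x0)[-1])        # a noisy point near the uniform distribution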
+
+
+
+
+ + ♻ ☆ Multi-granularity Time-based Transformer for Knowledge Tracing + + +
+ In this paper, we present a transformer architecture for predicting student
+performance on standardized tests. Specifically, we leverage students'
+historical data, including their past test scores, study habits, and other
+relevant information, to create a personalized model for each student. We then
+use these models to predict their future performance on a given test. Applying
+this model to the RIIID dataset, we demonstrate that using multiple
+granularities for temporal features as the decoder input significantly improves
+model performance. Our results also show the effectiveness of our approach,
+with substantial improvements over the LightGBM method. Our work contributes to
+the growing field of AI in education, providing a scalable and accurate tool
+for predicting student outcomes.
+
+
+
+
+
+ + ♻ ☆ Temporal Dynamic Synchronous Functional Brain Network for Schizophrenia + Diagnosis and Lateralization Analysis + + +
+ The available evidence suggests that dynamic functional connectivity (dFC)
+can capture time-varying abnormalities in brain activity in resting-state
+cerebral functional magnetic resonance imaging (rs-fMRI) data and has a natural
+advantage in uncovering mechanisms of abnormal brain activity in
+schizophrenia (SZ) patients. Hence, an advanced dynamic brain network analysis
+model called the temporal brain category graph convolutional network
+(Temporal-BCGCN) was employed. Firstly, a unique dynamic brain network analysis
+module, DSF-BrainNet, was designed to construct dynamic synchronization
+features. Subsequently, a revolutionary graph convolution method, TemporalConv,
+was proposed, based on the synchronous temporal properties of features.
+Finally, the first modular abnormal hemispherical lateralization test tool in
+deep learning based on rs-fMRI data, named CategoryPool, was proposed. This
+study was validated on the COBRE and UCLA datasets and achieved 83.62% and
+89.71% average accuracies, respectively, outperforming the baseline model and
+other state-of-the-art methods. The ablation results also demonstrate the
+advantages of TemporalConv over the traditional edge feature graph convolution
+approach and the improvement of CategoryPool over the classical graph pooling
+approach. Interestingly, this study showed that the lower order perceptual
+system and higher order network regions in the left hemisphere are more
+severely dysfunctional than in the right hemisphere in SZ, and it reaffirms the
+importance of the left medial superior frontal gyrus in SZ. Our core code is
+available at: https://github.com/swfen/Temporal-BCGCN.
+
+
+
+
+
+ + ♻ ☆ A unified framework for dataset shift diagnostics + + +
+ Supervised learning techniques typically assume training data originates from +the target population. Yet, in reality, dataset shift frequently arises, which, +if not adequately taken into account, may decrease the performance of their +predictors. In this work, we propose a novel and flexible framework called +DetectShift that quantifies and tests for multiple dataset shifts, encompassing +shifts in the distributions of $(X, Y)$, $X$, $Y$, $X|Y$, and $Y|X$. +DetectShift equips practitioners with insights into data shifts, facilitating +the adaptation or retraining of predictors using both source and target data. +This proves extremely valuable when labeled samples in the target domain are +limited. The framework utilizes test statistics with the same nature to +quantify the magnitude of the various shifts, making results more +interpretable. It is versatile, suitable for regression and classification +tasks, and accommodates diverse data forms - tabular, text, or image. +Experimental results demonstrate the effectiveness of DetectShift in detecting +dataset shifts even in higher dimensions. + +
+
+
+
+
+ + ♻ ☆ Distributed Out-of-Memory NMF on CPU/GPU Architectures + + +
+ We propose an efficient distributed out-of-memory implementation of the +Non-negative Matrix Factorization (NMF) algorithm for heterogeneous +high-performance-computing (HPC) systems. The proposed implementation is based +on prior work on NMFk, which can perform automatic model selection and extract +latent variables and patterns from data. In this work, we extend NMFk by adding +support for dense and sparse matrix operation on multi-node, multi-GPU systems. +The resulting algorithm is optimized for out-of-memory (OOM) problems where the +memory required to factorize a given matrix is greater than the available GPU +memory. Memory complexity is reduced by batching/tiling strategies, and sparse +and dense matrix operations are significantly accelerated with GPU cores (or +tensor cores when available). Input/Output (I/O) latency associated with batch +copies between host and device is hidden using CUDA streams to overlap data +transfers and compute asynchronously, and latency associated with collective +communications (both intra-node and inter-node) is reduced using optimized +NVIDIA Collective Communication Library NCCL based communicators. Benchmark +results show significant improvement, from 32X to 76x speedup, with the new +implementation using GPUs over the CPU-based NMFk. Good weak scaling was +demonstrated on up to 4096 multi-GPU cluster nodes with approximately 25,000 +GPUs when decomposing a dense 340 Terabyte-size matrix and an 11 Exabyte-size +sparse matrix of density 10e-6. + +
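To make the batching/tiling idea above concrete, here is a plain multiplicative-update NMF in NumPy whose W update streams over row tiles of X, which is the basic intuition behind out-of-memory factorization. The batch size and iteration count are arbitrary, and the real NMFk implementation additionally handles sparsity, GPU/tensor cores, CUDA streams, and NCCL communication, none of which is shown here.

import numpy as np

def nmf_multiplicative(X, k, n_iter=100, batch=512, eps=1e-9, seed=0):
    rng = np.random.default_rng(seed)
    m, n = X.shape
    W = rng.random((m, k))
    H = rng.random((k, n))
    for _ in range(n_iter):
        # H update: H <- H * (W^T X) / (W^T W H)
        H *= (W.T @ X) / (W.T @ W @ H + eps)
        # W update processed one row tile at a time, so only a slab of X is touched
        HHt = H @ H.T
        for s in range(0, m, batch):
            Xb = X[s:s + batch]
            W[s:s + batch] *= (Xb @ H.T) / (W[s:s + batch] @ HHt + eps)
    return W, H

X = np.abs(np.random.default_rng(1).normal(size=(2000, 60)))
W, H = nmf_multiplicative(X, k=5, n_iter=50)
print(np.linalg.norm(X - W @ H) / np.linalg.norm(X))   # relative reconstruction error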
+
+ comment: Accepted at Journal of Supercomputing +
+
+
+
+
+ + ♻ ☆ Data centers with quantum random access memory and quantum networks + + +
+ In this paper, we propose the Quantum Data Center (QDC), an architecture +combining Quantum Random Access Memory (QRAM) and quantum networks. We give a +precise definition of QDC, and discuss its possible realizations and +extensions. We discuss applications of QDC in quantum computation, quantum +communication, and quantum sensing, with a primary focus on QDC for $T$-gate +resources, QDC for multi-party private quantum communication, and QDC for +distributed sensing through data compression. We show that QDC will provide +efficient, private, and fast services as a future version of data centers. + +
+
+ comment: 23 pages, many figures +
+
+
+
+
+ + ♻ ☆ CTRL: Clustering Training Losses for Label Error Detection + + +
+ In supervised machine learning, use of correct labels is extremely important +to ensure high accuracy. Unfortunately, most datasets contain corrupted labels. +Machine learning models trained on such datasets do not generalize well. Thus, +detecting their label errors can significantly increase their efficacy. We +propose a novel framework, called CTRL (Clustering TRaining Losses for label +error detection), to detect label errors in multi-class datasets. It detects +label errors in two steps based on the observation that models learn clean and +noisy labels in different ways. First, we train a neural network using the +noisy training dataset and obtain the loss curve for each sample. Then, we +apply clustering algorithms to the training losses to group samples into two +categories: cleanly-labeled and noisily-labeled. After label error detection, +we remove samples with noisy labels and retrain the model. Our experimental +results demonstrate state-of-the-art error detection accuracy on both image +(CIFAR-10 and CIFAR-100) and tabular datasets under simulated noise. We also +use a theoretical analysis to provide insights into why CTRL performs so well. + +
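The abstract describes the detection step in prose; the following is a minimal sketch of clustering per-sample training-loss curves into a "clean" and a "noisy" group with a two-cluster k-means. The synthetic loss curves and the specific clustering choice are illustrative assumptions; CTRL's actual clustering setup may differ.

import numpy as np
from sklearn.cluster import KMeans

def flag_label_errors(per_sample_losses):
    # per_sample_losses: (n_samples, n_epochs) array of each sample's training-loss curve.
    km = KMeans(n_clusters=2, n_init=10, random_state=0).fit(per_sample_losses)
    # the cluster with the larger mean loss is treated as "noisily labeled"
    noisy_cluster = np.argmax([per_sample_losses[km.labels_ == c].mean() for c in (0, 1)])
    return km.labels_ == noisy_cluster

losses = np.vstack([np.random.rand(900, 10) * 0.2,        # clean samples: low losses
                    np.random.rand(100, 10) * 0.2 + 0.8])  # noisy samples: high losses
print(flag_label_errors(losses).sum())                     # roughly 100 flagged samples

After flagging, the noisily-labeled samples would be removed and the model retrained, as the abstract describes.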
+
+
+
+
+ + ♻ ☆ Elucidating the Exposure Bias in Diffusion Models + + +
+ Diffusion models have demonstrated impressive generative capabilities, but +their 'exposure bias' problem, described as the input mismatch between training +and sampling, lacks in-depth exploration. In this paper, we systematically +investigate the exposure bias problem in diffusion models by first analytically +modelling the sampling distribution, based on which we then attribute the +prediction error at each sampling step as the root cause of the exposure bias +issue. Furthermore, we discuss potential solutions to this issue and propose an +intuitive metric for it. Along with the elucidation of exposure bias, we +propose a simple, yet effective, training-free method called Epsilon Scaling to +alleviate the exposure bias. We show that Epsilon Scaling explicitly moves the +sampling trajectory closer to the vector field learned in the training phase by +scaling down the network output (Epsilon), mitigating the input mismatch +between training and sampling. Experiments on various diffusion frameworks +(ADM, DDPM/DDIM, EDM, LDM), unconditional and conditional settings, and +deterministic vs. stochastic sampling verify the effectiveness of our method. +The code is available at https://github.com/forever208/ADM-ES; +https://github.com/forever208/EDM-ES + +
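For illustration, the sketch below applies the Epsilon Scaling idea inside one ancestral DDPM sampling step: the network's noise prediction is divided by a constant slightly above 1 before being used. The schedule, the scaling value, and the dummy model are assumptions for the example; the paper tunes the scaling per framework and sampler.

import torch

@torch.no_grad()
def ddpm_step_epsilon_scaling(model, x_t, t, alphas_cumprod, betas, scale=1.005):
    eps = model(x_t, t) / scale                        # scaled-down predicted noise
    a_bar_t = alphas_cumprod[t]
    beta_t = betas[t]
    mean = (x_t - beta_t / torch.sqrt(1.0 - a_bar_t) * eps) / torch.sqrt(1.0 - beta_t)
    noise = torch.randn_like(x_t) if t > 0 else torch.zeros_like(x_t)
    return mean + torch.sqrt(beta_t) * noise

# toy usage with a dummy noise predictor
T = 1000
betas = torch.linspace(1e-4, 0.02, T)
alphas_cumprod = torch.cumprod(1.0 - betas, dim=0)
dummy_model = lambda x, t: torch.zeros_like(x)
x = torch.randn(1, 3, 8, 8)
x = ddpm_step_epsilon_scaling(dummy_model, x, T - 1, alphas_cumprod, betas)
print(x.shape)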
+
+ comment: 8 pages +
+
+
+
+
+ + ♻ ☆ Learning Energy-Based Models by Cooperative Diffusion Recovery + Likelihood + + +
+ Training energy-based models (EBMs) with maximum likelihood estimation on
+high-dimensional data can be both challenging and time-consuming. As a result,
+there is a noticeable gap in sample quality between EBMs and other generative
+frameworks like GANs and diffusion models. To close this gap, inspired by the
+recent efforts of learning EBMs by maximizing diffusion recovery likelihood
+(DRL), we propose cooperative diffusion recovery likelihood (CDRL), an
+effective approach to tractably learn and sample from a series of EBMs defined
+on increasingly noisy versions of a dataset, paired with an initializer model
+for each EBM. At each noise level, the initializer model learns to amortize the
+sampling process of the EBM, and the two models are jointly estimated within a
+cooperative training framework. Samples from the initializer serve as starting
+points that are refined by a few sampling steps from the EBM. With the refined
+samples, the EBM is optimized by maximizing recovery likelihood, while the
+initializer is optimized by learning from the difference between the refined
+samples and the initial samples. We develop a new noise schedule and a variance
+reduction technique to further improve the sample quality. Combining these
+advances, we significantly improve the FID scores compared to existing EBM
+methods on CIFAR-10 and ImageNet 32x32, with a 2x speedup over DRL. In
+addition, we extend our method to compositional generation and image inpainting
+tasks, and showcase the compatibility of CDRL with classifier-free guidance for
+conditional generation, achieving similar trade-offs between sample quality and
+sample diversity as in diffusion models.
+
+
+
+
+
+ + ♻ ☆ Memory Injections: Correcting Multi-Hop Reasoning Failures during + Inference in Transformer-Based Language Models + + +
+ Answering multi-hop reasoning questions requires retrieving and synthesizing +information from diverse sources. Large Language Models (LLMs) struggle to +perform such reasoning consistently. Here we propose an approach to pinpoint +and rectify multi-hop reasoning failures through targeted memory injections on +LLM attention heads. First, we analyze the per-layer activations of GPT-2 +models in response to single and multi-hop prompts. We then propose a mechanism +that allows users to inject pertinent prompt-specific information, which we +refer to as "memories," at critical LLM locations during inference. By thus +enabling the LLM to incorporate additional relevant information during +inference, we enhance the quality of multi-hop prompt completions. We show +empirically that a simple, efficient, and targeted memory injection into a key +attention layer can often increase the probability of the desired next token in +multi-hop tasks, by up to 424%. + +
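Since the abstract only describes the mechanism, here is a heavily simplified sketch of injecting a "memory" into a GPT-2 layer at inference time with a forward hook. The choice of layer, the scaling factor, and the way the memory vector is built from a token embedding are all illustrative assumptions; the paper targets specific attention heads and memory content far more carefully, so this snippet only shows the plumbing.

import torch
from transformers import GPT2LMHeadModel, GPT2Tokenizer

tok = GPT2Tokenizer.from_pretrained("gpt2")
model = GPT2LMHeadModel.from_pretrained("gpt2").eval()

memory_ids = tok(" Paris", return_tensors="pt").input_ids
memory_vec = model.transformer.wte(memory_ids).mean(dim=1)   # (1, hidden)

def inject(module, inputs, output):
    hidden = output[0]                   # GPT-2 blocks return a tuple
    hidden = hidden + 4.0 * memory_vec   # add the memory at every position
    return (hidden,) + output[1:]

handle = model.transformer.h[6].register_forward_hook(inject)
prompt = "The capital of the country where the Eiffel Tower is located is"
with torch.no_grad():
    out = model.generate(**tok(prompt, return_tensors="pt"), max_new_tokens=5)
print(tok.decode(out[0]))
handle.remove()

Whether this particular injection improves the multi-hop completion is not guaranteed; the point is only where and how a memory can be added during inference.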
+
+
+
+
+ + ♻ ☆ MultiWay-Adapter: Adapting large-scale multi-modal models for scalable + image-text retrieval + + +
+ As the size of Large Multi-Modal Models (LMMs) increases consistently, the +adaptation of these pre-trained models to specialized tasks has become a +computationally and memory-intensive challenge. Traditional fine-tuning methods +require isolated, exhaustive retuning for each new task, limiting the models' +versatility. Moreover, current efficient adaptation techniques often overlook +modality alignment, focusing only on the knowledge extraction of new tasks. To +tackle these issues, we introduce Multiway-Adapter, an innovative framework +incorporating an 'Alignment Enhancer' to deepen modality alignment, enabling +high transferability without tuning pre-trained parameters. Our method adds +fewer than 1.25\% of additional parameters to LMMs, exemplified by the BEiT-3 +model in our study. This leads to superior zero-shot image-text retrieval +performance compared to fully fine-tuned models, while achieving up to a 57\% +reduction in fine-tuning time. Our approach offers a resource-efficient and +effective adaptation pathway for LMMs, broadening their applicability. The +source code is publicly available at: +\url{https://github.com/longkukuhi/MultiWay-Adapter}. + +
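The adapter module itself is not specified in the abstract; the generic residual bottleneck adapter below shows the kind of small trainable block such methods insert while the backbone stays frozen. The reduction factor, activation, and stand-in backbone layer are assumptions, and the Multiway-Adapter's "Alignment Enhancer" between modalities is not shown.

import torch
import torch.nn as nn

class BottleneckAdapter(nn.Module):
    # Down-project, non-linearity, up-project, add back to the frozen feature.
    def __init__(self, dim, reduction=8):
        super().__init__()
        self.down = nn.Linear(dim, dim // reduction)
        self.up = nn.Linear(dim // reduction, dim)
        self.act = nn.GELU()

    def forward(self, x):
        return x + self.up(self.act(self.down(x)))

# only adapter parameters would be trained; the backbone stays frozen
backbone = nn.TransformerEncoderLayer(d_model=768, nhead=12, batch_first=True)
for p in backbone.parameters():
    p.requires_grad = False
adapter = BottleneckAdapter(dim=768)
tokens = torch.randn(2, 16, 768)
out = adapter(backbone(tokens))
print(sum(p.numel() for p in adapter.parameters() if p.requires_grad))  # small parameter count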
+
+
+
+
+ + ♻ ☆ On Penalty-based Bilevel Gradient Descent Method + + +
+ Bilevel optimization enjoys a wide range of applications in hyper-parameter +optimization, meta-learning and reinforcement learning. However, bilevel +optimization problems are difficult to solve. Recent progress on scalable +bilevel algorithms mainly focuses on bilevel optimization problems where the +lower-level objective is either strongly convex or unconstrained. In this work, +we tackle the bilevel problem through the lens of the penalty method. We show +that under certain conditions, the penalty reformulation recovers the solutions +of the original bilevel problem. Further, we propose the penalty-based bilevel +gradient descent (PBGD) algorithm and establish its finite-time convergence for +the constrained bilevel problem without lower-level strong convexity. +Experiments showcase the efficiency of the proposed PBGD algorithm. + +
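To make the penalty idea concrete, here is a toy instance in which the lower-level problem has a known minimum value of zero, so the penalized objective reduces to f(x, y) + gamma * g(x, y) and plain gradient descent drives (x, y) toward the bilevel solution as gamma grows. The problem, step size, and gamma are illustrative; PBGD itself also estimates the lower-level value function with inner gradient steps, which is omitted here.

import torch

# Toy bilevel problem:
#   upper level: f(x, y) = (x - 3)^2 + (y - 2)^2
#   lower level: g(x, y) = (y - x)^2, whose minimum over y is 0 (at y = x),
# so the penalty reformulation minimized below is f(x, y) + gamma * g(x, y).
f = lambda x, y: (x - 3) ** 2 + (y - 2) ** 2
g = lambda x, y: (y - x) ** 2

x = torch.tensor(0.0, requires_grad=True)
y = torch.tensor(0.0, requires_grad=True)
opt = torch.optim.SGD([x, y], lr=0.02)
gamma = 10.0                      # larger gamma enforces the lower-level problem more tightly
for _ in range(2000):
    opt.zero_grad()
    (f(x, y) + gamma * g(x, y)).backward()
    opt.step()
print(x.item(), y.item())         # roughly 2.52 and 2.48, approaching the bilevel optimum x = y = 2.5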
+
+ comment: Improved Section 4 by removing a critical assumption; Added Section 5 + and citations +
+
+
+
+
+ + ♻ ☆ Adversaries with Limited Information in the Friedkin--Johnsen Model KDD'23 + + +
+ In recent years, online social networks have been the target of adversaries +who seek to introduce discord into societies, to undermine democracies and to +destabilize communities. Often the goal is not to favor a certain side of a +conflict but to increase disagreement and polarization. To get a mathematical +understanding of such attacks, researchers use opinion-formation models from +sociology, such as the Friedkin--Johnsen model, and formally study how much +discord the adversary can produce when altering the opinions for only a small +set of users. In this line of work, it is commonly assumed that the adversary +has full knowledge about the network topology and the opinions of all users. +However, the latter assumption is often unrealistic in practice, where user +opinions are not available or simply difficult to estimate accurately. + To address this concern, we raise the following question: Can an attacker sow +discord in a social network, even when only the network topology is known? We +answer this question affirmatively. We present approximation algorithms for +detecting a small set of users who are highly influential for the disagreement +and polarization in the network. We show that when the adversary radicalizes +these users and if the initial disagreement/polarization in the network is not +very high, then our method gives a constant-factor approximation on the setting +when the user opinions are known. To find the set of influential users, we +provide a novel approximation algorithm for a variant of MaxCut in graphs with +positive and negative edge weights. We experimentally evaluate our methods, +which have access only to the network topology, and we find that they have +similar performance as methods that have access to the network topology and all +user opinions. We further present an NP-hardness proof, which was an open +question by Chen and Racz [IEEE Trans. Netw. Sci. Eng., 2021]. + +
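For readers unfamiliar with the model, the small sketch below computes Friedkin-Johnsen equilibrium opinions and the disagreement that an adversary tries to increase; the toy graph and innate opinions are made up, and the paper's actual contribution, choosing which users to radicalize from topology alone, is not reproduced here.

import numpy as np

def fj_equilibrium(adjacency, innate_opinions):
    # Friedkin-Johnsen equilibrium opinions z = (I + L)^{-1} s,
    # where L is the graph Laplacian and s the innate opinions.
    L = np.diag(adjacency.sum(axis=1)) - adjacency
    return np.linalg.solve(np.eye(len(innate_opinions)) + L, innate_opinions)

def disagreement(adjacency, z):
    # Sum of w_ij (z_i - z_j)^2 over edges, i.e. z^T L z.
    L = np.diag(adjacency.sum(axis=1)) - adjacency
    return float(z @ L @ z)

# toy network: a path of four users; the adversary radicalizes user 0
A = np.array([[0, 1, 0, 0],
              [1, 0, 1, 0],
              [0, 1, 0, 1],
              [0, 0, 1, 0]], dtype=float)
s = np.array([0.5, 0.5, 0.5, 0.5])
s_attacked = s.copy(); s_attacked[0] = 1.0
print(disagreement(A, fj_equilibrium(A, s)))           # approximately 0 (consensus)
print(disagreement(A, fj_equilibrium(A, s_attacked)))  # strictly larger after the attack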
+
+ comment: KDD'23 +
+
+
+
+
+ + ♻ ☆ Primal-Dual Contextual Bayesian Optimization for Control System Online + Optimization with Time-Average Constraints + + +
+ This paper studies the problem of online performance optimization of +constrained closed-loop control systems, where both the objective and the +constraints are unknown black-box functions affected by exogenous time-varying +contextual disturbances. A primal-dual contextual Bayesian optimization +algorithm is proposed that achieves sublinear cumulative regret with respect to +the dynamic optimal solution under certain regularity conditions. Furthermore, +the algorithm achieves zero time-average constraint violation, ensuring that +the average value of the constraint function satisfies the desired constraint. +The method is applied to both sampled instances from Gaussian processes and a +continuous stirred tank reactor parameter tuning problem; simulation results +show that the method simultaneously provides close-to-optimal performance and +maintains constraint feasibility on average. This contrasts current +state-of-the-art methods, which either suffer from large cumulative regret or +severe constraint violations for the case studies presented. + +
+
+
+
+
+ + ♻ ☆ Solar Coronal Hole Analysis and Prediction using Computer Vision and + LSTM Neural Network + + +
+ As humanity has begun to explore space, the significance of space weather has
+become apparent. It has been established that coronal holes, a type of space
+weather phenomenon, can impact the operation of aircraft and satellites. A
+coronal hole is an area on the Sun characterized by open magnetic field lines
+and relatively low temperatures, which result in the emission of the solar wind
+at higher than average rates. In this study, to prepare for the impact of
+coronal holes on the Earth, we use computer vision to detect the coronal hole
+region and calculate its size based on images from the Solar Dynamics
+Observatory (SDO). We compare the coronal holes for each region of the Sun and
+analyze the correlation. We then implement deep learning techniques,
+specifically the Long Short-Term Memory (LSTM) method, to analyze trends in the
+coronal hole area data and predict its size for different regions of the Sun
+over 7 days. By analyzing time series data on the coronal hole area, this study
+aims to identify patterns and trends in coronal hole behavior and understand
+how they may impact space weather events. This research represents an important
+step towards improving our ability to predict and prepare for space weather
+events that can affect Earth and technological systems.
+
+
+ comment: This is old technology +
+
+
+
+
+ + ♻ ☆ Sign and Relevance Learning + + +
+ Standard models of biologically realistic or biologically inspired +reinforcement learning employ a global error signal, which implies the use of +shallow networks. On the other hand, error backpropagation allows the use of +networks with multiple layers. However, precise error backpropagation is +difficult to justify in biologically realistic networks because it requires +precise weighted error backpropagation from layer to layer. In this study, we +introduce a novel network that solves this problem by propagating only the sign +of the plasticity change (i.e., LTP/LTD) throughout the whole network, while +neuromodulation controls the learning rate. Neuromodulation can be understood +as a rectified error or relevance signal, while the top-down sign of the error +signal determines whether long-term potentiation or long-term depression will +occur. To demonstrate the effectiveness of this approach, we conducted a real +robotic task as proof of concept. Our results show that this paradigm can +successfully perform complex tasks using a biologically plausible learning +mechanism. + +
+
+ comment: 14 pages, 15 figures +
+
+
+
+
+
+
+
+ + Multimedia 9 + +
+
+
+ + ☆ Fg-T2M: Fine-Grained Text-Driven Human Motion Generation via Diffusion + Model + + +
+ Text-driven human motion generation in computer vision is both significant
+and challenging. However, current methods are limited to producing either
+deterministic or imprecise motion sequences, failing to effectively control the
+temporal and spatial relationships required to conform to a given text
+description. In this work, we propose a fine-grained method for generating
+high-quality, conditional human motion sequences supporting precise text
+descriptions. Our approach consists of two key components: 1) a
+linguistics-structure assisted module that constructs accurate and complete
+language features to fully utilize text information; and 2) a context-aware
+progressive reasoning module that learns neighborhood and overall semantic
+linguistics features from shallow and deep graph neural networks to achieve
+multi-step inference. Experiments show that our approach outperforms
+text-driven motion generation methods on the HumanML3D and KIT test sets and
+generates motion that conforms better to the text conditions, as confirmed
+visually.
+
+
+
+
+
+ + ☆ Enhancing Multi-modal Cooperation via Fine-grained Modality Valuation + + +
+ One primary topic of multi-modal learning is to jointly incorporate
+heterogeneous information from different modalities. However, most models often
+suffer from unsatisfactory multi-modal cooperation, meaning they cannot jointly
+utilize all modalities well. Some methods have been proposed to identify and
+enhance the worse-learnt modality, but they rarely provide fine-grained,
+sample-level observation of multi-modal cooperation with theoretical support.
+Hence, it is essential to reasonably observe and improve the fine-grained
+cooperation between modalities, especially when facing realistic scenarios
+where the modality discrepancy could vary across different samples. To this
+end, we introduce a fine-grained modality valuation metric to evaluate the
+contribution of each modality at the sample level. Via modality valuation, we
+observe, regrettably, that the multi-modal model tends to rely on one specific
+modality, resulting in other modalities being low-contributing. We further
+analyze this issue and improve cooperation between modalities by enhancing the
+discriminative ability of low-contributing modalities in a targeted manner.
+Overall, our methods reasonably observe the fine-grained uni-modal contribution
+at the sample level and achieve considerable improvement on different
+multi-modal models.
+
+
+ comment: 7 pages +
+
+
+
+
+ + ☆ Dual-Path Temporal Map Optimization for Make-up Temporal Video Grounding + + +
+ Make-up temporal video grounding (MTVG) aims to localize the target video
+segment which is semantically related to a sentence describing a make-up
+activity, given a long video. Compared with the general video grounding task,
+MTVG focuses on meticulous actions and changes on the face. The make-up
+instruction step, usually involving detailed differences in products and facial
+areas, is more fine-grained than general activities (e.g., cooking and
+furniture assembly). Thus, existing general approaches cannot locate the target
+activity effectively. More specifically, existing proposal generation modules
+are not yet fully developed in providing semantic cues for the more
+fine-grained make-up semantic comprehension. To tackle this issue, we propose
+an effective proposal-based framework named Dual-Path Temporal Map Optimization
+Network (DPTMO) to capture fine-grained multimodal semantic details of make-up
+activities. DPTMO extracts both query-agnostic and query-guided features to
+construct two proposal sets and uses specific evaluation methods for the two
+sets. Different from the commonly used single structure in previous methods,
+our dual-path structure can mine more semantic information in make-up videos
+and distinguish fine-grained actions well. These two candidate sets represent
+the cross-modal makeup video-text similarity and the multi-modal fusion
+relationship, complementing each other. Each set corresponds to its respective
+optimization perspective, and their joint prediction enhances the accuracy of
+video timestamp prediction. Comprehensive experiments on the YouMakeup dataset
+demonstrate that our proposed dual-path structure excels in fine-grained
+semantic comprehension.
+
+
+
+
+
+ + ☆ Language Models as Black-Box Optimizers for Vision-Language Models + + +
+ Vision-language models (VLMs) pre-trained on web-scale datasets have +demonstrated remarkable capabilities across a variety of vision and multimodal +tasks. Currently, fine-tuning methods for VLMs mainly operate in a white-box +setting, requiring access to model parameters for backpropagation. However, +many VLMs rely on proprietary data and are not open-source, which restricts the +use of white-box approaches for fine-tuning. Given that popular private large +language models (LLMs) like ChatGPT still offer a language-based user +interface, we aim to develop a novel fine-tuning approach for VLMs through +natural language prompts, thereby avoiding the need to access model parameters, +feature embeddings, or output logits. In this setup, we propose employing +chat-based LLMs as black-box optimizers to search for the best text prompt on +the illustrative task of few-shot image classification using CLIP. +Specifically, we adopt an automatic "hill-climbing" procedure that converges on +an effective prompt by evaluating the accuracy of current prompts and asking +LLMs to refine them based on textual feedback, all within a conversational +process without human-in-the-loop. In a challenging 1-shot learning setup, our +simple approach surpasses the white-box continuous prompting method CoOp by an +average of 1.5% across 11 datasets including ImageNet. Our approach also +outperforms OpenAI's manually crafted prompts and is more efficient than other +black-box methods like iterative APE. Additionally, we highlight the advantage +of conversational feedback incorporating both positive and negative prompts, +suggesting that LLMs can utilize the implicit "gradient" direction in textual +feedback for a more efficient search. Lastly, we find that the text prompts +generated through our strategy are not only more interpretable but also +transfer well across different CLIP architectures in a black-box manner. + +
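The abstract describes the optimization loop in prose; the skeleton below only shows the hill-climbing structure, with two stand-in functions that are entirely hypothetical: evaluate_prompt would score a candidate by few-shot CLIP accuracy, and ask_llm_to_refine would send the scored history to a chat LLM and parse the next candidate. Neither placeholder reflects the paper's actual prompts or APIs.

import random

def evaluate_prompt(prompt):            # placeholder scorer (would be CLIP few-shot accuracy)
    return -abs(len(prompt) - 40) + random.random()

def ask_llm_to_refine(history):         # placeholder for the chat-LLM call
    best, _ = max(history, key=lambda kv: kv[1])
    return best + " detailed"

def hill_climb(seed_prompt, n_rounds=10):
    history = [(seed_prompt, evaluate_prompt(seed_prompt))]
    for _ in range(n_rounds):
        candidate = ask_llm_to_refine(history)        # the LLM proposes the next prompt
        history.append((candidate, evaluate_prompt(candidate)))
    return max(history, key=lambda kv: kv[1])

print(hill_climb("a photo of a {class}"))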
+
+
+
+
+ + ☆ DF-TransFusion: Multimodal Deepfake Detection via Lip-Audio + Cross-Attention and Facial Self-Attention + + +
+ With the rise in manipulated media, deepfake detection has become an +imperative task for preserving the authenticity of digital content. In this +paper, we present a novel multi-modal audio-video framework designed to +concurrently process audio and video inputs for deepfake detection tasks. Our +model capitalizes on lip synchronization with input audio through a +cross-attention mechanism while extracting visual cues via a fine-tuned VGG-16 +network. Subsequently, a transformer encoder network is employed to perform +facial self-attention. We conduct multiple ablation studies highlighting +different strengths of our approach. Our multi-modal methodology outperforms +state-of-the-art multi-modal deepfake detection techniques in terms of F-1 and +per-video AUC scores. + +
+
+
+
+
+ + ♻ ☆ Leveraging Automatic Personalised Nutrition: Food Image Recognition + Benchmark and Dataset based on Nutrition Taxonomy + + +
+ Leading a healthy lifestyle has become one of the most challenging goals in
+today's society due to our sedentary lifestyle and poor eating habits. As a
+result, national and international organizations have made numerous efforts to
+promote healthier diets and physical activity habits. However, these
+recommendations are sometimes difficult to follow in our daily life and they
+are also based on a general population. As a consequence, a new area of
+research, personalised nutrition, has been conceived, focusing on individual
+solutions through smart devices and Artificial Intelligence (AI) methods.
+ This study presents the AI4Food-NutritionDB database, the first nutrition
+database that considers food images and a nutrition taxonomy based on
+recommendations by national and international organizations. In addition, four
+different categorisation levels are considered following nutrition experts: 6
+nutritional levels, 19 main categories (e.g., "Meat"), 73 subcategories (e.g.,
+"White Meat"), and 893 final food products (e.g., "Chicken"). The
+AI4Food-NutritionDB opens the doors to new food computing approaches in terms
+of food intake frequency, quality, and categorisation. In addition to the
+database, we propose a standard experimental protocol and benchmark including
+three tasks based on the nutrition taxonomy (i.e., category, subcategory, and
+final product) to be used by the research community. Finally, we also release
+our Deep Learning models trained with the AI4Food-NutritionDB, which can be
+used as pre-trained models, achieving accurate recognition results with
+challenging food image databases.
+
+
+ comment: 10 pages, 3 figures, 4 tables +
+
+
+
+
+ + ♻ ☆ Multi-Modality Multi-Loss Fusion Network + + +
+ In this work we investigate the optimal selection and fusion of features +across multiple modalities and combine these in a neural network to improve +emotion detection. We compare different fusion methods and examine the impact +of multi-loss training within the multi-modality fusion network, identifying +useful findings relating to subnet performance. Our best model achieves +state-of-the-art performance for three datasets (CMU-MOSI, CMU-MOSEI and +CH-SIMS), and outperforms the other methods in most metrics. We have found that +training on multimodal features improves single modality testing and designing +fusion methods based on dataset annotation schema enhances model performance. +These results suggest a roadmap towards an optimized feature selection and +fusion approach for enhancing emotion detection in neural networks. + +
+
+ comment: First two authors contributed equally to the paper +
+
+
+
+
+ + ♻ ☆ Temporal Dynamic Synchronous Functional Brain Network for Schizophrenia + Diagnosis and Lateralization Analysis + + +
+ The available evidence suggests that dynamic functional connectivity (dFC) +can capture time-varying abnormalities in brain activity in resting-state +cerebral functional magnetic resonance imaging (rs-fMRI) data and has a natural +advantage in uncovering mechanisms of abnormal brain activity in +schizophrenia (SZ) patients. Hence, an advanced dynamic brain network analysis +model called the temporal brain category graph convolutional network +(Temporal-BCGCN) was employed. Firstly, a unique dynamic brain network analysis +module, DSF-BrainNet, was designed to construct dynamic synchronization +features. Subsequently, a revolutionary graph convolution method, TemporalConv, +was proposed, based on the synchronous temporal properties of the features. Finally, +the first modular abnormal hemispherical lateralization test tool in deep +learning based on rs-fMRI data, named CategoryPool, was proposed. This study +was validated on the COBRE and UCLA datasets and achieved 83.62% and 89.71% average +accuracies, respectively, outperforming the baseline model and other +state-of-the-art methods. The ablation results also demonstrate the advantages +of TemporalConv over the traditional edge feature graph convolution approach +and the improvement of CategoryPool over the classical graph pooling approach. +Interestingly, this study showed that the lower-order perceptual system and +higher-order network regions in the left hemisphere are more severely +dysfunctional than in the right hemisphere in SZ, and reaffirms the importance +of the left medial superior frontal gyrus in SZ. Our core code is available at: +https://github.com/swfen/Temporal-BCGCN. +
+
+
+
+
+ ♻ ☆ MultiWay-Adapter: Adapting large-scale multi-modal models for scalable image-text retrieval
+ As the size of Large Multi-Modal Models (LMMs) increases consistently, the +adaptation of these pre-trained models to specialized tasks has become a +computationally and memory-intensive challenge. Traditional fine-tuning methods +require isolated, exhaustive retuning for each new task, limiting the models' +versatility. Moreover, current efficient adaptation techniques often overlook +modality alignment, focusing only on the knowledge extraction of new tasks. To +tackle these issues, we introduce Multiway-Adapter, an innovative framework +incorporating an 'Alignment Enhancer' to deepen modality alignment, enabling +high transferability without tuning pre-trained parameters. Our method adds +fewer than 1.25\% of additional parameters to LMMs, exemplified by the BEiT-3 +model in our study. This leads to superior zero-shot image-text retrieval +performance compared to fully fine-tuned models, while achieving up to a 57\% +reduction in fine-tuning time. Our approach offers a resource-efficient and +effective adaptation pathway for LMMs, broadening their applicability. The +source code is publicly available at: +\url{https://github.com/longkukuhi/MultiWay-Adapter}. + +
+
+
+
+
+
+
+
+
+ +
+
+
+ + Computation and Language 66 + +
+
+
+ + ☆ Hypothesis Search: Inductive Reasoning with Language Models + + +
+ Inductive reasoning is a core problem-solving capacity: humans can identify +underlying principles from a few examples, which can then be robustly +generalized to novel scenarios. Recent work has evaluated large language models +(LLMs) on inductive reasoning tasks by directly prompting them, yielding "in-context learning." This can work well for straightforward inductive tasks, but +performs very poorly on more complex tasks such as the Abstraction and +Reasoning Corpus (ARC). In this work, we propose to improve the inductive +reasoning ability of LLMs by generating explicit hypotheses at multiple levels +of abstraction: we prompt the LLM to propose multiple abstract hypotheses about +the problem, in natural language, then implement the natural language +hypotheses as concrete Python programs. These programs can be directly verified +by running on the observed examples and generalized to novel inputs. Because of +the prohibitive cost of generation with state-of-the-art LLMs, we consider a +middle step to filter the set of hypotheses that will be implemented into +programs: we either ask the LLM to summarize them into a smaller set of hypotheses, +or ask human annotators to select a subset of the hypotheses. We verify our +pipeline's effectiveness on the ARC visual inductive reasoning benchmark, its +variant 1D-ARC, and the string transformation dataset SyGuS. On a random 40-problem +subset of ARC, our automated pipeline using LLM summaries achieves 27.5% +accuracy, significantly outperforming the direct prompting baseline (accuracy +of 12.5%). With the minimal human input of selecting from LLM-generated +candidates, the performance is boosted to 37.5%. (We argue this is a lower +bound on the performance of our approach without filtering.) Our ablation +studies show that abstract hypothesis generation and concrete program +representations are both beneficial for LLMs to perform inductive reasoning +tasks. +
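A schematic of the propose-implement-verify loop described above is sketched here, assuming a hypothetical llm wrapper exposing propose_hypotheses, summarize, and implement_as_python; the exec-based verification mirrors the idea of running candidate Python programs on the observed examples, not the authors' exact pipeline.

```python
# Sketch of hypothesis search: natural-language hypotheses -> Python programs -> verification.
def solve_task(llm, train_pairs, test_input, n_hypotheses=8):
    hypotheses = llm.propose_hypotheses(train_pairs, n=n_hypotheses)  # natural language
    hypotheses = llm.summarize(hypotheses, keep=3)                    # optional filtering step
    for h in hypotheses:
        source = llm.implement_as_python(h)     # expected to define transform(x)
        namespace = {}
        try:
            exec(source, namespace)             # turn the hypothesis into a concrete program
            transform = namespace["transform"]
            # Keep the program only if it reproduces every observed example.
            if all(transform(x) == y for x, y in train_pairs):
                return transform(test_input)
        except Exception:
            continue                            # broken program: try the next hypothesis
    return None
```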
+
+
+
+
+ + ☆ MAmmoTH: Building Math Generalist Models through Hybrid Instruction + Tuning + + +
+ We introduce MAmmoTH, a series of open-source large language models (LLMs) +specifically tailored for general math problem-solving. The MAmmoTH models are +trained on MathInstruct, our meticulously curated instruction tuning dataset. +MathInstruct is compiled from 13 math datasets with intermediate rationales, +six of which have rationales newly curated by us. It presents a unique hybrid +of chain-of-thought (CoT) and program-of-thought (PoT) rationales, and also +ensures extensive coverage of diverse fields in math. The hybrid of CoT and PoT +not only unleashes the potential of tool use but also allows different thought +processes for different math problems. As a result, the MAmmoTH series +substantially outperform existing open-source models on nine mathematical +reasoning datasets across all scales with an average accuracy gain between 13% +and 29%. Remarkably, our MAmmoTH-7B model reaches 35% on MATH (a +competition-level dataset), which exceeds the best open-source 7B model +(WizardMath) by 25%, and the MAmmoTH-34B model achieves 46% accuracy on MATH, +even surpassing GPT-4's CoT result. Our work underscores the importance of +diverse problem coverage and the use of hybrid rationales in developing +superior math generalist models. + +
+
+ comment: Work in progress; Xiang Yue and Wenhu Chen contributed equally to + this paper +
+
+
+
+
+ + ☆ Effective Proxy for Human Labeling: Ensemble Disagreement Scores in + Large Language Models for Industrial NLP + + +
+ Large language models (LLMs) have demonstrated significant capability to +generalize across a large number of NLP tasks. For industry applications, it is +imperative to assess the performance of the LLM on unlabeled production data +from time to time to validate performance in a real-world setting. Human labeling to +assess model error requires considerable expense and time delay. Here we +demonstrate that ensemble disagreement scores work well as a proxy for human +labeling for language models in zero-shot, few-shot, and fine-tuned settings, +based on our evaluation on the keyphrase extraction (KPE) task. We measure fidelity of +the results by comparing to true error measured from human-labeled ground +truth. We contrast with the alternative of using another LLM as a source of +machine labels, or silver labels. Results across various languages and domains +show that disagreement scores provide a better estimation of model performance, with +mean average error (MAE) as low as 0.4% and on average 13.8% better than using +silver labels. +
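One simple way to instantiate an ensemble disagreement score for keyphrase extraction is sketched below; the exact scoring used in the paper is not given here, so this definition is an illustrative assumption.

```python
# Sketch: disagreement among ensemble members' keyphrase predictions for one document.
def disagreement_score(predictions):
    """predictions: list of keyphrase sets, one per ensemble member."""
    all_phrases = set().union(*predictions) if predictions else set()
    if not all_phrases:
        return 0.0
    per_phrase = []
    for phrase in all_phrases:
        agreement = sum(phrase in p for p in predictions) / len(predictions)
        # A phrase on which members split 50/50 contributes maximal disagreement.
        per_phrase.append(1.0 - abs(2 * agreement - 1.0))
    return sum(per_phrase) / len(per_phrase)

# Averaging this score over unlabeled production documents gives a proxy for model
# error, in place of human labels or LLM-generated "silver" labels.
```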
+
+
+
+
+ + ☆ Incorporating Pre-trained Model Prompting in Multimodal Stock Volume + Movement Prediction KDD + + +
+ Multimodal stock trading volume movement prediction with stock-related news +is one of the fundamental problems in the financial area. Existing multimodal +works that train models from scratch face the problem of lacking universal +knowledge when modeling financial news. In addition, the model's ability may be +limited by the lack of domain-related knowledge due to insufficient data in the +datasets. To handle this issue, we propose the Prompt-based MUltimodal Stock +volumE prediction model (ProMUSE) to process text and time series modalities. +We use pre-trained language models for better comprehension of financial news +and adopt prompt learning methods to leverage their capability in universal +knowledge to model textual information. Besides, simply fusing the two modalities +can harm the unimodal representations. Thus, we propose a novel +cross-modality contrastive alignment while reserving the unimodal heads beside +the fusion head to mitigate this problem. Extensive experiments demonstrate +that our proposed ProMUSE outperforms existing baselines. Comprehensive +analyses further validate the effectiveness of our architecture compared to +potential variants and learning mechanisms. +
+
+ comment: 9 pages, 3 figures, 7 tables. Accepted by 2023 KDD Workshop on + Machine Learning in Finance +
+
+
+
+
+ + ☆ Memory Injections: Correcting Multi-Hop Reasoning Failures during + Inference in Transformer-Based Language Models + + +
+ Answering multi-hop reasoning questions requires retrieving and synthesizing +information from diverse sources. Large Language Models (LLMs) struggle to +perform such reasoning consistently. Here we propose an approach to pinpoint +and rectify multi-hop reasoning failures through targeted memory injections on +LLM attention heads. First, we analyze the per-layer activations of GPT-2 +models in response to single and multi-hop prompts. We then propose a mechanism +that allows users to inject pertinent prompt-specific information, which we +refer to as "memories," at critical LLM locations during inference. By thus +enabling the LLM to incorporate additional relevant information during +inference, we enhance the quality of multi-hop prompt completions. We show +empirically that a simple, efficient, and targeted memory injection into a key +attention layer can often increase the probability of the desired next token in +multi-hop tasks, by up to 424%. + +
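The following sketch shows one way such a targeted injection could be wired up with a PyTorch forward pre-hook on a Hugging Face GPT-2 model; the layer index, scaling factor, and the use of mean token embeddings as the "memory" are placeholders, since the paper selects injection sites and values empirically.

```python
import torch

def add_memory_injection(model, tokenizer, memory_text, layer_idx=6, scale=4.0):
    """Inject a prompt-specific 'memory' vector into the hidden states entering one attention layer."""
    with torch.no_grad():
        ids = tokenizer(memory_text, return_tensors="pt").input_ids
        memory = model.transformer.wte(ids).mean(dim=1)          # (1, hidden): mean token embedding

    def hook(module, inputs):
        hidden = inputs[0]
        hidden = hidden + scale * memory.to(hidden.dtype)        # broadcast over batch and positions
        return (hidden,) + inputs[1:]

    # GPT-2 layout assumed: model.transformer.h[i].attn is the i-th attention block.
    handle = model.transformer.h[layer_idx].attn.register_forward_pre_hook(hook)
    return handle  # call handle.remove() to undo the injection
```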
+
+
+
+
+ + ☆ ITI-GEN: Inclusive Text-to-Image Generation ICCV 2023 + + +
+ Text-to-image generative models often reflect the biases of the training +data, leading to unequal representations of underrepresented groups. This study +investigates inclusive text-to-image generative models that generate images +based on human-written prompts and ensure the resulting images are uniformly +distributed across attributes of interest. Unfortunately, directly expressing +the desired attributes in the prompt often leads to sub-optimal results due to +linguistic ambiguity or model misrepresentation. Hence, this paper proposes a +drastically different approach that adheres to the maxim that "a picture is +worth a thousand words". We show that, for some attributes, images can +represent concepts more expressively than text. For instance, categories of +skin tones are typically hard to specify by text but can be easily represented +by example images. Building upon these insights, we propose a novel approach, +ITI-GEN, that leverages readily available reference images for Inclusive +Text-to-Image GENeration. The key idea is learning a set of prompt embeddings +to generate images that can effectively represent all desired attribute +categories. More importantly, ITI-GEN requires no model fine-tuning, making it +computationally efficient to augment existing text-to-image models. Extensive +experiments demonstrate that ITI-GEN largely improves over state-of-the-art +models to generate inclusive images from a prompt. Project page: +https://czhang0528.github.io/iti-gen. + +
+
+ comment: Accepted to ICCV 2023 (Oral Presentation) +
+
+
+
+
+ + ☆ An Empirical Study of NetOps Capability of Pre-Trained Large Language + Models + + +
+ Large language models (LLMs) can respond to human language queries and have +shown strong potential for applications in network operations (NetOps). Thanks to +the large amount of commonsense knowledge inherent in them, LLMs achieve much better +inference accuracy than traditional models and emerge with strong abilities in +generalization, reasoning, and code generation. These abilities may provide a +crucial boost to automated and intelligent NetOps. However, it remains +under-explored how well LLMs perform in various NetOps tasks. In this work, we +make a systematic assessment of the capabilities, strengths, and limitations of +selected LLMs in the field of NetOps. The evaluation is conducted on a +collection of 5,732 questions about NetOps, encompassing 26 publicly available +general-domain LLMs, including ChatGPT, LLaMA, Falcon, etc. We also finetune +some of these LLMs with our collected NetOps corpus and evaluate the resulting +models. The evaluation method follows the widely adopted benchmarks for +general-domain LLMs, combined with Chain-of-Thought Prompts and +Retrieval-Augmented Generation. The results show that only GPT-4 achieves high +accuracy equivalent to passing the NetOps certification exam for humans, while +all the other LLMs have much lower accuracy. However, some open models like +LLaMA 2 still demonstrate significant potential. Furthermore, we evaluate the +impact of factors such as model parameters, prompt engineering, and instruction +fine-tuning. This work should be treated as an initial effort toward the systematic +evaluation of LLMs in NetOps, and a more rigorous study is required for +production use. The evaluation code and dataset will be released to benefit +future research. +
+
+
+
+
+ + ☆ Kani: A Lightweight and Highly Hackable Framework for Building Language + Model Applications + + +
+ Language model applications are becoming increasingly popular and complex, +often including features like tool usage and retrieval augmentation. However, +existing frameworks for such applications are often opinionated, deciding for +developers how their prompts ought to be formatted and imposing limitations on +customizability and reproducibility. To solve this we present Kani: a +lightweight, flexible, and model-agnostic open-source framework for building +language model applications. Kani helps developers implement a variety of +complex features by supporting the core building blocks of chat interaction: +model interfacing, chat management, and robust function calling. All Kani core +functions are easily overridable and well documented to empower developers to +customize functionality for their own needs. Kani thus serves as a useful tool +for researchers, hobbyists, and industry professionals alike to accelerate +their development while retaining interoperability and fine-grained control. + +
+
+ comment: In submission to NLP-OSS +
+
+
+
+
+ + ☆ PAI-Diffusion: Constructing and Serving a Family of Open Chinese + Diffusion Models for Text-to-image Synthesis on the Cloud + + +
+ Text-to-image synthesis for the Chinese language poses unique challenges due +to its large vocabulary size, and intricate character relationships. While +existing diffusion models have shown promise in generating images from textual +descriptions, they often neglect domain-specific contexts and lack robustness +in handling the Chinese language. This paper introduces PAI-Diffusion, a +comprehensive framework that addresses these limitations. PAI-Diffusion +incorporates both general and domain-specific Chinese diffusion models, +enabling the generation of contextually relevant images. It explores the +potential of using LoRA and ControlNet for fine-grained image style transfer +and image editing, empowering users with enhanced control over image +generation. Moreover, PAI-Diffusion seamlessly integrates with Alibaba Cloud's +Machine Learning Platform for AI, providing accessible and scalable solutions. +All the Chinese diffusion model checkpoints, LoRAs, and ControlNets, including +domain-specific ones, are publicly available. A user-friendly Chinese WebUI and +the diffusers-api elastic inference toolkit, also open-sourced, further +facilitate the easy deployment of PAI-Diffusion models in various environments, +making it a valuable resource for Chinese text-to-image synthesis. + +
+
+
+
+
+ + ☆ NExT-GPT: Any-to-Any Multimodal LLM + + +
+ While Multimodal Large Language Models (MM-LLMs) have recently made exciting +strides, they mostly fall prey to the limitation of only input-side multimodal +understanding, without the ability to produce content in multiple modalities. +As we humans always perceive the world and communicate with people through +various modalities, developing any-to-any MM-LLMs capable of accepting and +delivering content in any modality becomes essential to human-level AI. To fill +the gap, we present an end-to-end general-purpose any-to-any MM-LLM system, +NExT-GPT. We connect an LLM with multimodal adaptors and different diffusion +decoders, enabling NExT-GPT to perceive inputs and generate outputs in +arbitrary combinations of text, images, videos, and audio. By leveraging +existing well-trained, highly performing encoders and decoders, NExT-GPT is +tuned with only a small number of parameters (1%) in certain projection layers, +which not only benefits low-cost training but also facilitates convenient +expansion to more potential modalities. Moreover, we introduce +modality-switching instruction tuning (MosIT) and manually curate a +high-quality dataset for MosIT, based on which NExT-GPT is empowered with +complex cross-modal semantic understanding and content generation. Overall, our +research showcases the promising possibility of building an AI agent capable of +modeling universal modalities, paving the way for more human-like AI research +in the community. +
+
+ comment: work in progress +
+
+
+
+
+ + ☆ Optimize Weight Rounding via Signed Gradient Descent for the + Quantization of LLMs + + +
+ Large Language Models (LLMs) have proven their exceptional capabilities in +performing language-related tasks. However, their deployment poses significant +challenges due to their considerable memory and storage requirements. In +response to this issue, weight-only quantization, particularly 3 and 4-bit +weight-only quantization, has emerged as one of the most viable solutions. As +the number of bits decreases, the quantization grid broadens, thus emphasizing +the importance of up and down rounding. While previous studies have +demonstrated that fine-tuning up and down rounding with the addition of +perturbations can enhance accuracy in some scenarios, our study is driven by +the precise and limited boundary of these perturbations, where only the +threshold for altering the rounding value is of significance. Consequently, we +propose a concise and highly effective approach for optimizing the weight +rounding task. Our method, named SignRound, involves lightweight block-wise +tuning using signed gradient descent, enabling us to achieve outstanding +results within 400 steps. SignRound outperforms the established baseline of +rounding-to-nearest (RTN) and competes impressively against recent methods, +without introducing additional inference overhead. The source code will be +publicly available at https://github.com/intel/neural-compressor soon. + +
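To make the idea concrete, here is a toy sketch of tuning an additive rounding perturbation with signed gradient descent on a single weight matrix, using a straight-through estimator for the rounding step; the real SignRound recipe (block-wise tuning, schedules, step budget) lives in the referenced repository, so treat this strictly as an illustration.

```python
import torch
import torch.nn.functional as F

def signround_toy(weight, scale, calib_inputs, steps=400, lr=5e-3):
    """Tune a bounded rounding offset v so that round(weight/scale + v) preserves layer outputs."""
    v = torch.zeros_like(weight, requires_grad=True)
    with torch.no_grad():
        target = calib_inputs @ weight.t()                        # full-precision reference output
    for _ in range(steps):
        q_cont = weight / scale + v
        q = (q_cont.round() - q_cont).detach() + q_cont           # straight-through estimator
        w_q = torch.clamp(q, -8, 7) * scale                       # 4-bit signed grid
        loss = F.mse_loss(calib_inputs @ w_q.t(), target)
        (grad,) = torch.autograd.grad(loss, v)
        with torch.no_grad():
            v -= lr * grad.sign()                                 # signed gradient step
            v.clamp_(-0.5, 0.5)                                   # only the rounding threshold can move
    return torch.clamp((weight / scale + v).round(), -8, 7) * scale
```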
+
+
+
+
+ + ☆ Long-Range Transformer Architectures for Document Understanding ICDAR 2023 + + +
+ Since their release, Transformers have revolutionized many fields from +Natural Language Understanding to Computer Vision. Document Understanding (DU) +was not left behind, with the first Transformer-based models for DU dating from late +2019. However, the computational complexity of the self-attention operation +limits their capabilities to small sequences. In this paper we explore multiple +strategies to apply Transformer-based models to long multi-page documents. We +introduce two new multi-modal (text + layout) long-range models for DU. They are +based on efficient implementations of Transformers for long sequences. +Long-range models can process whole documents at once effectively and are less +impaired by the document's length. We compare them to LayoutLM, a classical +Transformer adapted for DU and pre-trained on millions of documents. We further +propose a 2D relative attention bias to guide self-attention towards relevant +tokens without harming model efficiency. We observe improvements on multi-page +business documents on Information Retrieval for a small performance cost on +smaller sequences. Relative 2D attention proved to be effective on dense text +for both normal and long-range models. +
+
+ comment: Conference: ICDAR 2023 Workshops on Document Analysis and Recognition +
+
+
+
+
+ + ☆ Black-Box Analysis: GPTs Across Time in Legal Textual Entailment Task + + +
+ The evolution of Generative Pre-trained Transformer (GPT) models has led to +significant advancements in various natural language processing applications, +particularly in legal textual entailment. We present an analysis of GPT-3.5 +(ChatGPT) and GPT-4 performances on COLIEE Task 4 dataset, a prominent +benchmark in this domain. The study encompasses data from Heisei 18 (2006) to +Reiwa 3 (2021), exploring the models' abilities to discern entailment +relationships within Japanese statute law across different periods. Our +preliminary experimental results unveil intriguing insights into the models' +strengths and weaknesses in handling legal textual entailment tasks, as well as +the patterns observed in model performance. In the context of proprietary +models with undisclosed architectures and weights, black-box analysis becomes +crucial for evaluating their capabilities. We discuss the influence of training +data distribution and the implications on the models' generalizability. This +analysis serves as a foundation for future research, aiming to optimize +GPT-based models and enable their successful adoption in legal information +extraction and entailment applications. + +
+
+ comment: ISAILD@KSE 2023 +
+
+
+
+
+ + ☆ NeCo@ALQAC 2023: Legal Domain Knowledge Acquisition for Low-Resource + Languages through Data Enrichment + + +
+ In recent years, natural language processing has gained significant +popularity in various sectors, including the legal domain. This paper presents +NeCo Team's solutions to the Vietnamese text processing tasks provided in the +Automated Legal Question Answering Competition 2023 (ALQAC 2023), focusing on +legal domain knowledge acquisition for low-resource languages through data +enrichment. Our methods for the legal document retrieval task employ a +combination of similarity ranking and deep learning models, while for the +second task, which requires extracting an answer from a relevant legal article +in response to a question, we propose a range of adaptive techniques to handle +different question types. Our approaches achieve outstanding results on both +tasks of the competition, demonstrating the potential benefits and +effectiveness of question answering systems in the legal field, particularly +for low-resource languages. + +
+
+ comment: ISAILD@KSE 2023 +
+
+
+
+
+ + ☆ Personality Detection and Analysis using Twitter Data + + +
+ Personality types are important in various fields as they hold relevant +information about the characteristics of a human being in an explainable +format. They are often good predictors of a person's behaviors in a particular +environment and have applications ranging from candidate selection to marketing +and mental health. Recently automatic detection of personality traits from +texts has gained significant attention in computational linguistics. Most +personality detection and analysis methods have focused on small datasets +making their experimental observations often limited. To bridge this gap, we +focus on collecting and releasing the largest automatically curated dataset for +the research community which has 152 million tweets and 56 thousand data points +for the Myers-Briggs personality type (MBTI) prediction task. We perform a +series of extensive qualitative and quantitative studies on our dataset to +analyze the data patterns in a better way and infer conclusions. We show how +our intriguing analysis results often follow natural intuition. We also perform +a series of ablation studies to show how the baselines perform for our dataset. + +
+
+ comment: Submitted to ASONAM 2023 +
+
+
+
+
+ + ☆ CrisisTransformers: Pre-trained language models and sentence encoders + for crisis-related social media texts + + +
+ Social media platforms play an essential role in crisis communication, but +analyzing crisis-related social media texts is challenging due to their +informal nature. Transformer-based pre-trained models like BERT and RoBERTa +have shown success in various NLP tasks, but they are not tailored for +crisis-related texts. Furthermore, general-purpose sentence encoders are used +to generate sentence embeddings, regardless of the textual complexities in +crisis-related texts. Advances in applications like text classification, +semantic search, and clustering contribute to effective processing of +crisis-related texts, which is essential for emergency responders to gain a +comprehensive view of a crisis event, whether historical or real-time. To +address these gaps in crisis informatics literature, this study introduces +CrisisTransformers, an ensemble of pre-trained language models and sentence +encoders trained on an extensive corpus of over 15 billion word tokens from +tweets associated with more than 30 crisis events, including disease outbreaks, +natural disasters, conflicts, and other critical incidents. We evaluate +existing models and CrisisTransformers on 18 crisis-specific public datasets. +Our pre-trained models outperform strong baselines across all datasets in +classification tasks, and our best-performing sentence encoder improves the +state-of-the-art by 17.43% in sentence encoding tasks. Additionally, we +investigate the impact of model initialization on convergence and evaluate the +significance of domain-specific models in generating semantically meaningful +sentence embeddings. All models are publicly released +(https://huggingface.co/crisistransformers), with the anticipation that they +will serve as a robust baseline for tasks involving the analysis of +crisis-related social media texts. + +
+
+
+
+
+ + ☆ Zero-shot Learning with Minimum Instruction to Extract Social + Determinants and Family History from Clinical Notes using GPT Model + + +
+ Demographics, social determinants of health, and family history documented in +the unstructured text within electronic health records are increasingly +being studied to understand how this information can be utilized with the +structured data to improve healthcare outcomes. Since the GPT models were +released, many studies have applied them to extract this information from +narrative clinical notes. Different from the existing work, our research +focuses on investigating zero-shot learning for extracting this information +jointly by providing minimal information to the GPT model. We utilize +de-identified real-world clinical notes annotated for demographics, various +social determinants, and family history information. Given that the GPT model +might provide text different from the text in the original data, we explore two +sets of evaluation metrics, including the traditional NER evaluation metrics +and semantic similarity evaluation metrics, to fully understand the +performance. Our results show that the GPT-3.5 method achieved an average of +0.975 F1 on demographics extraction, 0.615 F1 on social determinants +extraction, and 0.722 F1 on family history extraction. We believe these results +can be further improved through model fine-tuning or few-shot learning. +Through the case studies, we also identified the limitations of the GPT models, +which need to be addressed in future research. +
+
+ comment: 5 pages, 4 figures +
+
+
+
+
+ + ☆ LeBenchmark 2.0: a Standardized, Replicable and Enhanced Framework for + Self-supervised Representations of French Speech + + +
+ Self-supervised learning (SSL) is at the origin of unprecedented improvements +in many different domains including computer vision and natural language +processing. Speech processing drastically benefitted from SSL as most of the +current domain-related tasks are now being approached with pre-trained models. +This work introduces LeBenchmark 2.0, an open-source framework for assessing and +building SSL-equipped French speech technologies. It includes documented, +large-scale and heterogeneous corpora with up to 14,000 hours of heterogeneous +speech, ten pre-trained SSL wav2vec 2.0 models containing from 26 million to +one billion learnable parameters shared with the community, and an evaluation +protocol made of six downstream tasks to complement existing benchmarks. +LeBenchmark 2.0 also presents unique perspectives on pre-trained SSL models for +speech, with the investigation of frozen versus fine-tuned downstream models, +task-agnostic versus task-specific pre-trained models, as well as a discussion +on the carbon footprint of large-scale model training. +
+
+ comment: Under submission at Computer Science and Language. Preprint allowed +
+
+
+
+
+ + ☆ Textbooks Are All You Need II: phi-1.5 technical report + + +
+ We continue the investigation into the power of smaller Transformer-based +language models as initiated by \textbf{TinyStories} -- a 10 million parameter +model that can produce coherent English -- and the follow-up work on +\textbf{phi-1}, a 1.3 billion parameter model with Python coding performance +close to the state-of-the-art. The latter work proposed to use existing Large +Language Models (LLMs) to generate ``textbook quality" data as a way to enhance +the learning process compared to traditional web data. We follow the +``Textbooks Are All You Need" approach, focusing this time on common sense +reasoning in natural language, and create a new 1.3 billion parameter model +named \textbf{phi-1.5}, with performance on natural language tasks comparable +to models 5x larger, and surpassing most non-frontier LLMs on more complex +reasoning tasks such as grade-school mathematics and basic coding. More +generally, \textbf{phi-1.5} exhibits many of the traits of much larger LLMs, +both good -- such as the ability to ``think step by step" or perform some +rudimentary in-context learning -- and bad, including hallucinations and the +potential for toxic and biased generations -- encouragingly though, we are +seeing improvement on that front thanks to the absence of web data. We +open-source \textbf{phi-1.5} to promote further research on these urgent +topics. + +
+
+
+
+
+ + ☆ Flesch or Fumble? Evaluating Readability Standard Alignment of + Instruction-Tuned Language Models + + +
+ Readability metrics and standards such as Flesch Kincaid Grade Level (FKGL) +and the Common European Framework of Reference for Languages (CEFR) exist to +guide teachers and educators to properly assess the complexity of educational +materials before administering them for classroom use. In this study, we select +a diverse set of open and closed-source instruction-tuned language models and +investigate their performances in writing story completions and simplifying +narratives$-$tasks that teachers perform$-$using standard-guided prompts +controlling text readability. Our extensive findings provide empirical proof of +how globally recognized models like ChatGPT may be considered less effective +and may require more refined prompts for these generative tasks compared to +other open-sourced models such as BLOOMZ and FlanT5$-$which have shown +promising results. + +
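For reference, the FKGL metric mentioned above is the standard Flesch-Kincaid formula, 0.39*(words/sentences) + 11.8*(syllables/words) - 15.59; the sketch below uses a crude vowel-group syllable counter rather than a dictionary-based one.

```python
import re

def fkgl(text):
    """Flesch-Kincaid Grade Level with a rough syllable heuristic."""
    sentences = max(1, len(re.findall(r"[.!?]+", text)))
    words = re.findall(r"[A-Za-z']+", text)
    n_words = max(1, len(words))
    # Crude heuristic: count vowel groups, at least one syllable per word.
    n_syllables = sum(max(1, len(re.findall(r"[aeiouy]+", w.lower()))) for w in words)
    return 0.39 * (n_words / sentences) + 11.8 * (n_syllables / n_words) - 15.59

# Lower values indicate text readable at a lower grade level.
print(round(fkgl("The cat sat on the mat. It was happy."), 2))
```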
+
+
+
+
+ + ☆ Evaluating the Deductive Competence of Large Language Models + + +
+ The development of highly fluent large language models (LLMs) has prompted +increased interest in assessing their reasoning and problem-solving +capabilities. We investigate whether several LLMs can solve a classic type of +deductive reasoning problem from the cognitive science literature. The tested +LLMs have limited abilities to solve these problems in their conventional form. +We performed follow-up experiments to investigate whether changes to the +presentation format and content improve model performance. We do find +performance differences between conditions; however, they do not improve +overall performance. Moreover, we find that performance interacts with +presentation format and content in unexpected ways that differ from human +performance. Overall, our results suggest that LLMs have unique reasoning +biases that are only partially predicted from human reasoning performance. +
+
+ comment: 7 pages, 7 figures, under review +
+
+
+
+
+ + ☆ Panoptic Vision-Language Feature Fields + + +
+ Recently, methods have been proposed for 3D open-vocabulary semantic +segmentation. Such methods are able to segment scenes into arbitrary classes +given at run-time from their text descriptions. In this paper, we propose, to +our knowledge, the first algorithm for open-vocabulary panoptic segmentation, +simultaneously performing both semantic and instance segmentation. Our +algorithm, Panoptic Vision-Language Feature Fields (PVLFF), learns a feature +field of the scene, jointly learning vision-language features and hierarchical +instance features through a contrastive loss function from 2D instance segment +proposals on input frames. Our method achieves comparable performance against +the state-of-the-art closed-set 3D panoptic systems on the HyperSim, ScanNet and +Replica datasets and outperforms current 3D open-vocabulary systems in terms of +semantic segmentation. We additionally ablate our method to demonstrate the +effectiveness of our model architecture. Our code will be available at +https://github.com/ethz-asl/autolabel. +
+
+ comment: This work has been submitted to the IEEE for possible publication. + Copyright may be transferred without notice, after which this version may no + longer be accessible +
+
+
+
+
+ + ☆ TeGit: Generating High-Quality Instruction-Tuning Data with + Text-Grounded Task Design + + +
+ High-quality instruction-tuning data is critical to improving LLM +capabilities. Existing data collection methods are limited by unrealistic +manual labeling costs or by the hallucination of relying solely on LLM +generation. To address these problems, this paper presents a scalable method to +automatically collect high-quality instruction-tuning data by training +language models to automatically design tasks based on human-written texts. +Intuitively, grounding in human-written text helps the model attenuate hallucinations +during task generation. Unlike instruction back-translation-based +methods that directly take the given text as a response, we require the model +to generate the \textit{instruction}, \textit{input}, and \textit{output} +simultaneously to filter the noise. The results of the automated and manual +evaluation experiments demonstrate the quality of our dataset. +
+
+ comment: Work in progress +
+
+
+
+
+ + ☆ Pushing Mixture of Experts to the Limit: Extremely Parameter Efficient + MoE for Instruction Tuning + + +
+ The Mixture of Experts (MoE) is a widely known neural architecture where an +ensemble of specialized sub-models optimizes overall performance with a +constant computational cost. However, conventional MoEs pose challenges at +scale due to the need to store all experts in memory. In this paper, we push +MoE to the limit. We propose extremely parameter-efficient MoE by uniquely +combining the MoE architecture with lightweight experts. Our MoE architecture +outperforms standard parameter-efficient fine-tuning (PEFT) methods and is on +par with full fine-tuning by only updating the lightweight experts -- less than +1% of an 11B-parameter model. Furthermore, our method generalizes to unseen +tasks as it does not depend on any prior task knowledge. Our research +underscores the versatility of the mixture-of-experts architecture, showcasing +its ability to deliver robust performance even when subjected to rigorous +parameter constraints. The code used in all our experiments is publicly +available here: https://github.com/for-ai/parameter-efficient-moe. +
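The general flavor of routing among lightweight experts can be sketched as below, where each expert is just a learned rescaling vector applied to a frozen layer's hidden states; the paper's actual expert parameterization and routing may differ, so this is illustrative only.

```python
import torch
import torch.nn as nn

class LightweightMoE(nn.Module):
    """Soft mixture of tiny experts: each expert is a single learned scaling vector."""
    def __init__(self, hidden_size, n_experts=8):
        super().__init__()
        self.router = nn.Linear(hidden_size, n_experts)
        self.experts = nn.Parameter(torch.ones(n_experts, hidden_size))

    def forward(self, hidden):                                    # hidden: (batch, seq, hidden)
        weights = self.router(hidden).softmax(dim=-1)             # (batch, seq, n_experts)
        scales = torch.einsum("bse,eh->bsh", weights, self.experts)
        return hidden * scales                                    # rescale the frozen activations
```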
+
+
+
+
+ + ☆ Improving Information Extraction on Business Documents with Specific + Pre-Training Tasks + + +
+ Transformer-based Language Models are widely used in Natural Language +Processing related tasks. Thanks to their pre-training, they have been +successfully adapted to Information Extraction in business documents. However, +most pre-training tasks proposed in the literature for business documents are +too generic and not sufficient to learn more complex structures. In this paper, +we use LayoutLM, a language model pre-trained on a collection of business +documents, and introduce two new pre-training tasks that further improve its +capacity to extract relevant information. The first is aimed at better +understanding the complex layout of documents, and the second focuses on +numeric values and their order of magnitude. These tasks force the model to +learn better-contextualized representations of the scanned documents. We +further introduce a new post-processing algorithm to decode BIESO tags in +Information Extraction that performs better with complex entities. Our method +significantly improves extraction performance on both public (from 93.88 to +95.50 F1 score) and private (from 84.35 to 84.84 F1 score) datasets composed of +expense receipts, invoices, and purchase orders. + +
+
+ comment: Conference: Document Analysis Systems. DAS 2022 +
+
+
+
+
+ + ☆ Multi-Modal Automatic Prosody Annotation with Contrastive Pretraining of + SSWP ICASSP 2024 + + +
+ In the realm of expressive Text-to-Speech (TTS), explicit prosodic boundaries +significantly advance the naturalness and controllability of synthesized +speech. While human prosody annotation contributes substantially to performance, it +is a labor-intensive and time-consuming process, often resulting in +inconsistent outcomes. Despite the availability of extensive supervised data, +the current benchmark model still faces performance setbacks. To address this +issue, a two-stage automatic annotation pipeline is proposed in this +paper. Specifically, in the first stage, we propose contrastive text-speech +pretraining of Speech-Silence and Word-Punctuation (SSWP) pairs. The +pretraining procedure aims at enhancing the prosodic space extracted from the +joint text-speech space. In the second stage, we build a multi-modal prosody +annotator, which consists of pretrained encoders, a straightforward yet +effective text-speech feature fusion scheme, and a sequence classifier. +Extensive experiments conclusively demonstrate that our proposed method excels +at automatically generating prosody annotations and achieves state-of-the-art +(SOTA) performance. Furthermore, our model has exhibited remarkable +resilience when tested with varying amounts of data. +
+
+ comment: Submitted to ICASSP 2024 +
+
+
+
+
+ + ☆ Experimenting with UD Adaptation of an Unsupervised Rule-based Approach + for Sentiment Analysis of Mexican Tourist Texts + + +
+ This paper summarizes the results of experimenting with Universal +Dependencies (UD) adaptation of an Unsupervised, Compositional and Recursive +(UCR) rule-based approach for Sentiment Analysis (SA) submitted to the Shared +Task at Rest-Mex 2023 (Team Olga/LyS-SALSA) (within the IberLEF 2023 +conference). By using basic syntactic rules such as rules of modification and +negation applied on words from sentiment dictionaries, our approach exploits +some advantages of an unsupervised method for SA: (1) interpretability and +explainability of SA, (2) robustness across datasets, languages and domains and +(3) usability by non-experts in NLP. We compare our approach with other +unsupervised approaches of SA that in contrast to our UCR rule-based approach +use simple heuristic rules to deal with negation and modification. Our results +show a considerable improvement over these approaches. We discuss future +improvements of our results by using modality features as another shifting rule +of polarity and word disambiguation techniques to identify the right sentiment +words. + +
+
+ comment: Proceedings of IberLEF 2023, Ja\'en, Spain, 2023 +
+
+
+
+
+ + ☆ Analysing Cross-Lingual Transfer in Low-Resourced African Named Entity + Recognition AACL 2023 + + +
+ Transfer learning has led to large gains in performance for nearly all NLP +tasks while making downstream models easier and faster to train. This has also +been extended to low-resourced languages, with some success. We investigate the +properties of cross-lingual transfer learning between ten low-resourced +languages, from the perspective of a named entity recognition task. We +specifically investigate how much adaptive fine-tuning and the choice of +transfer language affect zero-shot transfer performance. We find that models +that perform well on a single language often do so at the expense of +generalising to others, while models with the best generalisation to other +languages suffer in individual language performance. Furthermore, the amount of +data overlap between the source and target datasets is a better predictor of +transfer performance than either the geographical or genetic distance between +the languages. + +
+
+ comment: Accepted to IJCNLP-AACL 2023 +
+
+
+
+
+ + ☆ Minuteman: Machine and Human Joining Forces in Meeting Summarization + + +
+ Many meetings require creating a meeting summary to keep everyone up to date. +Creating minutes of sufficient quality is however very cognitively demanding. +Although we currently possess capable models for both audio speech recognition +(ASR) and summarization, their fully automatic use is still problematic. ASR +models frequently commit errors when transcribing named entities while the +summarization models tend to hallucinate and misinterpret the transcript. We +propose a novel tool -- Minuteman -- to enable efficient semi-automatic meeting +minuting. The tool provides a live transcript and a live meeting summary to the +users, who can edit them in a collaborative manner, enabling correction of ASR +errors and imperfect summary points in real time. The resulting application +eases the cognitive load of the notetakers and allows them to easily catch up +if they missed a part of the meeting due to absence or a lack of focus. We +conduct several tests of the application in varied settings, exploring the +worthiness of the concept and the possible user strategies. + +
+
+ comment: 6 pages, 3 figures +
+
+
+
+
+ + ☆ CONFLATOR: Incorporating Switching Point based Rotatory Positional + Encodings for Code-Mixed Language Modeling + + +
+ The mixing of two or more languages is called Code-Mixing (CM). CM is a +social norm in multilingual societies. Neural Language Models (NLMs) like +transformers have been very effective on many NLP tasks. However, NLMs for CM are +an under-explored area. Though transformers are capable and powerful, they +cannot always encode positional/sequential information since they are +non-recurrent. Therefore, to enrich word information and incorporate positional +information, positional encoding is defined. We hypothesize that Switching +Points (SPs), i.e., junctions in the text where the language switches (L1 -> L2 +or L2 -> L1), pose a challenge for CM Language Models (LMs), and hence give +special emphasis to switching points in the modeling process. We experiment +with several positional encoding mechanisms and show that rotatory positional +encodings along with switching point information yield the best results. + We introduce CONFLATOR: a neural language modeling approach for code-mixed +languages. CONFLATOR tries to learn to emphasize switching points using smarter +positional encoding, both at unigram and bigram levels. CONFLATOR outperforms +the state-of-the-art on two tasks based on code-mixed Hindi and English +(Hinglish): (i) sentiment analysis and (ii) machine translation. +
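For readers unfamiliar with rotary encodings, a compact sketch follows; the switching-point handling shown (restarting position counting at each language switch) is only one plausible way to emphasize switches and is an assumption, not CONFLATOR's exact mechanism.

```python
import torch

def rotary_encode(x, positions, base=10000.0):
    """Apply rotary positional encoding to x of shape (seq, dim); dim assumed even."""
    seq, dim = x.shape
    half = dim // 2
    freqs = base ** (-torch.arange(half, dtype=torch.float32) / half)   # (half,)
    angles = positions.float().unsqueeze(1) * freqs                     # (seq, half)
    cos, sin = angles.cos(), angles.sin()
    x1, x2 = x[:, :half], x[:, half:]
    return torch.cat([x1 * cos - x2 * sin, x1 * sin + x2 * cos], dim=-1)

def switch_aware_positions(lang_ids):
    """Restart position counting at every language switch (illustrative assumption)."""
    positions, pos = [], 0
    for i, lang in enumerate(lang_ids):
        if i > 0 and lang != lang_ids[i - 1]:
            pos = 0                 # switching point: nearby tokens get distinctive rotations
        positions.append(pos)
        pos += 1
    return torch.tensor(positions)
```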
+
+
+
+
+ + ☆ Detecting Natural Language Biases with Prompt-based Learning + + +
+ In this project, we want to explore the newly emerging field of prompt +engineering and apply it to the downstream task of detecting LM biases. More +concretely, we explore how to design prompts that can indicate 4 different +types of biases: (1) gender, (2) race, (3) sexual orientation, and (4) +religion-based. Within our project, we experiment with different manually +crafted prompts that can draw out the subtle biases that may be present in the +language model. We apply these prompts to multiple variations of popular and +well-recognized models: BERT, RoBERTa, and T5 to evaluate their biases. We +provide a comparative analysis of these models and assess them using a two-fold +method: use human judgment to decide whether model predictions are biased and +utilize model-level judgment (through further prompts) to understand if a model +can self-diagnose the biases of its own prediction. + +
+
+
+
+
+ + ☆ Exploring the Law of Numbers: Evidence from China's Real Estate + + +
+ The renowned proverb, "Numbers do not lie", underscores the reliability and +insight that lie beneath numbers, a concept of undisputed importance, +especially in economics and finance. Despite the popularity of Benford's +Law in first-digit analysis, its scope falls short of being comprehensive +when it comes to deciphering the laws of numbers. This paper delves into number +laws by taking the financial statements of China's real estate sector as a +representative case, quantitatively studying not only the first digit but also depicting +two other dimensions of numbers: frequency and length. The research +outcomes transcend mere reservations about data manipulation and open the door +to discussions surrounding number diversity and the delineation of usage +insights. This study carries both economic significance and the capacity to +foster a deeper comprehension of numerical phenomena. +
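For context, Benford's Law predicts that a leading digit d occurs with probability log10(1 + 1/d); a small sketch comparing observed first-digit frequencies against that expectation is given below (the paper's frequency and length analyses are analogous counting exercises).

```python
import math
import re
from collections import Counter

def leading_digit(x):
    match = re.search(r"[1-9]", f"{abs(x):e}")      # first significant digit of the value
    return int(match.group()) if match else None

def benford_report(values):
    digits = [d for d in map(leading_digit, values) if d is not None]
    counts, n = Counter(digits), len(digits)
    for d in range(1, 10):
        expected = math.log10(1 + 1 / d)            # Benford's Law: P(d) = log10(1 + 1/d)
        observed = counts.get(d, 0) / n if n else 0.0
        print(f"digit {d}: observed {observed:.3f}  expected {expected:.3f}")
```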
+
+ comment: DSS +
+
+
+
+
+ + ☆ Quantifying and Attributing the Hallucination of Large Language Models + via Association Analysis + + +
+ Although demonstrating superb performance on various NLP tasks, large +language models (LLMs) still suffer from the hallucination problem, which +threatens the reliability of LLMs. To measure the level of hallucination of +LLMs, previous works first categorize the hallucination according to the +phenomenon similarity, then quantify the proportion that model outputs contain +hallucinatory contents. However, such hallucination rates could easily be +distorted by confounders. Moreover, such hallucination rates could not reflect +the reasons for the hallucination, as similar hallucinatory phenomena may +originate from different sources. To address these issues, we propose to +combine the hallucination level quantification and hallucination reason +investigation through an association analysis, which builds the relationship +between the hallucination rate of LLMs with a set of risk factors. In this way, +we are able to observe the hallucination level under each value of each risk +factor, examining the contribution and statistical significance of each risk +factor, meanwhile excluding the confounding effect of other factors. +Additionally, by recognizing the risk factors according to a taxonomy of model +capability, we reveal a set of potential deficiencies in commonsense +memorization, relational reasoning, and instruction following, which may +further provide guidance for the pretraining and supervised fine-tuning process +of LLMs to mitigate the hallucination. + +
+
+
+
+
+ + ☆ Understanding the Impact of Post-Training Quantization on Large-scale + Language Models + + +
+ Large language models (LLMs) are rapidly increasing in size, with the number +of parameters becoming a key factor in the success of many commercial models, +such as ChatGPT, Claude, and Bard. Even the recently released publicly +accessible models for commercial usage, such as Falcon and Llama2, come +equipped with billions of parameters. This significant increase in the number +of parameters makes deployment and operation very costly. The remarkable +progress in the field of quantization for large neural networks in general and +LLMs in particular, has made these models more accessible by enabling them to +be deployed on consumer-grade GPUs. Quantized models generally demonstrate +comparable performance levels to their unquantized base counterparts. +Nonetheless, there exists a notable gap in our comprehensive understanding of +how these quantized models respond to hyperparameters, such as temperature, max +new tokens, and top\_k, particularly during the decoding phase. The present +analysis reveals that nf4 and fp4 are equally proficient 4-bit quantization +techniques, characterized by similar attributes such as inference speed, memory +consumption, and the quality of generated content. Nevertheless, these +quantization methods exhibit distinct behaviors at varying temperature +settings, both in the context of smaller and larger models. It is noteworthy +that, in general, 4-bit quantized models of varying sizes exhibit heightened +sensitivity to lower temperature settings, unlike their unquantized +counterparts. Additionally, int8 quantization is associated with significantly +slower inference speeds, whereas unquantized fp16 models consistently yield the +fastest inference speeds across models of all sizes. + +
+
+
+
+
+ + ☆ From Artificially Real to Real: Leveraging Pseudo Data from Large + Language Models for Low-Resource Molecule Discovery + + +
+ Molecule discovery serves as a cornerstone in numerous scientific domains, +fueling the development of new materials and innovative drug designs. Recent +developments of in-silico molecule discovery have highlighted the promising +results of cross-modal techniques, which bridge molecular structures with their +descriptive annotations. However, these cross-modal methods frequently +encounter the issue of data scarcity, hampering their performance and +application. In this paper, we address the low-resource challenge by utilizing +artificially-real data generated by Large Language Models (LLMs). We first +introduce a retrieval-based prompting strategy to construct high-quality pseudo +data, then explore the optimal method to effectively leverage this pseudo data. +Experiments show that using pseudo data for domain adaptation outperforms all +existing methods, while also requiring a smaller model scale, reduced data size +and lower training cost, highlighting its efficiency. Furthermore, our method +shows a sustained improvement as the volume of pseudo data increases, revealing +the great potential of pseudo data in advancing low-resource cross-modal +molecule discovery. + +
+
+
+
+
+ + ☆ Two is Better Than One: Answering Complex Questions by Multiple + Knowledge Sources with Generalized Links + + +
+ Incorporating multiple knowledge sources is proven to be beneficial for +answering complex factoid questions. To utilize multiple knowledge bases (KB), +previous works merge all KBs into a single graph via entity alignment and +reduce the problem to question-answering (QA) over the fused KB. In reality, +various link relations between KBs might be adopted in QA over multi-KBs. In +addition to the identity between the alignable entities (i.e. full link), +unalignable entities expressing the different aspects or types of an abstract +concept may also be treated identical in a question (i.e. partial link). Hence, +the KB fusion in prior works fails to represent all types of links, restricting +their ability to comprehend multi-KBs for QA. In this work, we formulate the +novel Multi-KB-QA task that leverages the full and partial links among multiple +KBs to derive correct answers, a benchmark with diversified link and query +types is also constructed to efficiently evaluate Multi-KB-QA performance. +Finally, we propose a method for Multi-KB-QA that encodes all link relations in +the KB embedding to score and rank candidate answers. Experiments show that our +method markedly surpasses conventional KB-QA systems in Multi-KB-QA, justifying +the necessity of devising this task. + +
+
+
+
+
+ + ☆ Does Writing with Language Models Reduce Content Diversity? + + +
+ Large language models (LLMs) have led to a surge in collaborative writing +with model assistance. As different users incorporate suggestions from the same +model, there is a risk of decreased diversity in the produced content, +potentially limiting diverse perspectives in public discourse. In this work, we +measure the impact of co-writing on diversity via a controlled experiment, +where users write argumentative essays in three setups -- using a base LLM +(GPT3), a feedback-tuned LLM (InstructGPT), and writing without model help. We +develop a set of diversity metrics and find that writing with InstructGPT (but +not the GPT3) results in a statistically significant reduction in diversity. +Specifically, it increases the similarity between the writings of different +authors and reduces the overall lexical and content diversity. We additionally +find that this effect is mainly attributable to InstructGPT contributing less +diverse text to co-written essays. In contrast, the user-contributed text +remains unaffected by model collaboration. This suggests that the recent +improvement in generation quality from adapting models to human feedback might +come at the cost of more homogeneous and less diverse content. + +
+
+ comment: Preprint +
+
+
+
+
+ + ☆ DePT: Decomposed Prompt Tuning for Parameter-Efficient Fine-tuning + + +
+ Prompt tuning (PT), where a small number of trainable soft (continuous) +prompt vectors is affixed to the input of language models (LM), has shown +promising results across various tasks and models for parameter-efficient +fine-tuning (PEFT). PT stands out from other PEFT approaches because it +maintains competitive performance with fewer trainable parameters and does not +drastically scale up its parameters as the model size expands. However, PT +introduces additional soft prompt tokens, leading to longer input sequences, +which significantly impacts training and inference time and memory usage due to +the Transformer's quadratic complexity. This is particularly concerning for Large +Language Models (LLMs) that face heavy daily querying. To address this issue, +we propose Decomposed Prompt Tuning (DePT), which decomposes the soft prompt +into a shorter soft prompt and a pair of low-rank matrices that are then +optimised with two different learning rates. This allows DePT to achieve better +performance while saving over 20% memory and time costs compared to vanilla PT +and its variants, without changing trainable parameter sizes. Through extensive +experiments on 23 natural language processing (NLP) and vision-language (VL) +tasks, we demonstrate that DePT outperforms state-of-the-art PEFT approaches, +including the full fine-tuning baseline in some scenarios. Additionally, we +empirically show that DePT grows more efficient as the model size increases. +Our further study reveals that DePT integrates seamlessly with +parameter-efficient transfer learning in the few-shot learning setting and +highlights its adaptability to various model architectures and sizes. +
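A minimal sketch of the decomposition described in the abstract is shown below: a shorter soft prompt is prepended to the input, while a pair of low-rank matrices updates the frozen word embeddings, and the two parts are given separate learning rates. Dimensions, initializations, and the learning-rate values are placeholders, not the paper's settings.

```python
import torch
import torch.nn as nn

class DePTEmbedding(nn.Module):
    """Shorter soft prompt + low-rank update to frozen word embeddings (illustrative)."""
    def __init__(self, frozen_embedding, prompt_len=20, rank=8):
        super().__init__()
        vocab, hidden = frozen_embedding.weight.shape
        self.frozen = frozen_embedding.requires_grad_(False)
        self.soft_prompt = nn.Parameter(torch.randn(prompt_len, hidden) * 0.02)
        self.lora_a = nn.Parameter(torch.randn(vocab, rank) * 0.02)
        self.lora_b = nn.Parameter(torch.zeros(rank, hidden))

    def forward(self, input_ids):                                  # (batch, seq)
        embeds = self.frozen(input_ids) + self.lora_a[input_ids] @ self.lora_b
        prompt = self.soft_prompt.unsqueeze(0).expand(input_ids.size(0), -1, -1)
        return torch.cat([prompt, embeds], dim=1)                  # prepend the shorter soft prompt

# Two parameter groups with different learning rates, as described above (values are placeholders):
# optimizer = torch.optim.AdamW([
#     {"params": [module.soft_prompt], "lr": 3e-1},
#     {"params": [module.lora_a, module.lora_b], "lr": 3e-4},
# ])
```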
+
+ comment: Code is available at https://github.com/ZhengxiangShi/DePT +
+
+
+
+
+ + ☆ PACE: Prompting and Augmentation for Calibrated Confidence Estimation + with GPT-4 in Cloud Incident Root Cause Analysis + + +
+ In recent years, the transition to cloud-based platforms in the IT sector has +emphasized the significance of cloud incident root cause analysis to ensure +service reliability and maintain customer trust. Central to this process is the +efficient determination of root causes, a task made challenging due to the +complex nature of contemporary cloud infrastructures. Despite the proliferation +of AI-driven tools for root cause identification, their applicability remains +limited by the inconsistent quality of their outputs. This paper introduces a +method for enhancing confidence estimation in root cause analysis tools by +prompting retrieval-augmented large language models (LLMs). This approach +operates in two phases. Initially, the model evaluates its confidence based on +historical incident data, considering its assessment of the evidence strength. +Subsequently, the model reviews the root cause generated by the predictor. An +optimization step then combines these evaluations to determine the final +confidence assignment. Experimental results illustrate that our method enables +the model to articulate its confidence effectively, providing a more calibrated +score. We address research questions evaluating the ability of our method to +produce calibrated confidence scores using LLMs, the impact of domain-specific +retrieved examples on confidence estimates, and its potential generalizability +across various root cause analysis models. Through this, we aim to bridge the +confidence estimation gap, aiding on-call engineers in decision-making and +bolstering the efficiency of cloud incident management. + +
+
+
+
+
+ + ☆ Hi Model, generating 'nice' instead of 'good' is not as bad as + generating 'rice'! Towards Context and Semantic Infused Dialogue Generation + Loss Function and Evaluation Metric + + +
+ Over the past two decades, dialogue modeling has made significant strides, +moving from simple rule-based responses to personalized and persuasive response +generation. However, despite these advancements, the objective functions and +evaluation metrics for dialogue generation have remained stagnant, i.e., +cross-entropy and BLEU, respectively. These lexical-based metrics have the +following key limitations: (a) word-to-word matching without semantic +consideration: It assigns the same credit for failure to generate 'nice' and +'rice' for 'good'. (b) missing context attribute for evaluating the generated +response: Even if a generated response is relevant to the ongoing dialogue +context, it may still be penalized for not matching the gold utterance provided +in the corpus. In this paper, we first investigate these limitations +comprehensively and propose a new loss function called Semantic Infused +Contextualized diaLogue (SemTextualLogue) loss function. Furthermore, we +formulate a new evaluation metric called Dialuation, which incorporates both +context relevance and semantic appropriateness while evaluating a generated +response. We conducted experiments on two benchmark dialogue corpora, +encompassing both task-oriented and open-domain scenarios. We found that the +dialogue generation model trained with SemTextualLogue loss attained superior +performance (in both quantitative and qualitative evaluation) compared to the +traditional cross-entropy loss function across the datasets and evaluation +metrics. + +
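A loss of this kind can be sketched as a weighted combination of the usual token-level cross-entropy with two embedding-based terms: one measuring semantic closeness of the generated response to the gold response and one measuring its relevance to the dialogue context. The snippet below is a hedged illustration of that idea; the choice of sentence encoder, the weighting scheme, and the coefficients are assumptions rather than the paper's exact formulation.

```python
import torch
import torch.nn.functional as F

def semantic_context_loss(logits, gold_ids, gen_emb, gold_emb, ctx_emb,
                          alpha=0.5, beta=0.3):
    """Sketch of a semantics- and context-infused dialogue loss.

    gen_emb, gold_emb, ctx_emb are sentence embeddings of the generated
    response, gold response, and dialogue context (any encoder works).
    alpha and beta are illustrative weights, not the paper's values.
    """
    ce = F.cross_entropy(logits.view(-1, logits.size(-1)), gold_ids.view(-1))
    sem = 1 - F.cosine_similarity(gen_emb, gold_emb, dim=-1).mean()  # semantic gap to gold
    ctx = 1 - F.cosine_similarity(gen_emb, ctx_emb, dim=-1).mean()   # relevance to context
    return ce + alpha * sem + beta * ctx
```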
+
+
+
+
+ + ☆ Large Language Model for Science: A Study on P vs. NP + + +
+ In this work, we use large language models (LLMs) to augment and accelerate +research on the P versus NP problem, one of the most important open problems in +theoretical computer science and mathematics. Specifically, we propose Socratic +reasoning, a general framework that promotes in-depth thinking with LLMs for +complex problem-solving. Socratic reasoning encourages LLMs to recursively +discover, solve, and integrate problems while facilitating self-evaluation and +refinement. Our pilot study on the P vs. NP problem shows that GPT-4 +successfully produces a proof schema and engages in rigorous reasoning +throughout 97 dialogue turns, concluding "P $\neq$ NP", which is in alignment +with (Xu and Zhou, 2023). The investigation uncovers novel insights within the +extensive solution space of LLMs, shedding light on LLM for Science. + +
+
+ comment: 73 pages +
+
+
+
+
+ + ☆ Large Language Models for Compiler Optimization + + +
+ We explore the novel application of Large Language Models to code +optimization. We present a 7B-parameter transformer model trained from scratch +to optimize LLVM assembly for code size. The model takes as input unoptimized +assembly and outputs a list of compiler options to best optimize the program. +Crucially, during training, we ask the model to predict the instruction counts +before and after optimization, and the optimized code itself. These auxiliary +learning tasks significantly improve the optimization performance of the model +and improve the model's depth of understanding. + We evaluate on a large suite of test programs. Our approach achieves a 3.0% +improvement in reducing instruction counts over the compiler, outperforming two +state-of-the-art baselines that require thousands of compilations. Furthermore, +the model shows surprisingly strong code reasoning abilities, generating +compilable code 91% of the time and perfectly emulating the output of the +compiler 70% of the time. + +
+
+
+
+
+ + ♻ ☆ The EarlyBIRD Catches the Bug: On Exploiting Early Layers of Encoder + Models for More Efficient Code Classification + + +
+ The use of modern Natural Language Processing (NLP) techniques has been shown to be beneficial for software engineering tasks, such as vulnerability detection and type inference. However, training deep NLP models requires significant computational resources. This paper explores techniques that aim at achieving the best usage of resources and available information in these models. We propose a generic approach, EarlyBIRD, to build composite representations of code from the early layers of a pre-trained transformer model. We empirically investigate the viability of this approach on the CodeBERT model by comparing the performance of 12 strategies for creating composite representations with the standard practice of only using the last encoder layer. Our evaluation on four datasets shows that several early layer combinations yield better performance on defect detection, and some combinations improve multi-class classification. More specifically, we obtain an average improvement of +2 in detection accuracy on Devign with only 3 out of 12 layers of CodeBERT and a 3.3x speed-up of fine-tuning. These findings show that early layers can be used to obtain better results using the same resources, as well as to reduce resource usage during fine-tuning and inference. +
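One simple way to build such a composite representation is to expose all hidden states of the encoder and pool the [CLS] vectors of the first few layers. The sketch below does this with the public microsoft/codebert-base checkpoint; averaging the first k layers is just one of many possible combination strategies and is an assumption, not necessarily the best-performing strategy from the paper.

```python
import torch
from transformers import AutoTokenizer, AutoModel

tokenizer = AutoTokenizer.from_pretrained("microsoft/codebert-base")
model = AutoModel.from_pretrained("microsoft/codebert-base",
                                  output_hidden_states=True)

def early_layer_representation(code: str, k: int = 3) -> torch.Tensor:
    """Composite code representation from the first k encoder layers."""
    inputs = tokenizer(code, return_tensors="pt", truncation=True)
    with torch.no_grad():
        hidden_states = model(**inputs).hidden_states  # embeddings + 12 layers
    # hidden_states[1:k+1] are the first k encoder layers; take their [CLS] tokens.
    cls_vectors = torch.stack([h[:, 0, :] for h in hidden_states[1:k + 1]])
    return cls_vectors.mean(dim=0)                     # average as the composite vector

rep = early_layer_representation("int add(int a, int b) { return a + b; }")
```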
+
+ comment: The content in this pre-print is the same as in the CRC accepted for + publication in the ACM Joint European Software Engineering Conference and + Symposium on the Foundations of Software Engineering (ESEC/FSE 2023) +
+
+
+
+
+ + ♻ ☆ Open Problems and Fundamental Limitations of Reinforcement Learning from + Human Feedback + + +
+ Reinforcement learning from human feedback (RLHF) is a technique for training +AI systems to align with human goals. RLHF has emerged as the central method +used to finetune state-of-the-art large language models (LLMs). Despite this +popularity, there has been relatively little public work systematizing its +flaws. In this paper, we (1) survey open problems and fundamental limitations +of RLHF and related methods; (2) overview techniques to understand, improve, +and complement RLHF in practice; and (3) propose auditing and disclosure +standards to improve societal oversight of RLHF systems. Our work emphasizes +the limitations of RLHF and highlights the importance of a multi-faceted +approach to the development of safer AI systems. + +
+
+
+
+
+ + ♻ ☆ Task-Based MoE for Multitask Multilingual Machine Translation + + +
+ The mixture-of-experts (MoE) architecture has proven to be a powerful method for training deep models across diverse tasks and applications. However, current MoE implementations are task agnostic, treating all tokens from different tasks in the same manner. In this work, we instead design a novel method that incorporates task information into MoE models at different levels of granularity with shared dynamic task-based adapters. Our experiments and analysis show the advantages of our approaches over the dense and canonical MoE models on multi-task multilingual machine translation. With task-specific adapters, our models can additionally generalize to new tasks efficiently. +
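Task-aware routing of this kind can be illustrated with a gate that sees both the token representation and a learned task embedding, so that different tasks can prefer different experts. The module below is a minimal, deliberately unoptimised sketch under that assumption; the layer sizes, top-1 routing, and the absence of the shared dynamic adapters are simplifications rather than the paper's design.

```python
import torch
import torch.nn as nn

class TaskConditionedMoE(nn.Module):
    """Minimal sketch of task-aware expert routing (illustrative only)."""
    def __init__(self, d_model=512, n_experts=4, n_tasks=3, top_k=1):
        super().__init__()
        self.task_emb = nn.Embedding(n_tasks, d_model)
        self.gate = nn.Linear(2 * d_model, n_experts)
        self.experts = nn.ModuleList([
            nn.Sequential(nn.Linear(d_model, 4 * d_model), nn.ReLU(),
                          nn.Linear(4 * d_model, d_model))
            for _ in range(n_experts)])
        self.top_k = top_k

    def forward(self, x, task_id):                        # x: (batch, seq, d_model)
        t = self.task_emb(task_id)[:, None, :].expand_as(x)
        scores = torch.softmax(self.gate(torch.cat([x, t], dim=-1)), dim=-1)
        weights, idx = scores.topk(self.top_k, dim=-1)    # route each token
        out = torch.zeros_like(x)
        for k in range(self.top_k):
            for e, expert in enumerate(self.experts):
                mask = (idx[..., k] == e).unsqueeze(-1).float()
                out = out + mask * weights[..., k:k + 1] * expert(x)
        return out
```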
+
+
+
+
+ + ♻ ☆ Preventing Verbatim Memorization in Language Models Gives a False Sense + of Privacy + + +
+ Studying data memorization in neural language models helps us understand the +risks (e.g., to privacy or copyright) associated with models regurgitating +training data and aids in the development of countermeasures. Many prior works +-- and some recently deployed defenses -- focus on "verbatim memorization", +defined as a model generation that exactly matches a substring from the +training set. We argue that verbatim memorization definitions are too +restrictive and fail to capture more subtle forms of memorization. +Specifically, we design and implement an efficient defense that perfectly +prevents all verbatim memorization. And yet, we demonstrate that this "perfect" +filter does not prevent the leakage of training data. Indeed, it is easily +circumvented by plausible and minimally modified "style-transfer" prompts -- +and in some cases even the non-modified original prompts -- to extract +memorized information. We conclude by discussing potential alternative +definitions and why defining memorization is a difficult yet crucial open +question for neural language models. + +
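The idea of a verbatim-memorization filter can be made concrete with a simple n-gram index over the training data: any generation containing an n-token window that appears verbatim in the training set is blocked. The sketch below is exactly such a naive filter (the window length and whitespace tokenization are assumptions, not the paper's efficient implementation); the paper's point is that even a perfect filter of this kind still leaks training data through paraphrased, style-transferred completions.

```python
def build_ngram_index(training_docs, n=10):
    """Index every n-token window of the training data (simplified stand-in)."""
    seen = set()
    for doc in training_docs:
        tokens = doc.split()
        for i in range(len(tokens) - n + 1):
            seen.add(tuple(tokens[i:i + n]))
    return seen

def blocks_verbatim(generated, seen, n=10):
    """True if any n-gram of the generation matches the training set verbatim.

    This 'perfect' verbatim filter is the kind of defense the paper argues
    gives a false sense of privacy: paraphrased leakage passes right through.
    """
    tokens = generated.split()
    return any(tuple(tokens[i:i + n]) in seen
               for i in range(len(tokens) - n + 1))
```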
+
+
+
+
+ + ♻ ☆ Can Deep Neural Networks Predict Data Correlations from Column Names? + + +
+ Recent publications suggest using natural language analysis on database schema elements to guide tuning and profiling efforts. The underlying hypothesis is that state-of-the-art language processing methods, so-called language models, are able to extract information on data properties from schema text. This paper examines that hypothesis in the context of data correlation analysis: is it possible to find column pairs with correlated data by analyzing their names via language models? First, the paper introduces a novel benchmark for data correlation analysis, created by analyzing thousands of Kaggle data sets (and available for download). Second, it uses that data to study the ability of language models to predict correlation, based on column names. The analysis covers different language models, various correlation metrics, and a multitude of accuracy metrics. It pinpoints factors that contribute to successful predictions, such as the length of column names as well as the ratio of words. Finally, the study analyzes the impact of column types on prediction performance. The results show that schema text can be a useful source of information and inform future research efforts, targeted at NLP-enhanced database tuning and data profiling. +
+
+
+
+
+ + ♻ ☆ From Probabilistic Programming to Complexity-based Programming ECAI 2023 + + +
+ The paper presents the main characteristics and a preliminary implementation +of a novel computational framework named CompLog. Inspired by probabilistic +programming systems like ProbLog, CompLog builds upon the inferential +mechanisms proposed by Simplicity Theory, relying on the computation of two +Kolmogorov complexities (here implemented as min-path searches via ASP +programs) rather than probabilistic inference. The proposed system enables +users to compute ex-post and ex-ante measures of unexpectedness of a certain +situation, mapping respectively to posterior and prior subjective +probabilities. The computation is based on the specification of world and +mental models by means of causal and descriptive relations between predicates +weighted by complexity. The paper illustrates a few examples of application: +generating relevant descriptions, and providing alternative approaches to +disjunction and to negation. + +
+
+ comment: paper accepted at HYDRA workshop at ECAI 2023 +
+
+
+
+
+ + ♻ ☆ ARNOLD: A Benchmark for Language-Grounded Task Learning With Continuous + States in Realistic 3D Scenes ICCV 2023 + + +
+ Understanding the continuous states of objects is essential for task learning +and planning in the real world. However, most existing task learning benchmarks +assume discrete (e.g., binary) object goal states, which poses challenges for +the learning of complex tasks and transferring learned policy from simulated +environments to the real world. Furthermore, state discretization limits a +robot's ability to follow human instructions based on the grounding of actions +and states. To tackle these challenges, we present ARNOLD, a benchmark that +evaluates language-grounded task learning with continuous states in realistic +3D scenes. ARNOLD is comprised of 8 language-conditioned tasks that involve +understanding object states and learning policies for continuous goals. To +promote language-instructed learning, we provide expert demonstrations with +template-generated language descriptions. We assess task performance by +utilizing the latest language-conditioned policy learning models. Our results +indicate that current models for language-conditioned manipulations continue to +experience significant challenges in novel goal-state generalizations, scene +generalizations, and object generalizations. These findings highlight the need +to develop new algorithms that address this gap and underscore the potential +for further research in this area. Project website: +https://arnold-benchmark.github.io. + +
+
comment: The first two authors contributed equally; 20 pages; 17 figures; project available: https://arnold-benchmark.github.io/ ICCV 2023
+
+
+
+
+ + ♻ ☆ Document Understanding Dataset and Evaluation (DUDE) ICCV 2023 + + +
+ We call on the Document AI (DocAI) community to reevaluate current +methodologies and embrace the challenge of creating more practically-oriented +benchmarks. Document Understanding Dataset and Evaluation (DUDE) seeks to +remediate the halted research progress in understanding visually-rich documents +(VRDs). We present a new dataset with novelties related to types of questions, +answers, and document layouts based on multi-industry, multi-domain, and +multi-page VRDs of various origins, and dates. Moreover, we are pushing the +boundaries of current methods by creating multi-task and multi-domain +evaluation setups that more accurately simulate real-world situations where +powerful generalization and adaptation under low-resource settings are desired. +DUDE aims to set a new standard as a more practical, long-standing benchmark +for the community, and we hope that it will lead to future extensions and +contributions that address real-world challenges. Finally, our work illustrates +the importance of finding more efficient ways to model language, images, and +layout in DocAI. + +
+
+ comment: Accepted at ICCV 2023 +
+
+
+
+
+ + ♻ ☆ Deduplicating and Ranking Solution Programs for Suggesting Reference + Solutions + + +
+ Referring to solution programs written by other users is helpful for learners in programming education. However, current online judge systems simply list all solution programs submitted by users as references, and the programs are sorted based on the submission date and time, execution time, or user rating, ignoring how helpful each program is as a reference. In addition, users struggle to refer to a variety of solution approaches since there are too many duplicated and near-duplicated programs. To motivate learners to refer to various solutions and learn better solution approaches, in this paper, we propose an approach to deduplicate and rank common solution programs in each programming problem. Inspired by the observation that a frequently duplicated program adopts a more common approach and can serve as a general reference, we remove near-duplicated solution programs and rank the unique programs based on the duplicate count. Experiments on the solution programs submitted to a real-world online judge system demonstrate that our deduplication reduces the number of programs by 60.20%, whereas the baseline reduces it by only 29.59%, meaning that users only need to refer to 39.80% of programs on average. Furthermore, our analysis shows that the top-10 ranked programs cover 29.95% of programs on average, indicating that users can grasp 29.95% of solution approaches by referring to only 10 programs. The proposed approach shows the potential to reduce learners' burden of referring to too many solutions and to motivate them to learn a variety of solution approaches. +
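The core procedure can be sketched in a few lines: normalize each program to a key that collapses near-duplicates, group submissions by that key, keep one representative per group, and rank representatives by group size. The normalization below (stripping comments and whitespace, assuming Python submissions) is a deliberately crude stand-in; the paper's actual near-duplicate detection is likely more sophisticated.

```python
import re
from collections import defaultdict

def normalize(program: str) -> str:
    """Crude near-duplicate key: drop comments and whitespace (assumption)."""
    code = re.sub(r"#.*", "", program)     # assumes Python-style comments
    return re.sub(r"\s+", "", code)

def dedup_and_rank(programs):
    """Group near-duplicates and rank unique programs by duplicate count."""
    groups = defaultdict(list)
    for p in programs:
        groups[normalize(p)].append(p)
    ranked = sorted(groups.values(), key=len, reverse=True)
    # One representative per approach, most common approaches first.
    return [(group[0], len(group)) for group in ranked]
```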
+
+ comment: 7 pages, 5 figures, accepted to ASSE 2023 +
+
+
+
+
+ + ♻ ☆ GRASS: Unified Generation Model for Speech-to-Semantic Tasks + + +
+ This paper explores the instruction fine-tuning technique for +speech-to-semantic tasks by introducing a unified end-to-end (E2E) framework +that generates target text conditioned on a task-related prompt for audio data. +We pre-train the model using large and diverse data, where instruction-speech +pairs are constructed via a text-to-speech (TTS) system. Extensive experiments +demonstrate that our proposed model achieves state-of-the-art (SOTA) results on +many benchmarks covering speech named entity recognition, speech sentiment +analysis, speech question answering, and more, after fine-tuning. Furthermore, +the proposed model achieves competitive performance in zero-shot and few-shot +scenarios. To facilitate future work on instruction fine-tuning for +speech-to-semantic tasks, we release our instruction dataset and code. + +
+
+
+
+
+ + ♻ ☆ MLLM-DataEngine: An Iterative Refinement Approach for MLLM + + +
+ Despite the great advances of Multimodal Large Language Models (MLLMs) in both instruction dataset building and benchmarking, the independence of training and evaluation makes it hard for current MLLMs to further improve their capabilities under the guidance of evaluation results at a relatively low human cost. In this paper, we propose MLLM-DataEngine, a novel closed-loop system that bridges data generation, model training, and evaluation. Within each loop iteration, the MLLM-DataEngine first analyzes the weaknesses of the model based on the evaluation results, then generates a suitable incremental dataset for the next training iteration, enhancing the model's capability iteratively. Compared with previous data collection methods which are separate from the benchmarking, the data generated by MLLM-DataEngine shows better targeting, quality, and correctness. For targeting, we propose an Adaptive Bad-case Sampling module, which adjusts the ratio of different types of data within each incremental dataset based on the benchmarking results. For quality, we resort to GPT-4 to generate high-quality data for each given data type. For correctness, prompt design is critical for the data generation results. Rather than relying on previous hand-crafted prompts, we propose an Interactive Prompt Optimization strategy, which optimizes the prompt through multi-round interaction between human and GPT, greatly improving the correctness of the generated data. Through extensive experiments, we find that MLLM-DataEngine can boost the MLLM capability in a targeted and automatic manner, with only minimal human participation. We hope it can serve as a general solution for building future MLLMs. The MLLM-DataEngine has been open-sourced and is now available at https://github.com/opendatalab/MLLM-DataEngine. +
+
+ comment: Code and models are available at + https://github.com/opendatalab/MLLM-DataEngine +
+
+
+
+
+ + ♻ ☆ Applying QNLP to sentiment analysis in finance + + +
+ As an application domain where the slightest qualitative improvements can +yield immense value, finance is a promising candidate for early quantum +advantage. Focusing on the rapidly advancing field of Quantum Natural Language +Processing (QNLP), we explore the practical applicability of the two central +approaches DisCoCat and Quantum-Enhanced Long Short-Term Memory (QLSTM) to the +problem of sentiment analysis in finance. Utilizing a novel ChatGPT-based data +generation approach, we conduct a case study with more than 1000 realistic +sentences and find that QLSTMs can be trained substantially faster than +DisCoCat while also achieving close to classical results for their available +software implementations. + +
+
+
+
+
+ + ♻ ☆ $α$-$β$-Factorization and the Binary Case of Simon's Congruence + + +
+ In 1991 Hébrard introduced a factorization of words that turned out to be a powerful tool for the investigation of a word's scattered factors (also known as (scattered) subwords or subsequences). Based on this, first Karandikar and Schnoebelen introduced the notion of $k$-richness and later on Barker et al. the notion of $k$-universality. In 2022 Fleischmann et al. presented a generalization of the arch factorization by intersecting the arch factorization of a word and its reverse. While the authors merely used this factorization for the investigation of shortest absent scattered factors, in this work we investigate this new $\alpha$-$\beta$-factorization as such. We characterize the famous Simon congruence of $k$-universal words in terms of $1$-universal words. Moreover, we apply these results to binary words. In this special case, we obtain a full characterization of the classes and calculate the index of the congruence. Lastly, we start investigating the ternary case, present a full list of possibilities for $\alpha\beta\alpha$-factors, and characterize their congruence. +
+
+
+
+
+ + ♻ ☆ Self-Edit: Fault-Aware Code Editor for Code Generation ACL2023 + + +
+ Large language models (LLMs) have demonstrated an impressive ability to generate code for competitive programming tasks. However, with limited sample numbers, LLMs still suffer from poor accuracy. Inspired by the process of human programming, we propose a generate-and-edit approach named Self-Edit that utilizes execution results of the generated code from LLMs to improve the code quality on the competitive programming task. We execute the generated code on the example test case provided in the question and wrap execution results into a supplementary comment. Utilizing this comment as guidance, our fault-aware code editor is employed to correct errors in the generated code. We perform extensive evaluations across two competitive programming datasets with nine different LLMs. Compared to directly generating from LLMs, our approach can improve the average pass@1 by 89% on APPS-dev, 31% on APPS-test, and 48% on HumanEval over nine popular code generation LLMs with parameter sizes ranging from 110M to 175B. Compared to other post-processing methods, our method demonstrates superior accuracy and efficiency. +
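The execute-then-comment step described above can be sketched as two helpers: one runs the generated program on the example test case and captures its output, and the other wraps that result into a supplementary comment appended to the code before it is handed to the fault-aware editor. The harness and comment format below are illustrative assumptions, not the paper's exact pipeline.

```python
import subprocess
import tempfile
import textwrap

def execute_on_example(code: str, test_input: str, timeout: int = 5):
    """Run the generated program on the example test case (simplified harness)."""
    with tempfile.NamedTemporaryFile("w", suffix=".py", delete=False) as f:
        f.write(code)
        path = f.name
    try:
        proc = subprocess.run(["python", path], input=test_input,
                              capture_output=True, text=True, timeout=timeout)
        return proc.stdout, proc.stderr
    except subprocess.TimeoutExpired:
        return "", "TIMEOUT"

def build_editor_prompt(code, test_input, expected, stdout, stderr):
    """Wrap execution results into a supplementary comment for the editor.

    The comment wording is a hypothetical format, not the paper's template.
    """
    comment = textwrap.dedent(f"""\
        # Example input: {test_input!r}
        # Expected output: {expected!r}
        # Actual output: {stdout!r}
        # Errors: {stderr!r}""")
    return f"{code}\n\n{comment}\n# Please fix the code above based on the feedback."
```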
+
+ comment: Accepted by ACL2023 +
+
+
+
+
+ + ♻ ☆ Financial News Analytics Using Fine-Tuned Llama 2 GPT Model + + +
+ The paper considers the possibility of fine-tuning the Llama 2 GPT large language model (LLM) for the multitask analysis of financial news. For fine-tuning, a PEFT/LoRA-based approach was used. In the study, the model was fine-tuned for the following tasks: analysing a text from a financial market perspective, highlighting the main points of a text, summarizing a text, and extracting named entities with appropriate sentiments. The obtained results show that the fine-tuned Llama 2 model can perform multitask financial news analysis with a specified response structure: part of the response can be structured text, while another part can be in JSON format for further processing. Extracted sentiments for named entities can be considered as predictive features in supervised machine learning models with quantitative target variables. +
+
+
+
+
+ + ♻ ☆ CSPRD: A Financial Policy Retrieval Dataset for Chinese Stock Market + + +
+ In recent years, great advances in pre-trained language models (PLMs) have sparked considerable research focus and achieved promising performance on the approach of dense passage retrieval, which aims at retrieving relevant passages from a massive corpus for given questions. However, most existing datasets mainly benchmark the models with factoid queries of general commonsense, while specialised fields such as finance and economics remain unexplored due to the deficiency of large-scale and high-quality datasets with expert annotations. In this work, we propose a new task, policy retrieval, by introducing the Chinese Stock Policy Retrieval Dataset (CSPRD), which provides 700+ prospectus passages labeled by experienced experts with relevant articles from 10k+ entries in our collected Chinese policy corpus. Experiments on lexical, embedding and fine-tuned bi-encoder models show the effectiveness of our proposed CSPRD yet also suggest ample potential for improvement. Our best performing baseline achieves 56.1% MRR@10, 28.5% NDCG@10, 37.5% Recall@10 and 80.6% Precision@10 on the dev set. +
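For reference, the retrieval metrics quoted for the baselines can be computed from ranked candidate lists as in the generic sketch below; this is a standard MRR@k / Recall@k implementation, not the authors' evaluation script.

```python
def mrr_at_k(ranked_lists, relevant_sets, k=10):
    """Mean reciprocal rank of the first relevant passage within the top k."""
    total = 0.0
    for ranked, relevant in zip(ranked_lists, relevant_sets):
        for rank, doc_id in enumerate(ranked[:k], start=1):
            if doc_id in relevant:
                total += 1.0 / rank
                break
    return total / len(ranked_lists)

def recall_at_k(ranked_lists, relevant_sets, k=10):
    """Fraction of relevant passages retrieved within the top k."""
    hits, total = 0, 0
    for ranked, relevant in zip(ranked_lists, relevant_sets):
        hits += len(set(ranked[:k]) & relevant)
        total += len(relevant)
    return hits / total
```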
+
+
+
+
+ + ♻ ☆ Interdisciplinary Fairness in Imbalanced Research Proposal Topic + Inference: A Hierarchical Transformer-based Method with Selective + Interpolation + + +
+ Topic inference in research proposals aims to obtain the most suitable disciplinary division from the discipline system defined by a funding agency. The agency will subsequently find appropriate peer review experts from their database based on this division. Automated topic inference can reduce human errors caused by manual topic filling, bridge the knowledge gap between funding agencies and project applicants, and improve system efficiency. Existing methods focus on modeling this as a hierarchical multi-label classification problem, using generative models to iteratively infer the most appropriate topic information. However, these methods overlook the gap in scale between interdisciplinary research proposals and non-interdisciplinary ones, leading to an unjust phenomenon where the automated inference system categorizes interdisciplinary proposals as non-interdisciplinary, causing unfairness during the expert assignment. How can we address this data imbalance issue under a complex discipline system and hence resolve this unfairness? In this paper, we implement a topic label inference system based on a Transformer encoder-decoder architecture. Furthermore, we utilize interpolation techniques to create a series of pseudo-interdisciplinary proposals from non-interdisciplinary ones during training, based on non-parametric indicators such as cross-topic probabilities and topic occurrence probabilities. This approach aims to reduce the bias of the system during model training. Finally, we conduct extensive experiments on a real-world dataset to verify the effectiveness of the proposed method. The experimental results demonstrate that our training strategy can significantly mitigate the unfairness generated in the topic inference task. +
+
+ comment: 19 pages, Under review. arXiv admin note: text overlap with + arXiv:2209.13912 +
+
+
+
+
+ + ♻ ☆ What can we know about that which we cannot even imagine? + + +
+ In this essay I will consider a sequence of questions. The first questions concern the biological function of intelligence in general, and cognitive prostheses of human intelligence in particular. These will lead into questions concerning human language, perhaps the most important cognitive prosthesis humanity has ever developed. While it is traditional to rhapsodize about the cognitive power encapsulated in human language, I will emphasize how horribly limited human language is -- and therefore how limited our cognitive abilities are, despite their being augmented with language. This will lead to questions of whether human mathematics, being ultimately formulated in terms of human language, is also deeply limited. I will then combine these questions to pose a partial, sort-of, sideways answer to the guiding concern of this essay: what can we ever discern about that which we cannot even conceive? +
+
+ comment: 38 pages, 9 pages are references +
+
+
+
+
+ + ♻ ☆ FonMTL: Towards Multitask Learning for the Fon Language EMNLP 2023 + + +
+ The Fon language, spoken by approximately 2 million people, is a truly low-resourced African language, with a limited online presence and few existing datasets. Multitask learning is a learning paradigm that aims to improve the generalization capacity of a model by sharing knowledge across different but related tasks: this can be especially valuable in very data-scarce scenarios. In this paper, we present the first exploratory approach to multitask learning for enhancing model capabilities in Natural Language Processing for the Fon language. Specifically, we explore the tasks of Named Entity Recognition (NER) and Part of Speech Tagging (POS) for Fon. We leverage two language model heads as encoders to build shared representations for the inputs, and we use blocks of linear layers for classification for each task. Our results on the NER and POS tasks for Fon show competitive (or better) performance compared to several multilingual pretrained language models finetuned on single tasks. Additionally, we perform ablation studies comparing the efficiency of two different loss combination strategies and find that the equal loss weighting approach works best in our case. Our code is open-sourced at https://github.com/bonaventuredossou/multitask_fon. +
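The shared-representation, per-task-head setup with equal loss weighting can be sketched as below. For simplicity, the sketch uses a single shared encoder rather than the two language-model heads mentioned in the abstract, and the hidden size, label counts, and 0.5/0.5 weights are illustrative assumptions rather than the paper's configuration.

```python
import torch
import torch.nn as nn

class FonMultitaskHead(nn.Module):
    """Sketch of a shared encoder with NER and POS heads, equal loss weighting."""
    def __init__(self, encoder, hidden=768, n_ner=9, n_pos=17):
        super().__init__()
        self.encoder = encoder                  # assumed: returns (batch, seq, hidden)
        self.ner_head = nn.Linear(hidden, n_ner)
        self.pos_head = nn.Linear(hidden, n_pos)
        self.loss = nn.CrossEntropyLoss()

    def forward(self, tokens, ner_labels, pos_labels):
        h = self.encoder(tokens)                # shared token representations
        ner_loss = self.loss(self.ner_head(h).flatten(0, 1), ner_labels.flatten())
        pos_loss = self.loss(self.pos_head(h).flatten(0, 1), pos_labels.flatten())
        return 0.5 * ner_loss + 0.5 * pos_loss  # equal loss weighting
```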
+
+ comment: Accepted at WiNLP workshop, co-located at EMNLP 2023 +
+
+
+
+
+ + ♻ ☆ Towards training Bilingual and Code-Switched Speech Recognition models + from Monolingual data sources + + +
+ Multilingual Automatic Speech Recognition (ASR) models are capable of +transcribing audios across multiple languages, eliminating the need for +separate models. In addition, they can perform Language Identification (LID) +and handle code-switched speech. However, training these models requires +special code-switch and multilingual speech corpora which are sparsely +available. In this paper, we evaluate different approaches towards training of +bilingual as well as code-switched ASR models using purely monolingual data +sources. We introduce the concept of aggregate tokenizers that differs from the +current prevalent technique of generating LIDs at the boundaries of monolingual +samples and produces LID for each emitted token instead. We compare bilingual +and monolingual model performance, showcase the efficacy of aggregate +tokenizers, present a synthetic code-switched ASR data generation technique and +demonstrate the effectiveness of the proposed code-switched ASR models for the +tasks of speech recognition and spoken language identification. + +
+
+
+
+
+ + ♻ ☆ Think Before You Speak: Explicitly Generating Implicit Commonsense + Knowledge for Response Generation ACL 2022 + + +
+ Implicit knowledge, such as common sense, is key to fluid human +conversations. Current neural response generation (RG) models are trained to +generate responses directly, omitting unstated implicit knowledge. In this +paper, we present Think-Before-Speaking (TBS), a generative approach to first +externalize implicit commonsense knowledge (think) and use this knowledge to +generate responses (speak). We expect that externalizing implicit knowledge +allows more efficient learning, produces more informative responses, and +enables more explainable models. We analyze different choices to collect +knowledge-aligned dialogues, represent implicit knowledge, and transition +between knowledge and dialogues. Empirical results show TBS models outperform +end-to-end and knowledge-augmented RG baselines on most automatic metrics and +generate more informative, specific, and commonsense-following responses, as +evaluated by human annotators. TBS also generates knowledge that makes sense +and is relevant to the dialogue around 85\% of the time. + +
+
+ comment: Accepted at ACL 2022 main conference. 16 pages, 9 figures, 9 tables +
+
+
+
+
+ + ♻ ☆ ImageBind-LLM: Multi-modality Instruction Tuning + + +
+ We present ImageBind-LLM, a multi-modality instruction tuning method of large +language models (LLMs) via ImageBind. Existing works mainly focus on language +and image instruction tuning, different from which, our ImageBind-LLM can +respond to multi-modality conditions, including audio, 3D point clouds, video, +and their embedding-space arithmetic by only image-text alignment training. +During training, we adopt a learnable bind network to align the embedding space +between LLaMA and ImageBind's image encoder. Then, the image features +transformed by the bind network are added to word tokens of all layers in +LLaMA, which progressively injects visual instructions via an attention-free +and zero-initialized gating mechanism. Aided by the joint embedding of +ImageBind, the simple image-text training enables our model to exhibit superior +multi-modality instruction-following capabilities. During inference, the +multi-modality inputs are fed into the corresponding ImageBind encoders, and +processed by a proposed visual cache model for further cross-modal embedding +enhancement. The training-free cache model retrieves from three million image +features extracted by ImageBind, which effectively mitigates the +training-inference modality discrepancy. Notably, with our approach, +ImageBind-LLM can respond to instructions of diverse modalities and demonstrate +significant language generation quality. Code is released at +https://github.com/OpenGVLab/LLaMA-Adapter. + +
+
+ comment: Code is available at https://github.com/OpenGVLab/LLaMA-Adapter +
+
+
+
+
+ + ♻ ☆ ModuleFormer: Modularity Emerges from Mixture-of-Experts + + +
+ Large Language Models (LLMs) have achieved remarkable results. However, existing models are expensive to train and deploy, and it is also difficult to expand their knowledge beyond pre-training data without forgetting previous knowledge. This paper proposes a new neural network architecture, ModuleFormer, that leverages modularity to improve the efficiency and flexibility of large language models. ModuleFormer is based on the Sparse Mixture of Experts (SMoE). Unlike previous SMoE-based modular language models, which require domain-labeled data to learn domain-specific experts, ModuleFormer can induce modularity from uncurated data with its new load balancing and concentration losses. ModuleFormer is a modular architecture that includes two different types of modules: new stick-breaking attention heads and feedforward experts. Different modules are sparsely activated, conditioned on the input token, during training and inference. In our experiments, we found that the modular architecture enables three important abilities for large pre-trained language models: 1) Efficiency, since ModuleFormer only activates a subset of its modules for each input token, it can achieve the same performance as dense LLMs with more than twice the throughput; 2) Extendability, ModuleFormer is more immune to catastrophic forgetting than dense LLMs and can easily be extended with new modules to learn new knowledge that is not included in the training data; 3) Specialisation, finetuning ModuleFormer can specialize a subset of modules to the finetuning task, and the task-unrelated modules can easily be pruned for lightweight deployment. +
+
+
+
+
+ + ♻ ☆ PaLM 2 Technical Report + + +
+ We introduce PaLM 2, a new state-of-the-art language model that has better +multilingual and reasoning capabilities and is more compute-efficient than its +predecessor PaLM. PaLM 2 is a Transformer-based model trained using a mixture +of objectives. Through extensive evaluations on English and multilingual +language, and reasoning tasks, we demonstrate that PaLM 2 has significantly +improved quality on downstream tasks across different model sizes, while +simultaneously exhibiting faster and more efficient inference compared to PaLM. +This improved efficiency enables broader deployment while also allowing the +model to respond faster, for a more natural pace of interaction. PaLM 2 +demonstrates robust reasoning capabilities exemplified by large improvements +over PaLM on BIG-Bench and other reasoning tasks. PaLM 2 exhibits stable +performance on a suite of responsible AI evaluations, and enables +inference-time control over toxicity without additional overhead or impact on +other capabilities. Overall, PaLM 2 achieves state-of-the-art performance +across a diverse set of tasks and capabilities. + When discussing the PaLM 2 family, it is important to distinguish between +pre-trained models (of various sizes), fine-tuned variants of these models, and +the user-facing products that use these models. In particular, user-facing +products typically include additional pre- and post-processing steps. +Additionally, the underlying models may evolve over time. Therefore, one should +not expect the performance of user-facing products to exactly match the results +reported in this report. + +
+
+
+
+
+
+
+
+ + Computer Vision and Pattern Recognition 123 + +
+
+
+ + ☆ Robot Parkour Learning + + +
+ Parkour is a grand challenge for legged locomotion that requires robots to +overcome various obstacles rapidly in complex environments. Existing methods +can generate either diverse but blind locomotion skills or vision-based but +specialized skills by using reference animal data or complex rewards. However, +autonomous parkour requires robots to learn generalizable skills that are both +vision-based and diverse to perceive and react to various scenarios. In this +work, we propose a system for learning a single end-to-end vision-based parkour +policy of diverse parkour skills using a simple reward without any reference +motion data. We develop a reinforcement learning method inspired by direct +collocation to generate parkour skills, including climbing over high obstacles, +leaping over large gaps, crawling beneath low barriers, squeezing through thin +slits, and running. We distill these skills into a single vision-based parkour +policy and transfer it to a quadrupedal robot using its egocentric depth +camera. We demonstrate that our system can empower two different low-cost +robots to autonomously select and execute appropriate parkour skills to +traverse challenging real-world environments. + +
+
+ comment: CoRL 2023 (Oral). Project website at https://robot-parkour.github.io +
+
+
+
+
+ + ☆ Diffusion-Guided Reconstruction of Everyday Hand-Object Interaction + Clips ICCV23 + + +
+ We tackle the task of reconstructing hand-object interactions from short +video clips. Given an input video, our approach casts 3D inference as a +per-video optimization and recovers a neural 3D representation of the object +shape, as well as the time-varying motion and hand articulation. While the +input video naturally provides some multi-view cues to guide 3D inference, +these are insufficient on their own due to occlusions and limited viewpoint +variations. To obtain accurate 3D, we augment the multi-view signals with +generic data-driven priors to guide reconstruction. Specifically, we learn a +diffusion network to model the conditional distribution of (geometric) +renderings of objects conditioned on hand configuration and category label, and +leverage it as a prior to guide the novel-view renderings of the reconstructed +scene. We empirically evaluate our approach on egocentric videos across 6 +object categories, and observe significant improvements over prior single-view +and multi-view methods. Finally, we demonstrate our system's ability to +reconstruct arbitrary clips from YouTube, showing both 1st and 3rd person +interactions. + +
+
+ comment: Accepted to ICCV23 (Oral). Project Page: + https://judyye.github.io/diffhoi-www/ +
+
+
+
+
+ + ☆ ViHOPE: Visuotactile In-Hand Object 6D Pose Estimation with Shape + Completion + + +
+ In this letter, we introduce ViHOPE, a novel framework for estimating the 6D pose of an in-hand object using visuotactile perception. Our key insight is that the accuracy of the 6D object pose estimate can be improved by explicitly completing the shape of the object. To this end, we introduce a novel visuotactile shape completion module that uses a conditional Generative Adversarial Network to complete the shape of an in-hand object based on volumetric representation. This approach improves over prior works that directly regress visuotactile observations to a 6D pose. By explicitly completing the shape of the in-hand object and jointly optimizing the shape completion and pose estimation tasks, we improve the accuracy of the 6D object pose estimate. We train and test our model on a synthetic dataset and compare it with the state-of-the-art. In the visuotactile shape completion task, we outperform the state-of-the-art by 265% using the Intersection over Union metric and achieve 88% lower Chamfer Distance. In the visuotactile pose estimation task, we present results that suggest our framework reduces position and angular errors by 35% and 64%, respectively. Furthermore, we ablate our framework to confirm the gain on the 6D object pose estimate from explicitly completing the shape. Ultimately, we show that our framework produces models that are robust to sim-to-real transfer on a real-world robot platform. +
+
+ comment: Accepted by RA-L +
+
+
+
+
+ + ☆ An Effective Two-stage Training Paradigm Detector for Small Dataset + + +
+ Learning from a limited amount of labeled data to pre-train a model has always been viewed as a challenging task. In this report, an effective and robust solution, the two-stage training paradigm YOLOv8 detector (TP-YOLOv8), is designed for the object detection track in VIPriors Challenge 2023. First, the backbone of YOLOv8 is pre-trained as the encoder using the masked image modeling technique. Then the detector is fine-tuned with elaborate augmentations. During the test stage, test-time augmentation (TTA) is used to enhance each model, and weighted box fusion (WBF) is implemented to further boost the performance. With the well-designed structure, our approach has achieved 30.4% average precision from 0.50 to 0.95 on the DelftBikes test set, ranking 4th on the leaderboard. +
+
+ comment: 4 pages, 2 figures +
+
+
+
+
+ + ☆ CitDet: A Benchmark Dataset for Citrus Fruit Detection + + +
+ In this letter, we present a new dataset to advance the state of the art in detecting citrus fruit and accurately estimating yield on trees affected by the Huanglongbing (HLB) disease in orchard environments via imaging. Despite the fact that significant progress has been made in solving the fruit detection problem, the lack of publicly available datasets has complicated direct comparison of results. For instance, citrus detection has long been of interest in the agricultural research community, yet there is an absence of work, particularly involving public datasets of citrus affected by HLB. To address this issue, we enhance state-of-the-art object detection methods for use in typical orchard settings. Concretely, we provide high-resolution images of citrus trees located in an area known to be highly affected by HLB, along with high-quality bounding box annotations of citrus fruit. Fruit on both the trees and the ground are labeled to allow for identification of fruit location, which contributes to advancements in yield estimation and potential measure of HLB impact via fruit drop. The dataset consists of over 32,000 bounding box annotations for fruit instances contained in 579 high-resolution images. In summary, our contributions are the following: (i) we introduce a novel dataset along with baseline performance benchmarks on multiple contemporary object detection algorithms, (ii) we show the ability to accurately capture fruit location on tree or on ground, and finally (iii) we present a correlation of our results with yield estimations. +
+
+ comment: Submitted to IEEE Robotics and Automation Letters (RA-L) +
+
+
+
+
+ + ☆ Learning the Geodesic Embedding with Graph Neural Networks SIGGRAPH + + +
+ We present GeGnn, a learning-based method for computing the approximate geodesic distance between two arbitrary points on discrete polyhedral surfaces with constant time complexity after fast precomputation. Previous relevant methods either focus on computing the geodesic distance between a single source and all destinations, which has at least linear complexity, or require a long precomputation time. Our key idea is to train a graph neural network to embed an input mesh into a high-dimensional embedding space and compute the geodesic distance between a pair of points using the corresponding embedding vectors and a lightweight decoding function. To facilitate the learning of the embedding, we propose novel graph convolution and graph pooling modules that incorporate local geodesic information and are verified to be much more effective than previous designs. After training, our method requires only one forward pass of the network per mesh as precomputation. Then, we can compute the geodesic distance between a pair of points using our decoding function, which requires only several matrix multiplications and can be massively parallelized on GPUs. We verify the efficiency and effectiveness of our method on ShapeNet and demonstrate that our method is faster than existing methods by orders of magnitude while achieving comparable or better accuracy. Additionally, our method exhibits robustness on noisy and incomplete meshes and strong generalization ability on out-of-distribution meshes. The code and pretrained model can be found on https://github.com/IntelligentGeometry/GeGnn. +
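The constant-time query stage can be sketched as follows: once a GNN has embedded every mesh vertex, the geodesic distance for any vertex pair is decoded from the two embedding vectors alone. The small MLP decoder below, including its layer sizes and the symmetrisation trick, is an illustrative assumption rather than the paper's exact decoding function.

```python
import torch
import torch.nn as nn

class GeodesicDecoder(nn.Module):
    """Sketch of a lightweight pairwise decoder over precomputed vertex embeddings."""
    def __init__(self, emb_dim=128):
        super().__init__()
        self.mlp = nn.Sequential(nn.Linear(2 * emb_dim, 256), nn.ReLU(),
                                 nn.Linear(256, 1))

    def forward(self, emb, src_idx, dst_idx):      # emb: (num_vertices, emb_dim)
        pair = torch.cat([emb[src_idx], emb[dst_idx]], dim=-1)
        pair_rev = torch.cat([emb[dst_idx], emb[src_idx]], dim=-1)
        # Symmetrise so that the predicted d(a, b) equals d(b, a).
        return 0.5 * (self.mlp(pair) + self.mlp(pair_rev)).squeeze(-1)
```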
+
+ comment: SIGGRAPH Asia 2023, Journal Track +
+
+
+
+
+ + ☆ Temporal Action Localization with Enhanced Instant Discriminability CVPR + + +
+ Temporal action detection (TAD) aims to detect all action boundaries and +their corresponding categories in an untrimmed video. The unclear boundaries of +actions in videos often result in imprecise predictions of action boundaries by +existing methods. To resolve this issue, we propose a one-stage framework named +TriDet. First, we propose a Trident-head to model the action boundary via an +estimated relative probability distribution around the boundary. Then, we +analyze the rank-loss problem (i.e. instant discriminability deterioration) in +transformer-based methods and propose an efficient scalable-granularity +perception (SGP) layer to mitigate this issue. To further push the limit of +instant discriminability in the video backbone, we leverage the strong +representation capability of pretrained large models and investigate their +performance on TAD. Last, considering the adequate spatial-temporal context for +classification, we design a decoupled feature pyramid network with separate +feature pyramids to incorporate rich spatial context from the large model for +localization. Experimental results demonstrate the robustness of TriDet and its +state-of-the-art performance on multiple TAD datasets, including hierarchical +(multilabel) TAD datasets. + +
+
+ comment: An extended version of the CVPR paper arXiv:2303.07347, submitted to + IJCV +
+
+
+
+
+ + ☆ UniSeg: A Unified Multi-Modal LiDAR Segmentation Network and the + OpenPCSeg Codebase ICCV 2023 + + +
+ Point-, voxel-, and range-views are three representative forms of point clouds. All of them have accurate 3D measurements but lack color and texture information. RGB images are a natural complement to these point cloud views, and fully utilizing their comprehensive information enables more robust perception. In this paper, we present a unified multi-modal LiDAR segmentation network, termed UniSeg, which leverages the information of RGB images and three views of the point cloud, and accomplishes semantic segmentation and panoptic segmentation simultaneously. Specifically, we first design the Learnable cross-Modal Association (LMA) module to automatically fuse voxel-view and range-view features with image features, which fully utilizes the rich semantic information of images and is robust to calibration errors. Then, the enhanced voxel-view and range-view features are transformed to the point space, where three views of point cloud features are further fused adaptively by the Learnable cross-View Association module (LVA). Notably, UniSeg achieves promising results in three public benchmarks, i.e., SemanticKITTI, nuScenes, and Waymo Open Dataset (WOD); it ranks 1st on two challenges of two benchmarks, including the LiDAR semantic segmentation challenge of nuScenes and the panoptic segmentation challenge of SemanticKITTI. Besides, we construct the OpenPCSeg codebase, which is the largest and most comprehensive outdoor LiDAR segmentation codebase. It contains most of the popular outdoor LiDAR segmentation algorithms and provides reproducible implementations. The OpenPCSeg codebase will be made publicly available at https://github.com/PJLab-ADG/PCSeg. +
+
+ comment: ICCV 2023; 21 pages; 9 figures; 18 tables; Code at + https://github.com/PJLab-ADG/PCSeg +
+
+
+
+
+ + ☆ ITI-GEN: Inclusive Text-to-Image Generation ICCV 2023 + + +
+ Text-to-image generative models often reflect the biases of the training +data, leading to unequal representations of underrepresented groups. This study +investigates inclusive text-to-image generative models that generate images +based on human-written prompts and ensure the resulting images are uniformly +distributed across attributes of interest. Unfortunately, directly expressing +the desired attributes in the prompt often leads to sub-optimal results due to +linguistic ambiguity or model misrepresentation. Hence, this paper proposes a +drastically different approach that adheres to the maxim that "a picture is +worth a thousand words". We show that, for some attributes, images can +represent concepts more expressively than text. For instance, categories of +skin tones are typically hard to specify by text but can be easily represented +by example images. Building upon these insights, we propose a novel approach, +ITI-GEN, that leverages readily available reference images for Inclusive +Text-to-Image GENeration. The key idea is learning a set of prompt embeddings +to generate images that can effectively represent all desired attribute +categories. More importantly, ITI-GEN requires no model fine-tuning, making it +computationally efficient to augment existing text-to-image models. Extensive +experiments demonstrate that ITI-GEN largely improves over state-of-the-art +models to generate inclusive images from a prompt. Project page: +https://czhang0528.github.io/iti-gen. + +
+
+ comment: Accepted to ICCV 2023 (Oral Presentation) +
+
+
+
+
+ + ☆ OpenFashionCLIP: Vision-and-Language Contrastive Learning with + Open-Source Fashion Data + + +
+ The inexorable growth of online shopping and e-commerce demands scalable and robust machine learning-based solutions to accommodate customer requirements. In the context of automatic tagging classification and multimodal retrieval, prior works either defined supervised learning approaches with low generalizability or more reusable CLIP-based techniques that were, however, trained on closed-source data. In this work, we propose OpenFashionCLIP, a vision-and-language contrastive learning method that only adopts open-source fashion data stemming from diverse domains, and characterized by varying degrees of specificity. Our approach is extensively validated across several tasks and benchmarks, and experimental results highlight a significant out-of-domain generalization capability and consistent improvements over state-of-the-art methods both in terms of accuracy and recall. Source code and trained models are publicly available at: https://github.com/aimagelab/open-fashion-clip. +
+
+ comment: International Conference on Image Analysis and Processing (ICIAP) + 2023 +
+
+
+
+
+ + ☆ Distance-Aware eXplanation Based Learning ICTAI 2023 + + +
+ eXplanation Based Learning (XBL) is an interactive learning approach that +provides a transparent method of training deep learning models by interacting +with their explanations. XBL augments loss functions to penalize a model based +on deviation of its explanations from user annotation of image features. The +literature on XBL mostly depends on the intersection of visual model +explanations and image feature annotations. We present a method to add a +distance-aware explanation loss to categorical losses that trains a learner to +focus on important regions of a training dataset. Distance is an appropriate +approach for calculating explanation loss since visual model explanations such +as Gradient-weighted Class Activation Mapping (Grad-CAMs) are not strictly +bounded as annotations and their intersections may not provide complete +information on the deviation of a model's focus from relevant image regions. In +addition to assessing our model using existing metrics, we propose an +interpretability metric for evaluating visual feature-attribution based model +explanations that is more informative of the model's performance than existing +metrics. We demonstrate performance of our proposed method on three image +classification tasks. + +
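A distance-aware explanation term can be added to a categorical loss as in the hedged sketch below: attention (e.g., a Grad-CAM map) is penalised in proportion to how far it falls from the user-annotated relevant regions. The penalty form, the precomputed distance map, and the weight lam are assumptions for illustration, not the paper's exact formulation.

```python
import torch
import torch.nn.functional as F

def distance_aware_xbl_loss(logits, labels, cam, distance_map, lam=0.1):
    """Categorical loss plus a distance-weighted explanation penalty.

    cam: Grad-CAM map, shape (batch, H, W), normalised to [0, 1].
    distance_map: per-pixel distance to the nearest annotated relevant region,
    precomputed from the user annotations (assumed available).
    """
    ce = F.cross_entropy(logits, labels)
    # Penalise model attention proportionally to its distance from annotations.
    explanation = (cam * distance_map).mean()
    return ce + lam * explanation
```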
+
+ comment: Accepted at the 35th IEEE International Conference on Tools with + Artificial Intelligence, ICTAI 2023 +
+
+
+
+
+ + ☆ PAI-Diffusion: Constructing and Serving a Family of Open Chinese + Diffusion Models for Text-to-image Synthesis on the Cloud + + +
+ Text-to-image synthesis for the Chinese language poses unique challenges due +to its large vocabulary size, and intricate character relationships. While +existing diffusion models have shown promise in generating images from textual +descriptions, they often neglect domain-specific contexts and lack robustness +in handling the Chinese language. This paper introduces PAI-Diffusion, a +comprehensive framework that addresses these limitations. PAI-Diffusion +incorporates both general and domain-specific Chinese diffusion models, +enabling the generation of contextually relevant images. It explores the +potential of using LoRA and ControlNet for fine-grained image style transfer +and image editing, empowering users with enhanced control over image +generation. Moreover, PAI-Diffusion seamlessly integrates with Alibaba Cloud's +Machine Learning Platform for AI, providing accessible and scalable solutions. +All the Chinese diffusion model checkpoints, LoRAs, and ControlNets, including +domain-specific ones, are publicly available. A user-friendly Chinese WebUI and +the diffusers-api elastic inference toolkit, also open-sourced, further +facilitate the easy deployment of PAI-Diffusion models in various environments, +making it a valuable resource for Chinese text-to-image synthesis. + +
+
+
+
+
+ + ☆ On the detection of Out-Of-Distribution samples in Multiple Instance + Learning + + +
+ The deployment of machine learning solutions in real-world scenarios often +involves addressing the challenge of out-of-distribution (OOD) detection. While +significant efforts have been devoted to OOD detection in classical supervised +settings, the context of weakly supervised learning, particularly the Multiple +Instance Learning (MIL) framework, remains under-explored. In this study, we +tackle this challenge by adapting post-hoc OOD detection methods to the MIL +setting while introducing a novel benchmark specifically designed to assess OOD +detection performance in weakly supervised scenarios. Extensive experiments +based on diverse public datasets do not reveal a single method with a clear +advantage over the others. Although DICE emerges as the best-performing method +overall, it exhibits significant shortcomings on some datasets, emphasizing the +complexity of this under-explored and challenging topic. Our findings shed +light on the complex nature of OOD detection under the MIL framework, +emphasizing the importance of developing novel, robust, and reliable methods +that can generalize effectively in a weakly supervised context. The code for +the paper is available here: https://github.com/loic-lb/OOD_MIL. + +
+
+
+
+
+ + ☆ ReSimAD: Zero-Shot 3D Domain Transfer for Autonomous Driving with Source + Reconstruction and Target Simulation + + +
+ Domain shifts such as sensor type changes and geographical situation variations are prevalent in Autonomous Driving (AD), which poses a challenge since an AD model relying on previous-domain knowledge can hardly be deployed directly to a new domain without additional costs. In this paper, we provide a new perspective and approach for alleviating domain shifts by proposing a Reconstruction-Simulation-Perception (ReSimAD) scheme. Specifically, the implicit reconstruction process is based on the knowledge from the previous old domain, aiming to convert the domain-related knowledge into domain-invariant representations, e.g., 3D scene-level meshes. Besides, the point clouds simulation process of multiple new domains is conditioned on the above reconstructed 3D meshes, where the target-domain-like simulation samples can be obtained, thus reducing the cost of collecting and annotating new-domain data for the subsequent perception process. For experiments, we consider different cross-domain situations such as Waymo-to-KITTI, Waymo-to-nuScenes, Waymo-to-ONCE, etc., to verify the zero-shot target-domain perception using ReSimAD. Results demonstrate that our method is beneficial to boost the domain generalization ability, even promising for 3D pre-training. +
+
comment: Code and simulated points are available at https://github.com/PJLab-ADG/3DTrans#resimad
+
+
+
+
+ + ☆ Stream-based Active Learning by Exploiting Temporal Properties in + Perception with Temporal Predicted Loss + + +
+ Active learning (AL) reduces the amount of labeled data needed to train a machine learning model by intelligently choosing which instances to label. Classic pool-based AL requires all data to be present in a datacenter, which can be challenging with the increasing amounts of data needed in deep learning. However, AL on mobile devices and robots, like autonomous cars, can filter the data from perception sensor streams before it reaches the datacenter. We exploited the temporal properties of such image streams in our work and proposed the novel temporal predicted loss (TPL) method. To evaluate the stream-based setting properly, we introduced the GTA V streets and the A2D2 streets datasets and made both publicly available. Our experiments showed that our approach significantly improves the diversity of the selection while being an uncertainty-based method. As pool-based approaches are more common in perception applications, we derived a concept for comparing pool-based and stream-based AL, where TPL outperformed state-of-the-art pool- or stream-based approaches for different models. TPL demonstrated a gain of 2.5 percentage points (pp) less required data while being significantly faster than pool-based methods. +
+
+
+
+
+ + ☆ Zero-Shot Co-salient Object Detection Framework + + +
+ Co-salient Object Detection (CoSOD) endeavors to replicate the human visual +system's capacity to recognize common and salient objects within a collection +of images. Despite recent advancements in deep learning models, these models +still rely on training with well-annotated CoSOD datasets. The exploration of +training-free zero-shot CoSOD frameworks has been limited. In this paper, +taking inspiration from the zero-shot transfer capabilities of foundational +computer vision models, we introduce the first zero-shot CoSOD framework that +harnesses these models without any training process. To achieve this, we +introduce two novel components in our proposed framework: the group prompt +generation (GPG) module and the co-saliency map generation (CMP) module. We +evaluate the framework's performance on widely-used datasets and observe +impressive results. Our approach surpasses existing unsupervised methods and +even outperforms fully supervised methods developed before 2020, while +remaining competitive with some fully supervised methods developed before 2022. + +
+
+
+
+
+ + ☆ Learning Semantic Segmentation with Query Points Supervision on Aerial + Images ICCV 2023 + + +
+ Semantic segmentation is crucial in remote sensing, where high-resolution satellite images are segmented into meaningful regions. Recent advancements in deep learning have significantly improved satellite image segmentation. However, most of these methods are trained in fully supervised settings that require high-quality pixel-level annotations, which are expensive and time-consuming to obtain. In this work, we present a weakly supervised learning algorithm to train semantic segmentation models that rely only on query point annotations instead of full mask labels. Our proposed approach performs accurate semantic segmentation and improves efficiency by significantly reducing the cost and time required for manual annotation. Specifically, we generate superpixels and extend the query point labels into those superpixels that group similar meaningful semantics. Then, we train semantic segmentation models, supervised with images partially labeled with the superpixel pseudo-labels. We benchmark our weakly supervised training approach on an aerial image dataset and different semantic segmentation architectures, showing that we can reach competitive performance compared to fully supervised training while reducing the annotation effort.
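A rough sketch of the point-to-superpixel label expansion described above, using SLIC superpixels. It is illustrative only; the superpixel method, parameters, and function names are assumptions, not the authors' code.

```python
# Illustrative sketch: expand sparse query-point labels into SLIC superpixels
# to build pseudo-masks for weakly supervised segmentation training.
import numpy as np
from skimage.segmentation import slic

def points_to_pseudo_mask(image, points, n_segments=600, ignore_index=255):
    """image: (H, W, 3) RGB array; points: list of ((row, col), class_id)."""
    segments = slic(image, n_segments=n_segments, compactness=10, start_label=0)
    pseudo = np.full(segments.shape, ignore_index, dtype=np.int64)
    for (r, c), cls in points:
        # spread each point label over the superpixel that contains it
        pseudo[segments == segments[r, c]] = cls
    return pseudo  # unlabeled superpixels keep ignore_index and are masked out in the loss

# usage (hypothetical): pseudo = points_to_pseudo_mask(aerial_img, [((120, 340), 1), ((50, 60), 0)])
```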
+
+ comment: Paper presented at the LXCV workshop at ICCV 2023 +
+
+
+
+
+ + ☆ Dual-view Curricular Optimal Transport for Cross-lingual Cross-modal + Retrieval + + +
+ Current research on cross-modal retrieval is mostly English-oriented, owing to the availability of a large number of English-oriented human-labeled vision-language corpora. In order to overcome the scarcity of non-English labeled data, cross-lingual cross-modal retrieval (CCR) has attracted increasing attention. Most CCR methods construct pseudo-parallel vision-language corpora via Machine Translation (MT) to achieve cross-lingual transfer. However, the translated sentences from MT are generally imperfect in describing the corresponding visual contents. Improperly assuming the pseudo-parallel data are correctly correlated will make the networks overfit to the noisy correspondence. Therefore, we propose Dual-view Curricular Optimal Transport (DCOT) to learn with noisy correspondence in CCR. In particular, we quantify the confidence of the sample pair correlation with optimal transport theory from both the cross-lingual and cross-modal views, and design dual-view curriculum learning to dynamically model the transportation costs according to the learning stage of the two views. Extensive experiments are conducted on two multilingual image-text datasets and one video-text dataset, and the results demonstrate the effectiveness and robustness of the proposed method. Besides, our proposed method also extends well to cross-lingual image-text baselines and generalizes decently to out-of-domain data.
+
+
+
+
+ + ☆ Panoptic Vision-Language Feature Fields + + +
+ Recently, methods have been proposed for 3D open-vocabulary semantic segmentation. Such methods are able to segment scenes into arbitrary classes given at run-time using their text descriptions. In this paper, we propose, to our knowledge, the first algorithm for open-vocabulary panoptic segmentation, simultaneously performing both semantic and instance segmentation. Our algorithm, Panoptic Vision-Language Feature Fields (PVLFF), learns a feature field of the scene, jointly learning vision-language features and hierarchical instance features through a contrastive loss function from 2D instance segment proposals on input frames. Our method achieves comparable performance against state-of-the-art closed-set 3D panoptic systems on the HyperSim, ScanNet and Replica datasets and outperforms current 3D open-vocabulary systems in terms of semantic segmentation. We additionally ablate our method to demonstrate the effectiveness of our model architecture. Our code will be available at https://github.com/ethz-asl/autolabel.
+
+ comment: This work has been submitted to the IEEE for possible publication. + Copyright may be transferred without notice, after which this version may no + longer be accessible +
+
+
+
+
+ + ☆ A Localization-to-Segmentation Framework for Automatic Tumor + Segmentation in Whole-Body PET/CT Images + + +
+ Fluorodeoxyglucose (FDG) positron emission tomography (PET) combined with computed tomography (CT) is considered the primary solution for detecting some cancers, such as lung cancer and melanoma. Automatic segmentation of tumors in PET/CT images can help reduce doctors' workload, thereby improving diagnostic quality. However, precise tumor segmentation is challenging due to the small size of many tumors and the similarity of high-uptake normal areas to the tumor regions. To address these issues, this paper proposes a localization-to-segmentation framework (L2SNet) for precise tumor segmentation. L2SNet first localizes the possible lesions in the lesion localization phase and then uses the location cues to shape the segmentation results in the lesion segmentation phase. To further improve the segmentation performance of L2SNet, we design an adaptive threshold scheme that takes the segmentation results of the two phases into consideration. The experiments with the MICCAI 2023 Automated Lesion Segmentation in Whole-Body FDG-PET/CT challenge dataset show that our method achieved a competitive result and was ranked in the top 7 methods on the preliminary test set. Our work is available at: https://github.com/MedCAI/L2SNet.
+
+ comment: 7 pages, 3 figures +
+
+
+
+
+ + ☆ Towards Content-based Pixel Retrieval in Revisited Oxford and Paris + + +
+ This paper introduces the first two pixel retrieval benchmarks. Pixel +retrieval is segmented instance retrieval. Like semantic segmentation extends +classification to the pixel level, pixel retrieval is an extension of image +retrieval and offers information about which pixels are related to the query +object. In addition to retrieving images for the given query, it helps users +quickly identify the query object in true positive images and exclude false +positive images by denoting the correlated pixels. Our user study results show +pixel-level annotation can significantly improve the user experience. + Compared with semantic and instance segmentation, pixel retrieval requires a +fine-grained recognition capability for variable-granularity targets. To this +end, we propose pixel retrieval benchmarks named PROxford and PRParis, which +are based on the widely used image retrieval datasets, ROxford and RParis. +Three professional annotators label 5,942 images with two rounds of +double-checking and refinement. Furthermore, we conduct extensive experiments +and analysis on the SOTA methods in image search, image matching, detection, +segmentation, and dense matching using our pixel retrieval benchmarks. Results +show that the pixel retrieval task is challenging to these approaches and +distinctive from existing problems, suggesting that further research can +advance the content-based pixel-retrieval and thus user search experience. The +datasets can be downloaded from +\href{https://github.com/anguoyuan/Pixel_retrieval-Segmented_instance_retrieval}{this +link}. + +
+
+
+
+
+ + ☆ FlowIBR: Leveraging Pre-Training for Efficient Neural Image-Based + Rendering of Dynamic Scenes + + +
+ We introduce a novel approach for monocular novel view synthesis of dynamic +scenes. Existing techniques already show impressive rendering quality but tend +to focus on optimization within a single scene without leveraging prior +knowledge. This limitation has been primarily attributed to the lack of +datasets of dynamic scenes available for training and the diversity of scene +dynamics. Our method FlowIBR circumvents these issues by integrating a neural +image-based rendering method, pre-trained on a large corpus of widely available +static scenes, with a per-scene optimized scene flow field. Utilizing this flow +field, we bend the camera rays to counteract the scene dynamics, thereby +presenting the dynamic scene as if it were static to the rendering network. The +proposed method reduces per-scene optimization time by an order of magnitude, +achieving comparable results to existing methods - all on a single +consumer-grade GPU. + +
+
+
+
+
+ + ☆ Treatment-aware Diffusion Probabilistic Model for Longitudinal MRI + Generation and Diffuse Glioma Growth Prediction + + +
+ Diffuse gliomas are malignant brain tumors that grow widespread through the +brain. The complex interactions between neoplastic cells and normal tissue, as +well as the treatment-induced changes often encountered, make glioma tumor +growth modeling challenging. In this paper, we present a novel end-to-end +network capable of generating future tumor masks and realistic MRIs of how the +tumor will look at any future time points for different treatment plans. Our +model is built upon cutting-edge diffusion probabilistic models and +deep-segmentation neural networks. We extended a diffusion model to include +sequential multi-parametric MRI and treatment information as conditioning input +to guide the generative diffusion process. This allows us to estimate tumor +growth at any given time point. We trained the model using real-world +postoperative longitudinal MRI data with glioma tumor growth trajectories +represented as tumor segmentation maps over time. The model has demonstrated +promising performance across a range of tasks, including the generation of +high-quality synthetic MRIs with tumor masks, time-series tumor segmentations, +and uncertainty estimation. Combined with the treatment-aware generated MRIs, +the tumor growth predictions with uncertainty estimates can provide useful +information for clinical decision-making. + +
+
+ comment: 13 pages, 10 figures, 2 tables, 2 algorithms, pre-print-v1 +
+
+
+
+
+ + ☆ Two-Stage Hybrid Supervision Framework for Fast, Low-resource, and + Accurate Organ and Pan-cancer Segmentation in Abdomen CT + + +
+ Abdominal organ and tumour segmentation has many important clinical applications, such as organ quantification, surgical planning, and disease diagnosis. However, manual assessment is inherently subjective with considerable inter- and intra-expert variability. In this paper, we propose a hybrid supervised framework, StMt, that integrates self-training and mean teacher for the segmentation of abdominal organs and tumors using partially labeled and unlabeled data. We introduce a two-stage segmentation pipeline and whole-volume-based input strategy to maximize segmentation accuracy while meeting the requirements of inference time and GPU memory usage. Experiments on the validation set of FLARE2023 demonstrate that our method achieves excellent segmentation performance as well as fast and low-resource model inference. Our method achieved an average DSC score of 89.79\% and 45.55\% for the organs and lesions on the validation set, and the average running time and area under the GPU memory-time curve are 11.25s and 9627.82MB, respectively.
+
+
+
+
+ + ☆ Robust Single Rotation Averaging Revisited + + +
+ In this work, we propose a novel method for robust single rotation averaging +that can efficiently handle an extremely large fraction of outliers. Our +approach is to minimize the total truncated least unsquared deviations (TLUD) +cost of geodesic distances. The proposed algorithm consists of three steps: +First, we consider each input rotation as a potential initial solution and +choose the one that yields the least sum of truncated chordal deviations. Next, +we obtain the inlier set using the initial solution and compute its chordal +$L_2$-mean. Finally, starting from this estimate, we iteratively compute the +geodesic $L_1$-mean of the inliers using the Weiszfeld algorithm on $SO(3)$. An +extensive evaluation shows that our method is robust against up to 99% outliers +given a sufficient number of accurate inliers, outperforming the current state +of the art. + +
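The three-step pipeline above maps naturally to a short script. The sketch below follows that outline (truncated-cost initialisation, mean of the inliers, then Weiszfeld iterations on SO(3)); the threshold, iteration count, and the quaternion-mean shortcut for the chordal L2 step are simplifications, not the paper's exact implementation.

```python
# Compact sketch of robust single rotation averaging (illustrative parameters).
import numpy as np
from scipy.spatial.transform import Rotation as R

def geo_dist(a, b):
    return (a.inv() * b).magnitude()                    # geodesic angle in radians

def single_rotation_average(rotations, thresh=0.1, iters=20):
    # 1) initialisation: the input rotation with the least truncated deviation sum
    costs = [sum(min(geo_dist(r, s), thresh) for s in rotations) for r in rotations]
    init = rotations[int(np.argmin(costs))]
    # 2) inlier set and an L2-style mean (quaternion averaging with sign alignment)
    inliers = [s for s in rotations if geo_dist(init, s) < thresh]
    quats = np.asarray([s.as_quat() for s in inliers])
    quats = quats * np.where(quats @ quats[0] >= 0, 1.0, -1.0)[:, None]
    est = R.from_quat(quats.mean(axis=0))
    # 3) Weiszfeld iterations for the geodesic L1-mean of the inliers
    for _ in range(iters):
        vs = np.asarray([(est.inv() * s).as_rotvec() for s in inliers])
        w = 1.0 / np.maximum(np.linalg.norm(vs, axis=1), 1e-9)
        est = est * R.from_rotvec((w[:, None] * vs).sum(axis=0) / w.sum())
    return est

true = R.random()
samples = [true * R.from_rotvec(0.02 * np.random.randn(3)) for _ in range(40)]
samples += [R.random() for _ in range(10)]              # gross outliers
print(geo_dist(single_rotation_average(samples), true)) # small residual angle
```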
+
+
+
+
+ + ☆ Collective PV-RCNN: A Novel Fusion Technique using Collective Detections + for Enhanced Local LiDAR-Based Perception SC 2023 + + +
+ Comprehensive perception of the environment is crucial for the safe operation of autonomous vehicles. However, the perception capabilities of autonomous vehicles are limited due to occlusions, limited sensor ranges, or environmental influences. Collective Perception (CP) aims to mitigate these problems by enabling the exchange of information between vehicles. A major challenge in CP is the fusion of the exchanged information. Due to the enormous bandwidth requirement of early fusion approaches and the interchangeability issues of intermediate fusion approaches, only the late fusion of shared detections is practical. Current late fusion approaches neglect valuable information for local detection, which is why we propose a novel fusion method to fuse the detections of cooperative vehicles within the local LiDAR-based detection pipeline. To this end, we present Collective PV-RCNN (CPV-RCNN), which extends the PV-RCNN++ framework to fuse collective detections. Code is available at https://github.com/ekut-es
+
+ comment: accepted at IEEE ITSC 2023 +
+
+
+
+
+ + ☆ CNN or ViT? Revisiting Vision Transformers Through the Lens of + Convolution + + +
+ The success of Vision Transformer (ViT) has been widely reported on a wide range of image recognition tasks. The merit of ViT over CNN has been largely attributed to large training datasets or auxiliary pre-training. Without pre-training, the performance of ViT on small datasets is limited because the global self-attention has limited capacity in local modeling. Towards boosting ViT on small datasets without pre-training, this work improves its local modeling by applying a weight mask on the original self-attention matrix. A straightforward way to locally adapt the self-attention matrix can be realized by an element-wise learnable weight mask (ELM), for which our preliminary experiments show promising results. However, such an element-wise learnable weight mask not only induces a non-trivial additional parameter overhead but also increases the optimization complexity. To this end, this work proposes a novel Gaussian mixture mask (GMM) in which one mask only has two learnable parameters and it can be conveniently used in any ViT variants whose attention mechanism allows the use of masks. Experimental results on multiple small datasets demonstrate the effectiveness of our proposed Gaussian mask for boosting ViTs essentially for free (almost zero additional parameter or computation cost). Our code will be publicly available at \href{https://github.com/CatworldLee/Gaussian-Mixture-Mask-Attention}{https://github.com/CatworldLee/Gaussian-Mixture-Mask-Attention}.
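To make the "two learnable parameters per mask" idea concrete, here is a hedged sketch of a Gaussian attention mask that biases attention logits by patch-to-patch distance. The exact parametrisation and mixture structure in the paper may differ; amplitude/bandwidth names and the grid size are assumptions.

```python
# Illustrative Gaussian attention mask with only two learnable scalars.
import torch
import torch.nn as nn

class GaussianAttentionMask(nn.Module):
    def __init__(self, grid_size):
        super().__init__()
        self.alpha = nn.Parameter(torch.tensor(1.0))    # amplitude
        self.sigma = nn.Parameter(torch.tensor(2.0))    # bandwidth (in patch units)
        ys, xs = torch.meshgrid(torch.arange(grid_size), torch.arange(grid_size), indexing="ij")
        coords = torch.stack([ys.flatten(), xs.flatten()], dim=-1).float()   # (N, 2)
        self.register_buffer("dist2", torch.cdist(coords, coords) ** 2)      # (N, N)

    def forward(self, attn_logits):                     # attn_logits: (B, heads, N, N)
        mask = self.alpha * torch.exp(-self.dist2 / (2 * self.sigma ** 2))
        return attn_logits + mask                       # bias attention toward nearby patches

logits = torch.randn(2, 4, 196, 196)                    # ViT with a 14x14 patch grid
attn = GaussianAttentionMask(grid_size=14)(logits).softmax(dim=-1)
```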
+
+
+
+
+ + ☆ Learning Geometric Representations of Objects via Interaction + + +
+ We address the problem of learning representations from observations of a +scene involving an agent and an external object the agent interacts with. To +this end, we propose a representation learning framework extracting the +location in physical space of both the agent and the object from unstructured +observations of arbitrary nature. Our framework relies on the actions performed +by the agent as the only source of supervision, while assuming that the object +is displaced by the agent via unknown dynamics. We provide a theoretical +foundation and formally prove that an ideal learner is guaranteed to infer an +isometric representation, disentangling the agent from the object and correctly +extracting their locations. We evaluate empirically our framework on a variety +of scenarios, showing that it outperforms vision-based approaches such as a +state-of-the-art keypoint extractor. We moreover demonstrate how the extracted +representations enable the agent to solve downstream tasks via reinforcement +learning in an efficient manner. + +
+
+
+
+
+ + ☆ PAg-NeRF: Towards fast and efficient end-to-end panoptic 3D + representations for agricultural robotics + + +
+ Precise scene understanding is key for most robot monitoring and intervention +tasks in agriculture. In this work we present PAg-NeRF which is a novel +NeRF-based system that enables 3D panoptic scene understanding. Our +representation is trained using an image sequence with noisy robot odometry +poses and automatic panoptic predictions with inconsistent IDs between frames. +Despite this noisy input, our system is able to output scene geometry, +photo-realistic renders and 3D consistent panoptic representations with +consistent instance IDs. We evaluate this novel system in a very challenging +horticultural scenario and in doing so demonstrate an end-to-end trainable +system that can make use of noisy robot poses rather than precise poses that +have to be pre-calculated. Compared to a baseline approach the peak signal to +noise ratio is improved from 21.34dB to 23.37dB while the panoptic quality +improves from 56.65% to 70.08%. Furthermore, our approach is faster and can be +tuned to improve inference time by more than a factor of 2 while being memory +efficient with approximately 12 times fewer parameters. + +
+
+
+
+
+ + ☆ MultIOD: Rehearsal-free Multihead Incremental Object Detector WACV 2024 + + +
+ Class-Incremental learning (CIL) is the ability of artificial agents to accommodate new classes as they appear in a stream. It is particularly interesting in evolving environments where agents have limited access to memory and computational resources. The main challenge of class-incremental learning is catastrophic forgetting, the inability of neural networks to retain past knowledge when learning new classes. Unfortunately, most existing class-incremental object detectors are built on two-stage algorithms such as Faster-RCNN and rely on rehearsal memory to retain past knowledge. We believe that the current benchmarks are not realistic, and more effort should be dedicated to anchor-free and rehearsal-free object detection. In this context, we propose MultIOD, a class-incremental object detector based on CenterNet. Our main contributions are: (1) we propose a multihead feature pyramid and multihead detection architecture to efficiently separate class representations, (2) we employ transfer learning between classes learned initially and those learned incrementally to tackle catastrophic forgetting, and (3) we use a class-wise non-max-suppression as a post-processing technique to remove redundant boxes. Without bells and whistles, our method outperforms a range of state-of-the-art methods on two Pascal VOC datasets.
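Contribution (3) is a standard post-processing step; a generic sketch of class-wise non-max suppression follows. This is a textbook utility rather than the MultIOD code (torchvision's batched_nms achieves the same effect by offsetting boxes per class).

```python
# Class-wise NMS: boxes of different classes never suppress each other.
import torch
from torchvision.ops import nms

def classwise_nms(boxes, scores, labels, iou_thresh=0.5):
    """boxes: (N, 4) xyxy; scores: (N,); labels: (N,) integer class ids."""
    keep = []
    for cls in labels.unique():
        idx = (labels == cls).nonzero(as_tuple=True)[0]
        kept = nms(boxes[idx], scores[idx], iou_thresh)   # NMS restricted to one class
        keep.append(idx[kept])
    keep = torch.cat(keep)
    return keep[scores[keep].argsort(descending=True)]    # sorted by confidence

boxes = torch.tensor([[0, 0, 10, 10], [1, 1, 11, 11], [50, 50, 60, 60.]])
scores = torch.tensor([0.9, 0.8, 0.7])
labels = torch.tensor([0, 0, 1])
print(classwise_nms(boxes, scores, labels))               # the second box is suppressed
```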
+
+ comment: Under review at the WACV 2024 conference +
+
+
+
+
+ + ☆ Diff-Privacy: Diffusion-based Face Privacy Protection + + +
+ Privacy protection has become a top priority as the proliferation of AI +techniques has led to widespread collection and misuse of personal data. +Anonymization and visual identity information hiding are two important facial +privacy protection tasks that aim to remove identification characteristics from +facial images at the human perception level. However, they have a significant +difference in that the former aims to prevent the machine from recognizing +correctly, while the latter needs to ensure the accuracy of machine +recognition. Therefore, it is difficult to train a model to complete these two +tasks simultaneously. In this paper, we unify the task of anonymization and +visual identity information hiding and propose a novel face privacy protection +method based on diffusion models, dubbed Diff-Privacy. Specifically, we train +our proposed multi-scale image inversion module (MSI) to obtain a set of SDM +format conditional embeddings of the original image. Based on the conditional +embeddings, we design corresponding embedding scheduling strategies and +construct different energy functions during the denoising process to achieve +anonymization and visual identity information hiding. Extensive experiments +have been conducted to validate the effectiveness of our proposed framework in +protecting facial privacy. + +
+
+ comment: 17 pages +
+
+
+
+
+ + ☆ Semantic Latent Decomposition with Normalizing Flows for Face Editing + + +
+ Navigating in the latent space of StyleGAN has shown effectiveness for face +editing. However, the resulting methods usually encounter challenges in +complicated navigation due to the entanglement among different attributes in +the latent space. To address this issue, this paper proposes a novel framework, +termed SDFlow, with a semantic decomposition in original latent space using +continuous conditional normalizing flows. Specifically, SDFlow decomposes the +original latent code into different irrelevant variables by jointly optimizing +two components: (i) a semantic encoder to estimate semantic variables from +input faces and (ii) a flow-based transformation module to map the latent code +into a semantic-irrelevant variable in Gaussian distribution, conditioned on +the learned semantic variables. To eliminate the entanglement between +variables, we employ a disentangled learning strategy under a mutual +information framework, thereby providing precise manipulation controls. +Experimental results demonstrate that SDFlow outperforms existing +state-of-the-art face editing methods both qualitatively and quantitatively. +The source code is made available at https://github.com/phil329/SDFlow. + +
+
+
+
+
+ + ☆ DeCUR: decoupling common & unique representations for multimodal + self-supervision + + +
+ The increasing availability of multi-sensor data sparks interest in +multimodal self-supervised learning. However, most existing approaches learn +only common representations across modalities while ignoring intra-modal +training and modality-unique representations. We propose Decoupling Common and +Unique Representations (DeCUR), a simple yet effective method for multimodal +self-supervised learning. By distinguishing inter- and intra-modal embeddings, +DeCUR is trained to integrate complementary information across different +modalities. We evaluate DeCUR in three common multimodal scenarios +(radar-optical, RGB-elevation, and RGB-depth), and demonstrate its consistent +benefits on scene classification and semantic segmentation downstream tasks. +Notably, we get straightforward improvements by transferring our pretrained +backbones to state-of-the-art supervised multimodal methods without any +hyperparameter tuning. Furthermore, we conduct a comprehensive explainability +analysis to shed light on the interpretation of common and unique features in +our multimodal approach. Codes are available at +\url{https://github.com/zhu-xlab/DeCUR}. + +
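A very rough sketch of the decoupling idea, under the assumption that each modality's embedding is simply split into a "common" half (aligned across modalities) and a "unique" half (decorrelated across modalities). The official DeCUR loss uses a different, cross-correlation-based formulation; this is only an intuition aid.

```python
# Hypothetical decoupling loss on two modality embeddings of the same scenes.
import torch
import torch.nn.functional as F

def decoupling_loss(z1, z2):
    """z1, z2: (B, D) embeddings from two modalities; first half common, second half unique."""
    d = z1.shape[1] // 2
    c1, u1 = z1[:, :d], z1[:, d:]
    c2, u2 = z2[:, :d], z2[:, d:]
    align = 1 - F.cosine_similarity(c1, c2, dim=1).mean()        # common parts should agree
    separate = F.cosine_similarity(u1, u2, dim=1).abs().mean()   # unique parts should not
    return align + separate

loss = decoupling_loss(torch.randn(8, 256), torch.randn(8, 256))
```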
+
+ comment: 19 pages, 10 figures +
+
+
+
+
+ + ☆ Task-driven Compression for Collision Encoding based on Depth Images + + +
+ This paper contributes a novel learning-based method for aggressive +task-driven compression of depth images and their encoding as images tailored +to collision prediction for robotic systems. A novel 3D image processing +methodology is proposed that accounts for the robot's size in order to +appropriately "inflate" the obstacles represented in the depth image and thus +obtain the distance that can be traversed by the robot in a collision-free +manner along any given ray within the camera frustum. Such depth-and-collision +image pairs are used to train a neural network that follows the architecture of +Variational Autoencoders to compress-and-transform the information in the +original depth image to derive a latent representation that encodes the +collision information for the given depth image. We compare our proposed +task-driven encoding method with classical task-agnostic methods and +demonstrate superior performance for the task of collision image prediction +from extremely low-dimensional latent spaces. A set of comparative studies show +that the proposed approach is capable of encoding depth image-and-collision +image tuples from complex scenes with thin obstacles at long distances better +than the classical methods at compression ratios as high as 4050:1. + +
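A crude pixel-space stand-in for the robot-size "inflation" step described above: take a minimum filter over the depth image with a window derived from the robot radius, so each ray stores a conservative collision-free distance. The per-ray 3D processing in the paper is more involved; the focal length and window heuristic here are assumptions.

```python
# Hypothetical single-window approximation of depth "inflation" for a finite-size robot.
import numpy as np
from scipy.ndimage import minimum_filter

def collision_image(depth, robot_radius_m, fx=386.0):
    """depth: (H, W) metric depth; window size set from the robot radius and a scene-level depth scale."""
    med = np.median(depth[depth > 0]) if np.any(depth > 0) else 1.0
    win = max(3, int(2 * robot_radius_m / med * fx) | 1)   # odd kernel size in pixels
    return minimum_filter(depth, size=win)                 # conservative traversable distance per ray

depth = np.random.uniform(0.5, 8.0, size=(240, 320)).astype(np.float32)
coll = collision_image(depth, robot_radius_m=0.3)
```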
+
+ comment: 14 pages, 5 figures. Accepted to the International Symposium on Visual Computing 2023 +
+
+
+
+
+ + ☆ Can you text what is happening? Integrating pre-trained language + encoders into trajectory prediction models for autonomous driving + + +
+ In autonomous driving tasks, scene understanding is the first step towards +predicting the future behavior of the surrounding traffic participants. Yet, +how to represent a given scene and extract its features are still open research +questions. In this study, we propose a novel text-based representation of +traffic scenes and process it with a pre-trained language encoder. + First, we show that text-based representations, combined with classical +rasterized image representations, lead to descriptive scene embeddings. Second, +we benchmark our predictions on the nuScenes dataset and show significant +improvements compared to baselines. Third, we show in an ablation study that a +joint encoder of text and rasterized images outperforms the individual encoders +confirming that both representations have their complementary strengths. + +
+
+
+
+
+ + ☆ Class-Incremental Grouping Network for Continual Audio-Visual Learning ICCV 2023 + + +
+ Continual learning is a challenging problem in which models need to be +trained on non-stationary data across sequential tasks for class-incremental +learning. While previous methods have focused on using either regularization or +rehearsal-based frameworks to alleviate catastrophic forgetting in image +classification, they are limited to a single modality and cannot learn compact +class-aware cross-modal representations for continual audio-visual learning. To +address this gap, we propose a novel class-incremental grouping network (CIGN) +that can learn category-wise semantic features to achieve continual +audio-visual learning. Our CIGN leverages learnable audio-visual class tokens +and audio-visual grouping to continually aggregate class-aware features. +Additionally, it utilizes class tokens distillation and continual grouping to +prevent forgetting parameters learned from previous tasks, thereby improving +the model's ability to capture discriminative audio-visual categories. We +conduct extensive experiments on VGGSound-Instruments, VGGSound-100, and +VGG-Sound Sources benchmarks. Our experimental results demonstrate that the +CIGN achieves state-of-the-art audio-visual class-incremental learning +performance. Code is available at https://github.com/stoneMo/CIGN. + +
+
+ comment: ICCV 2023. arXiv admin note: text overlap with arXiv:2303.17056 +
+
+
+
+
+ + ☆ Interactive Class-Agnostic Object Counting + + +
+ We propose a novel framework for interactive class-agnostic object counting, +where a human user can interactively provide feedback to improve the accuracy +of a counter. Our framework consists of two main components: a user-friendly +visualizer to gather feedback and an efficient mechanism to incorporate it. In +each iteration, we produce a density map to show the current prediction result, +and we segment it into non-overlapping regions with an easily verifiable number +of objects. The user can provide feedback by selecting a region with obvious +counting errors and specifying the range for the estimated number of objects +within it. To improve the counting result, we develop a novel adaptation loss +to force the visual counter to output the predicted count within the +user-specified range. For effective and efficient adaptation, we propose a +refinement module that can be used with any density-based visual counter, and +only the parameters in the refinement module will be updated during adaptation. +Our experiments on two challenging class-agnostic object counting benchmarks, +FSCD-LVIS and FSC-147, show that our method can reduce the mean absolute error +of multiple state-of-the-art visual counters by roughly 30% to 40% with minimal +user input. Our project can be found at +https://yifehuang97.github.io/ICACountProjectPage/. + +
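A minimal sketch of a range-based adaptation loss in the spirit described above: the predicted count over the user-selected region is penalised only when it falls outside the user-given range. The function name and the plain hinge form are assumptions, not the paper's exact loss.

```python
# Illustrative range-adaptation loss for a density-based counter.
import torch

def range_adaptation_loss(density_map, region_mask, lo, hi):
    """density_map: (H, W) predicted density; region_mask: (H, W) bool mask of the selected region."""
    count = density_map[region_mask].sum()
    return torch.relu(lo - count) + torch.relu(count - hi)   # zero whenever count is inside [lo, hi]

density = torch.rand(128, 128, requires_grad=True)
mask = torch.zeros(128, 128, dtype=torch.bool)
mask[20:60, 30:80] = True
loss = range_adaptation_loss(density, mask, lo=10.0, hi=15.0)
loss.backward()                                              # gradients flow only into the refinement parameters in practice
```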
+
+
+
+
+ + ☆ AutoFuse: Automatic Fusion Networks for Deformable Medical Image + Registration + + +
+ Deformable image registration aims to find a dense non-linear spatial +correspondence between a pair of images, which is a crucial step for many +medical tasks such as tumor growth monitoring and population analysis. +Recently, Deep Neural Networks (DNNs) have been widely recognized for their +ability to perform fast end-to-end registration. However, DNN-based +registration needs to explore the spatial information of each image and fuse +this information to characterize spatial correspondence. This raises an +essential question: what is the optimal fusion strategy to characterize spatial +correspondence? Existing fusion strategies (e.g., early fusion, late fusion) +were empirically designed to fuse information by manually defined prior +knowledge, which inevitably constrains the registration performance within the +limits of empirical designs. In this study, we depart from existing +empirically-designed fusion strategies and develop a data-driven fusion +strategy for deformable image registration. To achieve this, we propose an +Automatic Fusion network (AutoFuse) that provides flexibility to fuse +information at many potential locations within the network. A Fusion Gate (FG) +module is also proposed to control how to fuse information at each potential +network location based on training data. Our AutoFuse can automatically +optimize its fusion strategy during training and can be generalizable to both +unsupervised registration (without any labels) and semi-supervised registration +(with weak labels provided for partial training data). Extensive experiments on +two well-benchmarked medical registration tasks (inter- and intra-patient +registration) with eight public datasets show that our AutoFuse outperforms +state-of-the-art unsupervised and semi-supervised registration methods. + +
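One way to picture a fusion gate is as a learnable sigmoid weight that decides, per channel, how much of a jointly fused feature versus the unfused per-image feature passes on at a given network location. The sketch below is a generic gate of that kind; the actual FG design in AutoFuse may differ, and all layer choices here are assumptions.

```python
# Hypothetical per-channel fusion gate for a 3D (volumetric) registration network.
import torch
import torch.nn as nn

class FusionGate(nn.Module):
    def __init__(self, channels):
        super().__init__()
        self.fuse = nn.Conv3d(2 * channels, channels, kernel_size=3, padding=1)
        self.gate = nn.Parameter(torch.zeros(1, channels, 1, 1, 1))   # learned from training data

    def forward(self, feat_moving, feat_fixed):
        fused = self.fuse(torch.cat([feat_moving, feat_fixed], dim=1))
        g = torch.sigmoid(self.gate)
        return g * fused + (1 - g) * feat_moving   # blend fused and unfused paths per channel

x = torch.randn(1, 16, 24, 24, 24)
y = torch.randn(1, 16, 24, 24, 24)
out = FusionGate(16)(x, y)
```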
+
+ comment: Under Review +
+
+
+
+
+ + ☆ Diving into Darkness: A Dual-Modulated Framework for High-Fidelity + Super-Resolution in Ultra-Dark Environments + + +
+ Super-resolution of images captured in ultra-dark environments is a practical yet challenging problem that has received little attention. Due to uneven illumination and low signal-to-noise ratio in dark environments, a multitude of problems such as lack of detail and color distortion may be magnified in the super-resolution process compared to normal-lighting environments. Consequently, conventional low-light enhancement or super-resolution methods, whether applied individually or in a cascaded manner for such a problem, often encounter limitations in recovering luminance, color fidelity, and intricate details. To conquer these issues, this paper proposes a specialized dual-modulated learning framework that, for the first time, attempts to deeply dissect the nature of the low-light super-resolution task. Leveraging natural image color characteristics, we introduce a self-regularized luminance constraint as a prior for addressing uneven lighting. Expanding on this, we develop Illuminance-Semantic Dual Modulation (ISDM) components to enhance feature-level preservation of illumination and color details. Besides, instead of deploying naive up-sampling strategies, we design the Resolution-Sensitive Merging Up-sampler (RSMU) module that brings together different sampling modalities as substrates, effectively mitigating the presence of artifacts and halos. Comprehensive experiments showcase the applicability and generalizability of our approach to diverse and challenging ultra-low-light conditions, outperforming state-of-the-art methods with a notable improvement (i.e., $\uparrow$5\% in PSNR, and $\uparrow$43\% in LPIPS). Especially noteworthy is the 19-fold increase in the RMSE score, underscoring our method's exceptional generalization across different darkness levels. The code will be available online upon publication of the paper.
+
+ comment: 9 pages +
+
+
+
+
+ + ☆ A horizon line annotation tool for streamlining autonomous sea + navigation experiments + + +
+ Horizon line (or sea line) detection (HLD) is a critical component in multiple marine autonomous navigation tasks, such as identifying the navigation area (i.e., the sea), obstacle detection and geo-localization, and digital video stabilization. A recent survey highlighted several weaknesses of such detectors, particularly under sea conditions that are missing from the most extensive dataset currently used by HLD researchers. Experimental validation of more robust HLDs involves collecting an extensive set of these missing sea conditions and annotating each collected image with the correct position and orientation of the horizon line. The annotation task is daunting without a proper tool. Therefore, we present the first public annotation software with tailored features to make the sea line annotation process fast and easy. The software is available at: https://drive.google.com/drive/folders/1c0ZmvYDckuQCPIWfh_70P7E1A_DWlIvF?usp=sharing
+
+
+
+
+ + ☆ Gall Bladder Cancer Detection from US Images with Only Image Level + Labels MICCAI 2023 + + +
+ Automated detection of Gallbladder Cancer (GBC) from Ultrasound (US) images is an important problem, which has drawn increased interest from researchers. However, most of these works use difficult-to-acquire information such as bounding box annotations or additional US videos. In this paper, we focus on GBC detection using only image-level labels. Such annotation is usually available based on the diagnostic report of a patient, and does not require additional annotation effort from the physicians. However, our analysis reveals that it is difficult to train a standard image classification model for GBC detection. This is due to the low inter-class variance (a malignant region usually occupies only a small portion of a US image), high intra-class variance (due to the US sensor capturing a 2D slice of a 3D object leading to large viewpoint variations), and low training data availability. We posit that even when only the image-level label is available, formulating the problem as object detection (with bounding box output) helps a deep neural network (DNN) model focus on the relevant region of interest. Since no bounding box annotations are available for training, we pose the problem as weakly supervised object detection (WSOD). Motivated by the recent success of transformer models in object detection, we train one such model, DETR, using multi-instance-learning (MIL) with self-supervised instance selection to suit the WSOD task. Our proposed method demonstrates an improvement in AP and detection sensitivity over the SOTA transformer-based and CNN-based WSOD methods. Project page is at https://gbc-iitd.github.io/wsod-gbc
+
+ comment: Accepted at MICCAI 2023 +
+
+
+
+
+ + ☆ FusionFormer: A Multi-sensory Fusion in Bird's-Eye-View and Temporal + Consistent Transformer for 3D Objection + + +
+ Multi-sensor modal fusion has demonstrated strong advantages in 3D object detection tasks. However, existing methods that fuse multi-modal features through simple channel concatenation require transforming features into bird's eye view space and may lose information along the Z-axis, thus leading to inferior performance. To this end, we propose FusionFormer, an end-to-end multi-modal fusion framework that leverages transformers to fuse multi-modal features and obtain fused BEV features. Based on the flexible adaptability of FusionFormer to the input modality representation, we also propose a depth prediction branch that can be added to the framework to improve detection performance in camera-based detection tasks. In addition, we propose a plug-and-play temporal fusion module based on transformers that can fuse historical frame BEV features for more stable and reliable detection results. We evaluate our method on the nuScenes dataset and achieve 72.6% mAP and 75.1% NDS for 3D object detection tasks, outperforming state-of-the-art methods.
+
+
+
+
+ + ☆ Towards Better Data Exploitation In Self-Supervised Monocular Depth + Estimation + + +
+ Depth estimation plays an important role in the robotic perception system. The self-supervised monocular paradigm has gained significant attention since it can free training from the reliance on depth annotations. Despite recent advancements, existing self-supervised methods still underutilize the available training data, limiting their generalization ability. In this paper, we employ two data augmentation techniques, namely Resizing-Cropping and Splitting-Permuting, to fully exploit the potential of training datasets. Specifically, the original image and the two generated augmented images are fed into the training pipeline simultaneously and we leverage them to conduct self-distillation. Additionally, we introduce the detail-enhanced DepthNet with an extra full-scale branch in the encoder and a grid decoder to enhance the restoration of fine details in depth maps. Experimental results demonstrate that our method can achieve state-of-the-art performance on the KITTI benchmark, with both raw ground truth and improved ground truth. Moreover, our models also show superior generalization performance when transferring to the Make3D and NYUv2 datasets. Our codes are available at https://github.com/Sauf4896/BDEdepth.
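For intuition, a splitting-permuting style augmentation can be as simple as cutting the image at a random position and swapping the two pieces. The sketch below is a hypothetical version of such a transform; the actual policy (cut positions, axes, how pose and photometric targets are handled) in the paper may differ.

```python
# Illustrative splitting-permuting augmentation on a single image tensor.
import torch

def split_permute(img, vertical=True):
    """img: (C, H, W) tensor; returns the image with its two pieces swapped."""
    _, h, w = img.shape
    if vertical:
        cut = torch.randint(w // 4, 3 * w // 4, (1,)).item()
        return torch.cat([img[..., cut:], img[..., :cut]], dim=-1)   # swap left/right pieces
    cut = torch.randint(h // 4, 3 * h // 4, (1,)).item()
    return torch.cat([img[:, cut:, :], img[:, :cut, :]], dim=1)      # swap top/bottom pieces

aug = split_permute(torch.rand(3, 192, 640))
```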
+
+ comment: 8 pages, 6 figures +
+
+
+
+
+ + ☆ Multi3DRefer: Grounding Text Description to Multiple 3D Objects ICCV 2023 + + +
+ We introduce the task of localizing a flexible number of objects in +real-world 3D scenes using natural language descriptions. Existing 3D visual +grounding tasks focus on localizing a unique object given a text description. +However, such a strict setting is unnatural as localizing potentially multiple +objects is a common need in real-world scenarios and robotic tasks (e.g., +visual navigation and object rearrangement). To address this setting we propose +Multi3DRefer, generalizing the ScanRefer dataset and task. Our dataset contains +61926 descriptions of 11609 objects, where zero, single or multiple target +objects are referenced by each description. We also introduce a new evaluation +metric and benchmark methods from prior work to enable further investigation of +multi-modal 3D scene understanding. Furthermore, we develop a better baseline +leveraging 2D features from CLIP by rendering object proposals online with +contrastive learning, which outperforms the state of the art on the ScanRefer +benchmark. + +
+
+ comment: ICCV 2023 +
+
+
+
+
+ + ☆ HAT: Hybrid Attention Transformer for Image Restoration + + +
+ Transformer-based methods have shown impressive performance in image +restoration tasks, such as image super-resolution and denoising. However, we +find that these networks can only utilize a limited spatial range of input +information through attribution analysis. This implies that the potential of +Transformer is still not fully exploited in existing networks. In order to +activate more input pixels for better restoration, we propose a new Hybrid +Attention Transformer (HAT). It combines both channel attention and +window-based self-attention schemes, thus making use of their complementary +advantages. Moreover, to better aggregate the cross-window information, we +introduce an overlapping cross-attention module to enhance the interaction +between neighboring window features. In the training stage, we additionally +adopt a same-task pre-training strategy to further exploit the potential of the +model for further improvement. Extensive experiments have demonstrated the +effectiveness of the proposed modules. We further scale up the model to show +that the performance of the SR task can be greatly improved. Besides, we extend +HAT to more image restoration applications, including real-world image +super-resolution, Gaussian image denoising and image compression artifacts +reduction. Experiments on benchmark and real-world datasets demonstrate that +our HAT achieves state-of-the-art performance both quantitatively and +qualitatively. Codes and models are publicly available at +https://github.com/XPixelGroup/HAT. + +
+
+ comment: Extended version of HAT +
+
+
+
+
+ + ☆ SparseSwin: Swin Transformer with Sparse Transformer Block + + +
+ Advancements in computer vision research have put transformer architecture as the state of the art in computer vision tasks. One of the known drawbacks of the transformer architecture is the high number of parameters, which can lead to a more complex and inefficient algorithm. This paper aims to reduce the number of parameters and, in turn, make the transformer more efficient. We present the Sparse Transformer (SparTa) Block, a modified transformer block with the addition of a sparse token converter that reduces the number of tokens used. We use the SparTa Block inside the Swin T architecture (SparseSwin) to leverage Swin's capability to downsample its input and reduce the number of initial tokens to be calculated. The proposed SparseSwin model outperforms other state-of-the-art models in image classification with an accuracy of 86.96%, 97.43%, and 85.35% on the ImageNet100, CIFAR10, and CIFAR100 datasets respectively. Despite using fewer parameters, the result highlights the potential of a transformer architecture using a sparse token converter with a limited number of tokens to optimize the use of the transformer and improve its performance.
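One common way to realise a token-reducing converter is to let a small set of learnable latent tokens cross-attend to the full sequence, so downstream blocks only operate on the latents. The sketch below shows that generic pattern; the SparTa block itself may be parametrised differently, and the token count and dimensions are assumptions.

```python
# Hypothetical sparse token converter via latent cross-attention.
import torch
import torch.nn as nn

class SparseTokenConverter(nn.Module):
    def __init__(self, dim, n_sparse_tokens=49, n_heads=4):
        super().__init__()
        self.latents = nn.Parameter(torch.randn(1, n_sparse_tokens, dim) * 0.02)
        self.attn = nn.MultiheadAttention(dim, n_heads, batch_first=True)

    def forward(self, tokens):                          # tokens: (B, N, dim)
        q = self.latents.expand(tokens.size(0), -1, -1)
        sparse, _ = self.attn(q, tokens, tokens)        # (B, n_sparse_tokens, dim)
        return sparse

x = torch.randn(2, 3136, 96)                            # e.g. a 56x56 token grid
print(SparseTokenConverter(96)(x).shape)                # torch.Size([2, 49, 96])
```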
+
+
+
+
+ + ☆ Angle Range and Identity Similarity Enhanced Gaze and Head Redirection + based on Synthetic data + + +
+ In this paper, we propose a method for improving the angular accuracy and photo-reality of gaze and head redirection in full-face images. The problem with current models is that they cannot handle redirection at large angles, and this limitation mainly comes from the lack of training data. To resolve this problem, we perform data augmentation via monocular 3D face reconstruction to extend the head pose and gaze range of the real data, which allows the model to handle a wider redirection range. In addition to the main focus on data augmentation, we also propose a framework with better image quality and identity preservation of unseen subjects even when training with synthetic data. Experiments show that our method significantly improves redirection performance in terms of redirection angular accuracy while maintaining high image quality, especially when redirecting to large angles.
+
+
+
+
+ + ☆ Phase-Specific Augmented Reality Guidance for Microscopic Cataract + Surgery Using Long-Short Spatiotemporal Aggregation Transformer + + +
+ Phacoemulsification cataract surgery (PCS) is a routine procedure conducted using a surgical microscope, heavily reliant on the skill of the ophthalmologist. While existing PCS guidance systems extract valuable information from surgical microscopic videos to enhance intraoperative proficiency, they suffer from non-phase-specific guidance, leading to redundant visual information. In this study, our major contribution is the development of a novel phase-specific augmented reality (AR) guidance system, which offers tailored AR information corresponding to the recognized surgical phase. Leveraging the inherent quasi-standardized nature of PCS procedures, we propose a two-stage surgical microscopic video recognition network. In the first stage, we implement a multi-task learning structure to segment the surgical limbus region and extract limbus region-focused spatial features for each frame. In the second stage, we propose the long-short spatiotemporal aggregation transformer (LS-SAT) network to model local fine-grained and global temporal relationships, and combine the extracted spatial features to recognize the current surgical phase. Additionally, we collaborate closely with ophthalmologists to design AR visual cues by utilizing techniques such as limbus ellipse fitting and regional restricted normal cross-correlation rotation computation. We evaluated the network on publicly available and in-house datasets, with comparison results demonstrating its superior performance compared to related works. Ablation results further validated the effectiveness of the limbus region-focused spatial feature extractor and the combination of temporal features. Furthermore, the developed system was evaluated in a clinical setup, with results indicating remarkable accuracy and real-time performance, underscoring its potential for clinical applications.
+
+
+
+
+ + ☆ Learning Sequential Acquisition Policies for Robot-Assisted Feeding + + +
+ A robot providing mealtime assistance must perform specialized maneuvers with +various utensils in order to pick up and feed a range of food items. Beyond +these dexterous low-level skills, an assistive robot must also plan these +strategies in sequence over a long horizon to clear a plate and complete a +meal. Previous methods in robot-assisted feeding introduce highly specialized +primitives for food handling without a means to compose them together. +Meanwhile, existing approaches to long-horizon manipulation lack the +flexibility to embed highly specialized primitives into their frameworks. We +propose Visual Action Planning OveR Sequences (VAPORS), a framework for +long-horizon food acquisition. VAPORS learns a policy for high-level action +selection by leveraging learned latent plate dynamics in simulation. To carry +out sequential plans in the real world, VAPORS delegates action execution to +visually parameterized primitives. We validate our approach on complex +real-world acquisition trials involving noodle acquisition and bimanual +scooping of jelly beans. Across 38 plates, VAPORS acquires much more +efficiently than baselines, generalizes across realistic plate variations such +as toppings and sauces, and qualitatively appeals to user feeding preferences +in a survey conducted across 49 individuals. Code, datasets, videos, and +supplementary materials can be found on our website: +https://sites.google.com/view/vaporsbot. + +
+
+
+
+
+ + ☆ Towards Viewpoint Robustness in Bird's Eye View Segmentation ICCV 2023 + + +
+ Autonomous vehicles (AV) require that neural networks used for perception be +robust to different viewpoints if they are to be deployed across many types of +vehicles without the repeated cost of data collection and labeling for each. AV +companies typically focus on collecting data from diverse scenarios and +locations, but not camera rig configurations, due to cost. As a result, only a +small number of rig variations exist across most fleets. In this paper, we +study how AV perception models are affected by changes in camera viewpoint and +propose a way to scale them across vehicle types without repeated data +collection and labeling. Using bird's eye view (BEV) segmentation as a +motivating task, we find through extensive experiments that existing perception +models are surprisingly sensitive to changes in camera viewpoint. When trained +with data from one camera rig, small changes to pitch, yaw, depth, or height of +the camera at inference time lead to large drops in performance. We introduce a +technique for novel view synthesis and use it to transform collected data to +the viewpoint of target rigs, allowing us to train BEV segmentation models for +diverse target rigs without any additional data collection or labeling cost. To +analyze the impact of viewpoint changes, we leverage synthetic data to mitigate +other gaps (content, ISP, etc). Our approach is then trained on real data and +evaluated on synthetic data, enabling evaluation on diverse target rigs. We +release all data for use in future work. Our method is able to recover an +average of 14.7% of the IoU that is otherwise lost when deploying to new rigs. + +
+
+ comment: ICCV 2023. Project Page: + https://nvlabs.github.io/viewpoint-robustness +
+
+
+
+
+ + ☆ HiLM-D: Towards High-Resolution Understanding in Multimodal Large + Language Models for Autonomous Driving + + +
+ Autonomous driving systems generally employ separate models for different tasks, resulting in intricate designs. For the first time, we leverage singular multimodal large language models (MLLMs) to consolidate multiple autonomous driving tasks from videos, i.e., the Risk Object Localization and Intention and Suggestion Prediction (ROLISP) task. ROLISP uses natural language to simultaneously identify and interpret risk objects, understand ego-vehicle intentions, and provide motion suggestions, eliminating the necessity for task-specific architectures. However, lacking high-resolution (HR) information, existing MLLMs often miss small objects (e.g., traffic cones) and overly focus on salient ones (e.g., large trucks) when applied to ROLISP. We propose HiLM-D (Towards High-Resolution Understanding in MLLMs for Autonomous Driving), an efficient method to incorporate HR information into MLLMs for the ROLISP task. Specifically, HiLM-D integrates two branches: (i) the low-resolution reasoning branch, which can be any MLLM, processes low-resolution videos to caption risk objects and discern ego-vehicle intentions/suggestions; (ii) the high-resolution perception branch (HR-PB), specific to HiLM-D, ingests HR images to enhance detection by capturing vision-specific HR feature maps and prioritizing all potential risks over merely salient objects. Our HR-PB serves as a plug-and-play module, seamlessly fitting into current MLLMs. Experiments on the ROLISP benchmark reveal HiLM-D's notable advantage over leading MLLMs, with improvements of 4.8% in BLEU-4 for captioning and 17.2% in mIoU for detection.
+
+
+
+
+ + ☆ Our Deep CNN Face Matchers Have Developed Achromatopsia + + +
+ Modern deep CNN face matchers are trained on datasets containing color +images. We show that such matchers achieve essentially the same accuracy on the +grayscale or the color version of a set of test images. We then consider +possible causes for deep CNN face matchers ``not seeing color''. Popular +web-scraped face datasets actually have 30 to 60\% of their identities with one +or more grayscale images. We analyze whether this grayscale element in the +training set impacts the accuracy achieved, and conclude that it does not. +Further, we show that even with a 100\% grayscale training set, comparable +accuracy is achieved on color or grayscale test images. Then we show that the +skin region of an individual's images in a web-scraped training set exhibit +significant variation in their mapping to color space. This suggests that +color, at least for web-scraped, in-the-wild face datasets, carries limited +identity-related information for training state-of-the-art matchers. Finally, +we verify that comparable accuracy is achieved from training using +single-channel grayscale images, implying that a larger dataset can be used +within the same memory limit, with a less computationally intensive early +layer. + +
+
+
+
+
+ + ☆ DePT: Decomposed Prompt Tuning for Parameter-Efficient Fine-tuning + + +
+ Prompt tuning (PT), where a small number of trainable soft (continuous) prompt vectors is affixed to the input of language models (LM), has shown promising results across various tasks and models for parameter-efficient fine-tuning (PEFT). PT stands out from other PEFT approaches because it maintains competitive performance with fewer trainable parameters and does not drastically scale up its parameters as the model size expands. However, PT introduces additional soft prompt tokens, leading to longer input sequences, which significantly impacts training and inference time and memory usage due to the Transformer's quadratic complexity. This is particularly concerning for Large Language Models (LLMs) that face heavy daily querying. To address this issue, we propose Decomposed Prompt Tuning (DePT), which decomposes the soft prompt into a shorter soft prompt and a pair of low-rank matrices that are then optimised with two different learning rates. This allows DePT to achieve better performance while saving over 20% memory and time costs compared to vanilla PT and its variants, without changing trainable parameter sizes. Through extensive experiments on 23 natural language processing (NLP) and vision-language (VL) tasks, we demonstrate that DePT outperforms state-of-the-art PEFT approaches, including the full fine-tuning baseline in some scenarios. Additionally, we empirically show that DePT grows more efficient as the model size increases. Our further study reveals that DePT integrates seamlessly with parameter-efficient transfer learning in the few-shot learning setting and highlights its adaptability to various model architectures and sizes.
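A hedged sketch of the decomposition idea: a shorter soft prompt is prepended to the input, while a low-rank pair A @ B adds an update to the frozen input embeddings, and the prompt and the low-rank pair get separate learning rates. Dimensions, ranks, and learning rates below are illustrative assumptions, not values from the paper.

```python
# Hypothetical decomposed prompt: short soft prompt + low-rank embedding update.
import torch
import torch.nn as nn

class DecomposedPrompt(nn.Module):
    def __init__(self, d_model=768, prompt_len=40, rank=8, max_seq_len=256):
        super().__init__()
        self.prompt = nn.Parameter(torch.randn(prompt_len, d_model) * 0.02)
        self.A = nn.Parameter(torch.randn(max_seq_len, rank) * 0.02)   # low-rank pair
        self.B = nn.Parameter(torch.zeros(rank, d_model))

    def forward(self, input_embeds):                    # (B, L, d_model) frozen embeddings
        L = input_embeds.size(1)
        updated = input_embeds + self.A[:L] @ self.B    # low-rank additive update
        prompt = self.prompt.unsqueeze(0).expand(input_embeds.size(0), -1, -1)
        return torch.cat([prompt, updated], dim=1)      # shorter prompt + updated embeddings

mod = DecomposedPrompt()
optim = torch.optim.AdamW([{"params": [mod.prompt], "lr": 3e-1},
                           {"params": [mod.A, mod.B], "lr": 1e-4}])    # two learning rates
out = mod(torch.randn(4, 128, 768))
```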
+
+ comment: Code is available at https://github.com/ZhengxiangShi/DePT +
+
+
+
+
+ + ☆ Radiomics Boosts Deep Learning Model for IPMN Classification MICCAI + + +
+ Intraductal Papillary Mucinous Neoplasm (IPMN) cysts are pre-malignant +pancreas lesions, and they can progress into pancreatic cancer. Therefore, +detecting and stratifying their risk level is of ultimate importance for +effective treatment planning and disease control. However, this is a highly +challenging task because of the diverse and irregular shape, texture, and size +of the IPMN cysts as well as the pancreas. In this study, we propose a novel +computer-aided diagnosis pipeline for IPMN risk classification from +multi-contrast MRI scans. Our proposed analysis framework includes an efficient +volumetric self-adapting segmentation strategy for pancreas delineation, +followed by a newly designed deep learning-based classification scheme with a +radiomics-based predictive approach. We test our proposed decision-fusion model +in multi-center data sets of 246 multi-contrast MRI scans and obtain superior +performance to the state of the art (SOTA) in this field. Our ablation studies +demonstrate the significance of both radiomics and deep learning modules for +achieving the new SOTA performance compared to international guidelines and +published studies (81.9\% vs 61.3\% in accuracy). Our findings have important +implications for clinical decision-making. In a series of rigorous experiments +on multi-center data sets (246 MRI scans from five centers), we achieved +unprecedented performance (81.9\% accuracy). + +
+
+ comment: 10 pages, MICCAI MLMI 2023 +
+
+
+
+
+ + ☆ Self-Correlation and Cross-Correlation Learning for Few-Shot Remote + Sensing Image Semantic Segmentation + + +
+ Remote sensing image semantic segmentation is an important problem for remote sensing image interpretation. Although remarkable progress has been achieved, existing deep neural network methods suffer from the reliance on massive training data. Few-shot remote sensing semantic segmentation aims at learning to segment target objects from a query image using only a few annotated support images of the target class. Most existing few-shot learning methods suffer from their sole focus on extracting information from support images, thereby failing to effectively address the large variance in the appearance and scale of geographic objects. To tackle these challenges, we propose a Self-Correlation and Cross-Correlation Learning Network for few-shot remote sensing image semantic segmentation. Our model enhances the generalization by considering both self-correlation and cross-correlation between support and query images to make segmentation predictions. To further explore the self-correlation with the query image, we propose to adopt a classical spectral method to produce a class-agnostic segmentation mask based on the basic visual information of the image. Extensive experiments on two remote sensing image datasets demonstrate the effectiveness and superiority of our model in few-shot remote sensing image semantic segmentation. Code and models will be available at https://github.com/linhanwang/SCCNe.
+
+ comment: 10 pages, 6 figures. Accepted to Sigspatial 2023. arXiv admin note: + text overlap with arXiv:2104.01538 by other authors +
+
+
+
+
+ + ☆ SCD-Net: Spatiotemporal Clues Disentanglement Network for + Self-supervised Skeleton-based Action Recognition + + +
+ Contrastive learning has achieved great success in skeleton-based action +recognition. However, most existing approaches encode the skeleton sequences as +entangled spatiotemporal representations and confine the contrasts to the same +level of representation. Instead, this paper introduces a novel contrastive +learning framework, namely Spatiotemporal Clues Disentanglement Network +(SCD-Net). Specifically, we integrate the decoupling module with a feature +extractor to derive explicit clues from spatial and temporal domains +respectively. As for the training of SCD-Net, with a constructed global anchor, +we encourage the interaction between the anchor and extracted clues. Further, +we propose a new masking strategy with structural constraints to strengthen the +contextual associations, leveraging the latest development from masked image +modelling into the proposed SCD-Net. We conduct extensive evaluations on the +NTU-RGB+D (60&120) and PKU-MMD (I&II) datasets, covering various downstream +tasks such as action recognition, action retrieval, transfer learning, and +semi-supervised learning. The experimental results demonstrate the +effectiveness of our method, which outperforms the existing state-of-the-art +(SOTA) approaches significantly. + +
+
+
+
+
+ + ☆ Instance-Agnostic Geometry and Contact Dynamics Learning + + +
+ This work presents an instance-agnostic learning framework that fuses vision +with dynamics to simultaneously learn shape, pose trajectories and physical +properties via the use of geometry as a shared representation. Unlike many +contact learning approaches that assume motion capture input and a known shape +prior for the collision model, our proposed framework learns an object's +geometric and dynamic properties from RGBD video, without requiring either +category-level or instance-level shape priors. We integrate a vision system, +BundleSDF, with a dynamics system, ContactNets and propose a cyclic training +pipeline to use the output from the dynamics module to refine the poses and the +geometry from the vision module, using perspective reprojection. Experiments +demonstrate our framework's ability to learn the geometry and dynamics of rigid +and convex objects and improve upon the current tracking framework. + +
+
+
+
+
+ + ☆ Mobile Vision Transformer-based Visual Object Tracking BMVC2023 + + +
+ The introduction of robust backbones, such as Vision Transformers, has +improved the performance of object tracking algorithms in recent years. +However, these state-of-the-art trackers are computationally expensive since +they have a large number of model parameters and rely on specialized hardware +(e.g., GPU) for faster inference. On the other hand, recent lightweight +trackers are fast but are less accurate, especially on large-scale datasets. We +propose a lightweight, accurate, and fast tracking algorithm using Mobile +Vision Transformers (MobileViT) as the backbone for the first time. We also +present a novel approach of fusing the template and search region +representations in the MobileViT backbone, thereby generating superior feature +encoding for target localization. The experimental results show that our +MobileViT-based Tracker, MVT, surpasses the performance of recent lightweight +trackers on the large-scale datasets GOT10k and TrackingNet, and with a high +inference speed. In addition, our method outperforms the popular DiMP-50 +tracker despite having 4.7 times fewer model parameters and running at 2.8 +times its speed on a GPU. The tracker code and models are available at +https://github.com/goutamyg/MVT + +
+
+ comment: Accepted by BMVC2023. Code available at + https://github.com/goutamyg/MVT +
+
+
+
+
+ + ☆ KD-FixMatch: Knowledge Distillation Siamese Neural Networks ICIP 2023 + + +
+ Semi-supervised learning (SSL) has become a crucial approach in deep learning +as a way to address the challenge of limited labeled data. The success of deep +neural networks heavily relies on the availability of large-scale high-quality +labeled data. However, the process of data labeling is time-consuming and +unscalable, leading to shortages in labeled data. SSL aims to tackle this +problem by leveraging additional unlabeled data in the training process. One of +the popular SSL algorithms, FixMatch, trains identical weight-sharing teacher +and student networks simultaneously using a siamese neural network (SNN). +However, it is prone to performance degradation when the pseudo labels are +heavily noisy in the early training stage. We present KD-FixMatch, a novel SSL +algorithm that addresses the limitations of FixMatch by incorporating knowledge +distillation. The algorithm utilizes a combination of sequential and +simultaneous training of SNNs to enhance performance and reduce performance +degradation. Firstly, an outer SNN is trained using labeled and unlabeled data. +After that, the network of the well-trained outer SNN generates pseudo labels +for the unlabeled data, from which a subset of unlabeled data with trusted +pseudo labels is then carefully created through high-confidence sampling and +deep embedding clustering. Finally, an inner SNN is trained with the labeled +data, the unlabeled data, and the subset of unlabeled data with trusted pseudo +labels. Experiments on four public data sets demonstrate that KD-FixMatch +outperforms FixMatch in all cases. Our results indicate that KD-FixMatch has a +better training starting point that leads to improved model performance +compared to FixMatch. + +
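The trusted-pseudo-label selection described above can be illustrated with a simple confidence filter (the paper additionally uses deep embedding clustering, which is omitted in this hypothetical sketch):

```python
import numpy as np

def select_trusted(probs, threshold=0.95):
    """Hypothetical high-confidence sampling: keep unlabeled samples whose
    maximum predicted probability exceeds a threshold, with their pseudo labels."""
    probs = np.asarray(probs, dtype=float)
    confidence = probs.max(axis=1)
    pseudo_labels = probs.argmax(axis=1)
    keep = confidence >= threshold
    return np.flatnonzero(keep), pseudo_labels[keep]

idx, labels = select_trusted([[0.97, 0.03], [0.60, 0.40], [0.02, 0.98]])
print(idx, labels)   # only the confidently predicted samples are retained
```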
+
+ comment: 5 pages, 1 figure, 5 tables. To be published in ICIP 2023 +
+
+
+
+
+ + ☆ Rice Plant Disease Detection and Diagnosis using Deep Convolutional + Neural Networks and Multispectral Imaging + + +
+ Rice is considered a strategic crop in Egypt as it is regularly consumed in the Egyptian people's diet. Even though Egypt is the largest rice producer in Africa, with a share of 6 million tons per year, it still imports rice to satisfy its local needs due to production losses, especially those caused by rice diseases. Rice blast disease is responsible for a 30% loss in rice production worldwide. Therefore, it is crucial to limit yield damage by detecting rice crop diseases in their early stages. This paper introduces a public multispectral and RGB image dataset and a deep learning pipeline for rice plant disease detection using multi-modal data. The collected multispectral images consist of Red, Green and Near-Infrared channels, and we show that using the multispectral channels along with the RGB channels as input achieves a higher F1 score than using the RGB input alone. + 
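A hedged sketch of the multi-modal input idea: concatenating the RGB image with the Red/Green/Near-Infrared channels and widening the first convolution of a standard backbone. The backbone, channel ordering, and class count here are assumptions, not the paper's pipeline.

```python
import torch
import torch.nn as nn
from torchvision.models import resnet18

# Fuse RGB (3 ch) and multispectral R/G/NIR (3 ch) at the input by channel
# concatenation, then widen the first convolution to accept 6 channels.
model = resnet18(weights=None)
model.conv1 = nn.Conv2d(6, 64, kernel_size=7, stride=2, padding=3, bias=False)
model.fc = nn.Linear(model.fc.in_features, 2)   # e.g. healthy vs. diseased (assumed)

rgb = torch.rand(4, 3, 224, 224)
msi = torch.rand(4, 3, 224, 224)                # Red, Green, Near-Infrared
logits = model(torch.cat([rgb, msi], dim=1))
print(logits.shape)                             # torch.Size([4, 2])
```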
+
+
+
+
+ + ☆ SHIFT3D: Synthesizing Hard Inputs For Tricking 3D Detectors ICCV 2023 + + +
+ We present SHIFT3D, a differentiable pipeline for generating 3D shapes that are structurally plausible yet challenging to 3D object detectors. In safety-critical applications like autonomous driving, discovering such novel challenging objects can offer insight into unknown vulnerabilities of 3D detectors. By representing objects with a signed distance function (SDF), we show that gradient error signals allow us to smoothly deform the shape or pose of a 3D object in order to confuse a downstream 3D detector. Importantly, the objects generated by SHIFT3D physically differ from the baseline object yet retain a semantically recognizable shape. Our approach provides interpretable failure modes for modern 3D object detectors, and can aid in preemptive discovery of potential safety risks within 3D perception systems before these risks become critical failures. + 
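Conceptually, the attack uses gradients flowing from a differentiable detector to nudge an object's latent shape or pose parameters so that the detector's confidence drops. A toy sketch under that assumption, with a stand-in scoring function:

```python
import torch

def shift_latent(latent, detector_score_fn, steps=10, lr=0.01):
    """Hedged sketch of gradient-based shape perturbation: lower a
    differentiable detector's confidence by adjusting a latent shape code.
    `detector_score_fn` maps a latent code to the detector's confidence."""
    z = latent.clone().detach().requires_grad_(True)
    opt = torch.optim.Adam([z], lr=lr)
    for _ in range(steps):
        opt.zero_grad()
        score = detector_score_fn(z)   # confidence of the downstream detector
        score.backward()               # gradient descent minimises confidence
        opt.step()
    return z.detach()

# Toy stand-in for a differentiable detector head
z0 = torch.randn(64)
adv_latent = shift_latent(z0, lambda z: torch.sigmoid(z.sum()))
```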
+
+ comment: Accepted by ICCV 2023 +
+
+
+
+
+ + ☆ Divergences in Color Perception between Deep Neural Networks and Humans + + +
+ Deep neural networks (DNNs) are increasingly proposed as models of human +vision, bolstered by their impressive performance on image classification and +object recognition tasks. Yet, the extent to which DNNs capture fundamental +aspects of human vision such as color perception remains unclear. Here, we +develop novel experiments for evaluating the perceptual coherence of color +embeddings in DNNs, and we assess how well these algorithms predict human color +similarity judgments collected via an online survey. We find that +state-of-the-art DNN architectures $-$ including convolutional neural networks +and vision transformers $-$ provide color similarity judgments that strikingly +diverge from human color judgments of (i) images with controlled color +properties, (ii) images generated from online searches, and (iii) real-world +images from the canonical CIFAR-10 dataset. We compare DNN performance against +an interpretable and cognitively plausible model of color perception based on +wavelet decomposition, inspired by foundational theories in computational +neuroscience. While one deep learning model $-$ a convolutional DNN trained on +a style transfer task $-$ captures some aspects of human color perception, our +wavelet algorithm provides more coherent color embeddings that better predict +human color judgments compared to all DNNs we examine. These results hold when +altering the high-level visual task used to train similar DNN architectures +(e.g., image classification versus image segmentation), as well as when +examining the color embeddings of different layers in a given DNN architecture. +These findings break new ground in the effort to analyze the perceptual +representations of machine learning algorithms and to improve their ability to +serve as cognitively plausible models of human vision. Implications for machine +learning, human perception, and embodied cognition are discussed. + +
+
+ comment: 22 pages, 8 figures + SI Appendix; to appear in Cognition +
+
+
+
+
+ + ☆ PhotoVerse: Tuning-Free Image Customization with Text-to-Image Diffusion + Models + + +
+ Personalized text-to-image generation has emerged as a powerful and +sought-after tool, empowering users to create customized images based on their +specific concepts and prompts. However, existing approaches to personalization +encounter multiple challenges, including long tuning times, large storage +requirements, the necessity for multiple input images per identity, and +limitations in preserving identity and editability. To address these obstacles, +we present PhotoVerse, an innovative methodology that incorporates a +dual-branch conditioning mechanism in both text and image domains, providing +effective control over the image generation process. Furthermore, we introduce +facial identity loss as a novel component to enhance the preservation of +identity during training. Remarkably, our proposed PhotoVerse eliminates the +need for test time tuning and relies solely on a single facial photo of the +target identity, significantly reducing the resource cost associated with image +generation. After a single training phase, our approach enables generating +high-quality images within only a few seconds. Moreover, our method can produce +diverse images that encompass various scenes and styles. The extensive +evaluation demonstrates the superior performance of our approach, which +achieves the dual objectives of preserving identity and facilitating +editability. Project page: https://photoverse2d.github.io/ + +
+
+
+
+
+ + ☆ Blendshapes GHUM: Real-time Monocular Facial Blendshape Prediction + + +
+ We present Blendshapes GHUM, an on-device ML pipeline that predicts 52 facial +blendshape coefficients at 30+ FPS on modern mobile phones, from a single +monocular RGB image and enables facial motion capture applications like virtual +avatars. Our main contributions are: i) an annotation-free offline method for +obtaining blendshape coefficients from real-world human scans, ii) a +lightweight real-time model that predicts blendshape coefficients based on +facial landmarks. + +
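The second contribution can be pictured as a small regressor from detected 2D landmarks to 52 coefficients in [0, 1]. The sketch below assumes 478 landmarks (a common face-mesh size) and a simple MLP; the real model's architecture and landmark count may differ.

```python
import torch
import torch.nn as nn

class BlendshapeRegressor(nn.Module):
    """Illustrative lightweight regressor: 2D facial landmarks -> 52 blendshapes."""
    def __init__(self, n_landmarks=478, n_coeffs=52):
        super().__init__()
        self.net = nn.Sequential(
            nn.Linear(n_landmarks * 2, 256), nn.ReLU(),
            nn.Linear(256, 256), nn.ReLU(),
            nn.Linear(256, n_coeffs), nn.Sigmoid(),   # coefficients in [0, 1]
        )

    def forward(self, landmarks):                     # (batch, n_landmarks, 2)
        return self.net(landmarks.flatten(1))

coeffs = BlendshapeRegressor()(torch.rand(1, 478, 2))
print(coeffs.shape)                                   # torch.Size([1, 52])
```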
+
+ comment: 4 pages, 3 figures +
+
+
+
+
+ + ☆ LUNet: Deep Learning for the Segmentation of Arterioles and Venules in + High Resolution Fundus Images + + +
+ The retina is the only part of the human body in which blood vessels can be +accessed non-invasively using imaging techniques such as digital fundus images +(DFI). The spatial distribution of the retinal microvasculature may change with +cardiovascular diseases and thus the eyes may be regarded as a window to our +hearts. Computerized segmentation of the retinal arterioles and venules (A/V) +is essential for automated microvasculature analysis. Using active learning, we +created a new DFI dataset containing 240 crowd-sourced manual A/V segmentations +performed by fifteen medical students and reviewed by an ophthalmologist, and +developed LUNet, a novel deep learning architecture for high resolution A/V +segmentation. LUNet architecture includes a double dilated convolutional block +that aims to enhance the receptive field of the model and reduce its parameter +count. Furthermore, LUNet has a long tail that operates at high resolution to +refine the segmentation. The custom loss function emphasizes the continuity of +the blood vessels. LUNet is shown to significantly outperform two +state-of-the-art segmentation algorithms on the local test set as well as on +four external test sets simulating distribution shifts across ethnicity, +comorbidities, and annotators. We make the newly created dataset open access +(upon publication). + +
+
+
+
+
+ + ☆ TransferDoc: A Self-Supervised Transferable Document Representation + Learning Model Unifying Vision and Language + + +
+ The field of visual document understanding has witnessed a rapid growth in +emerging challenges and powerful multi-modal strategies. However, they rely on +an extensive amount of document data to learn their pretext objectives in a +``pre-train-then-fine-tune'' paradigm and thus, suffer a significant +performance drop in real-world online industrial settings. One major reason is +the over-reliance on OCR engines to extract local positional information within +a document page. Therefore, this hinders the model's generalizability, +flexibility and robustness due to the lack of capturing global information +within a document image. We introduce TransferDoc, a cross-modal +transformer-based architecture pre-trained in a self-supervised fashion using +three novel pretext objectives. TransferDoc learns richer semantic concepts by +unifying language and visual representations, which enables the production of +more transferable models. Besides, two novel downstream tasks have been +introduced for a ``closer-to-real'' industrial evaluation scenario where +TransferDoc outperforms other state-of-the-art approaches. + +
+
+ comment: Preprint to Pattern Recognition +
+
+
+
+
+ + ☆ Evaluating the Reliability of CNN Models on Classifying Traffic and Road + Signs using LIME + + +
+ The objective of this investigation is to evaluate and contrast the effectiveness of four state-of-the-art pre-trained models, ResNet-34, VGG-19, DenseNet-121, and Inception V3, in classifying traffic and road signs using the GTSRB public dataset. The study focuses on evaluating the accuracy of these models' predictions as well as their ability to employ appropriate features for image categorization. To gain insights into the strengths and limitations of the models' predictions, the study employs the local interpretable model-agnostic explanations (LIME) framework. The findings of this experiment indicate that LIME is a crucial tool for improving the interpretability and dependability of machine learning models for image identification, even though the models achieve an F1 score of 0.99 on classifying traffic and road signs. The conclusions of this study have important ramifications for how these models are used in practice, as it is crucial to ensure that model predictions are founded on the pertinent image features. + 
+
+ comment: Accepted for publication in the 2nd International Conference on Big + Data, IoT and Machine Learning (BIM 2023), 16 pages, 8 figures +
+
+
+
+
+ + ♻ ☆ Tell me what you see: A zero-shot action recognition method based on + natural language descriptions + + +
+ This paper presents a novel approach to Zero-Shot Action Recognition. Recent +works have explored the detection and classification of objects to obtain +semantic information from videos with remarkable performance. Inspired by them, +we propose using video captioning methods to extract semantic information about +objects, scenes, humans, and their relationships. To the best of our knowledge, +this is the first work to represent both videos and labels with descriptive +sentences. More specifically, we represent videos using sentences generated via +video captioning methods and classes using sentences extracted from documents +acquired through search engines on the Internet. Using these representations, +we build a shared semantic space employing BERT-based embedders pre-trained in +the paraphrasing task on multiple text datasets. The projection of both visual +and semantic information onto this space is straightforward, as they are +sentences, enabling classification using the nearest neighbor rule. We +demonstrate that representing videos and labels with sentences alleviates the +domain adaptation problem. Additionally, we show that word vectors are +unsuitable for building the semantic embedding space of our descriptions. Our +method outperforms the state-of-the-art performance on the UCF101 dataset by +3.3 p.p. in accuracy under the TruZe protocol and achieves competitive results +on both the UCF101 and HMDB51 datasets under the conventional protocol (0/50\% +- training/testing split). Our code is available at +https://github.com/valterlej/zsarcap. + +
+
+ comment: Published at Multimedia Tools and Applications +
+
+
+
+
+ + ♻ ☆ Distribution-Aligned Diffusion for Human Mesh Recovery ICCV 2023 + + +
+ Recovering a 3D human mesh from a single RGB image is a challenging task due +to depth ambiguity and self-occlusion, resulting in a high degree of +uncertainty. Meanwhile, diffusion models have recently seen much success in +generating high-quality outputs by progressively denoising noisy inputs. +Inspired by their capability, we explore a diffusion-based approach for human +mesh recovery, and propose a Human Mesh Diffusion (HMDiff) framework which +frames mesh recovery as a reverse diffusion process. We also propose a +Distribution Alignment Technique (DAT) that infuses prior distribution +information into the mesh distribution diffusion process, and provides useful +prior knowledge to facilitate the mesh recovery task. Our method achieves +state-of-the-art performance on three widely used datasets. Project page: +https://gongjia0208.github.io/HMDiff/. + +
+
+ comment: Accepted to ICCV 2023 +
+
+
+
+
+ + ♻ ☆ A soft nearest-neighbor framework for continual semi-supervised learning ICCV 2023 + + +
+ Despite significant advances, the performance of state-of-the-art continual +learning approaches hinges on the unrealistic scenario of fully labeled data. +In this paper, we tackle this challenge and propose an approach for continual +semi-supervised learning--a setting where not all the data samples are labeled. +A primary issue in this scenario is the model forgetting representations of +unlabeled data and overfitting the labeled samples. We leverage the power of +nearest-neighbor classifiers to nonlinearly partition the feature space and +flexibly model the underlying data distribution thanks to its non-parametric +nature. This enables the model to learn a strong representation for the current +task, and distill relevant information from previous tasks. We perform a +thorough experimental evaluation and show that our method outperforms all the +existing approaches by large margins, setting a solid state of the art on the +continual semi-supervised learning paradigm. For example, on CIFAR-100 we +surpass several others even when using at least 30 times less supervision (0.8% +vs. 25% of annotations). Finally, our method works well on both low and high +resolution images and scales seamlessly to more complex datasets such as +ImageNet-100. The code is publicly available on +https://github.com/kangzhiq/NNCSL + +
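The core classification rule can be sketched as a soft nearest-neighbour vote, where support features weight their class labels by a softmax over similarities to the query. This is an illustrative reconstruction under that assumption, not the authors' exact formulation.

```python
import torch
import torch.nn.functional as F

def soft_nn_predict(query, support, support_labels, num_classes, tau=0.1):
    """Soft nearest-neighbour classifier: each support feature votes for its
    class with a weight from a softmax over similarities to the query."""
    q = F.normalize(query, dim=-1)                 # (Q, D)
    s = F.normalize(support, dim=-1)               # (N, D)
    weights = F.softmax(q @ s.t() / tau, dim=-1)   # (Q, N) soft neighbourhoods
    onehot = F.one_hot(support_labels, num_classes).float()
    return weights @ onehot                        # class probabilities per query

probs = soft_nn_predict(torch.randn(5, 128), torch.randn(100, 128),
                        torch.randint(0, 10, (100,)), num_classes=10)
print(probs.shape)   # torch.Size([5, 10])
```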
+
+ comment: Accepted at ICCV 2023 +
+
+
+
+
+ + ♻ ☆ SynBody: Synthetic Dataset with Layered Human Models for 3D Human + Perception and Modeling ICCV 2023 + + +
+ Synthetic data has emerged as a promising source for 3D human research as it +offers low-cost access to large-scale human datasets. To advance the diversity +and annotation quality of human models, we introduce a new synthetic dataset, +SynBody, with three appealing features: 1) a clothed parametric human model +that can generate a diverse range of subjects; 2) the layered human +representation that naturally offers high-quality 3D annotations to support +multiple tasks; 3) a scalable system for producing realistic data to facilitate +real-world tasks. The dataset comprises 1.2M images with corresponding accurate +3D annotations, covering 10,000 human body models, 1,187 actions, and various +viewpoints. The dataset includes two subsets for human pose and shape +estimation as well as human neural rendering. Extensive experiments on SynBody +indicate that it substantially enhances both SMPL and SMPL-X estimation. +Furthermore, the incorporation of layered annotations offers a valuable +training resource for investigating the Human Neural Radiance Fields (NeRF). + +
+
+ comment: Accepted by ICCV 2023. Project webpage: https://synbody.github.io/ +
+
+
+
+
+ + ♻ ☆ Deep grading for MRI-based differential diagnosis of Alzheimer's disease + and Frontotemporal dementia + + +
+ Alzheimer's disease and Frontotemporal dementia are common forms of +neurodegenerative dementia. Behavioral alterations and cognitive impairments +are found in the clinical courses of both diseases and their differential +diagnosis is sometimes difficult for physicians. Therefore, an accurate tool +dedicated to this diagnostic challenge can be valuable in clinical practice. +However, current structural imaging methods mainly focus on the detection of +each disease but rarely on their differential diagnosis. In this paper, we +propose a deep learning based approach for both problems of disease detection +and differential diagnosis. We suggest utilizing two types of biomarkers for +this application: structure grading and structure atrophy. First, we propose to +train a large ensemble of 3D U-Nets to locally determine the anatomical +patterns of healthy people, patients with Alzheimer's disease and patients with +Frontotemporal dementia using structural MRI as input. The output of the +ensemble is a 2-channel disease's coordinate map able to be transformed into a +3D grading map which is easy to interpret for clinicians. This 2-channel map is +coupled with a multi-layer perceptron classifier for different classification +tasks. Second, we propose to combine our deep learning framework with a +traditional machine learning strategy based on volume to improve the model +discriminative capacity and robustness. After both cross-validation and +external validation, our experiments based on 3319 MRI demonstrated competitive +results of our method compared to the state-of-the-art methods for both disease +detection and differential diagnosis. + +
+
+
+
+
+ + ♻ ☆ Robust Feature-Level Adversaries are Interpretability Tools NeurIPS 2022 + + +
+ The literature on adversarial attacks in computer vision typically focuses on +pixel-level perturbations. These tend to be very difficult to interpret. Recent +work that manipulates the latent representations of image generators to create +"feature-level" adversarial perturbations gives us an opportunity to explore +perceptible, interpretable adversarial attacks. We make three contributions. +First, we observe that feature-level attacks provide useful classes of inputs +for studying representations in models. Second, we show that these adversaries +are uniquely versatile and highly robust. We demonstrate that they can be used +to produce targeted, universal, disguised, physically-realizable, and black-box +attacks at the ImageNet scale. Third, we show how these adversarial images can +be used as a practical interpretability tool for identifying bugs in networks. +We use these adversaries to make predictions about spurious associations +between features and classes which we then test by designing "copy/paste" +attacks in which one natural image is pasted into another to cause a targeted +misclassification. Our results suggest that feature-level attacks are a +promising approach for rigorous interpretability research. They support the +design of tools to better understand what a model has learned and diagnose +brittle feature associations. Code is available at +https://github.com/thestephencasper/feature_level_adv + +
+
+ comment: NeurIPS 2022, code available at + https://github.com/thestephencasper/feature_level_adv +
+
+
+
+
+ + ♻ ☆ Dynamic Y-KD: A Hybrid Approach to Continual Instance Segmentation + + +
+ Despite the success of deep learning models on instance segmentation, current +methods still suffer from catastrophic forgetting in continual learning +scenarios. In this paper, our contributions for continual instance segmentation +are threefold. First, we propose the Y-knowledge distillation (Y-KD), a +technique that shares a common feature extractor between the teacher and +student networks. As the teacher is also updated with new data in Y-KD, the +increased plasticity results in new modules that are specialized on new +classes. Second, our Y-KD approach is supported by a dynamic architecture +method that trains task-specific modules with a unique instance segmentation +head, thereby significantly reducing forgetting. Third, we complete our +approach by leveraging checkpoint averaging as a simple method to manually +balance the trade-off between performance on the various sets of classes, thus +increasing control over the model's behavior without any additional cost. These +contributions are united in our model that we name the Dynamic Y-KD network. + We perform extensive experiments on several single-step and multi-steps +incremental learning scenarios, and we show that our approach outperforms +previous methods both on past and new classes. For instance, compared to recent +work, our method obtains +2.1% mAP on old classes in 15-1, +7.6% mAP on new +classes in 19-1 and reaches 91.5% of the mAP obtained by joint-training on all +classes in 15-5. + +
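The third contribution, checkpoint averaging, is simple to sketch: parameters from several saved checkpoints are averaged element-wise. The file names below are placeholders, and buffers are cast to float for simplicity.

```python
import torch

def average_checkpoints(paths):
    """Element-wise mean of parameter tensors across saved state dicts,
    a simple way to balance performance between old and new classes."""
    avg = None
    for p in paths:
        state = torch.load(p, map_location="cpu")
        if avg is None:
            avg = {k: v.clone().float() for k, v in state.items()}
        else:
            for k in avg:
                avg[k] += state[k].float()
    return {k: v / len(paths) for k, v in avg.items()}

# avg_state = average_checkpoints(["ckpt_old_classes.pth", "ckpt_new_classes.pth"])
# model.load_state_dict(avg_state)
```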
+
+
+
+
+ + ♻ ☆ Class-Incremental Learning of Plant and Disease Detection: Growing + Branches with Knowledge Distillation + + +
+ This paper investigates the problem of class-incremental object detection for +agricultural applications where a model needs to learn new plant species and +diseases incrementally without forgetting the previously learned ones. We adapt +two public datasets to include new categories over time, simulating a more +realistic and dynamic scenario. We then compare three class-incremental +learning methods that leverage different forms of knowledge distillation to +mitigate catastrophic forgetting. Our experiments show that all three methods +suffer from catastrophic forgetting, but the Dynamic Y-KD approach, which +additionally uses a dynamic architecture that grows new branches to learn new +tasks, outperforms ILOD and Faster-ILOD in most settings both on new and old +classes. + These results highlight the challenges and opportunities of continual object +detection for agricultural applications. In particular, we hypothesize that the +large intra-class and small inter-class variability that is typical of plant +images exacerbate the difficulty of learning new categories without interfering +with previous knowledge. We publicly release our code to encourage future work. + +
+
+ comment: Accepted at CVPPA'23 +
+
+
+
+
+ + ♻ ☆ Multi-scale, Data-driven and Anatomically Constrained Deep Learning + Image Registration for Adult and Fetal Echocardiography + + +
+ Temporal echocardiography image registration is a basis for clinical +quantifications such as cardiac motion estimation, myocardial strain +assessments, and stroke volume quantifications. In past studies, deep learning +image registration (DLIR) has shown promising results and is consistently +accurate and precise, requiring less computational time. We propose that a +greater focus on the warped moving image's anatomic plausibility and image +quality can support robust DLIR performance. Further, past implementations have +focused on adult echocardiography, and there is an absence of DLIR +implementations for fetal echocardiography. We propose a framework that +combines three strategies for DLIR in both fetal and adult echo: (1) an +anatomic shape-encoded loss to preserve physiological myocardial and left +ventricular anatomical topologies in warped images; (2) a data-driven loss that +is trained adversarially to preserve good image texture features in warped +images; and (3) a multi-scale training scheme of a data-driven and anatomically +constrained algorithm to improve accuracy. Our tests show that good anatomical +topology and image textures are strongly linked to shape-encoded and +data-driven adversarial losses. They improve different aspects of registration +performance in a non-overlapping way, justifying their combination. Despite +fundamental distinctions between adult and fetal echo images, we show that +these strategies can provide excellent registration results in both adult and +fetal echocardiography using the publicly available CAMUS adult echo dataset +and our private multi-demographic fetal echo dataset. Our approach outperforms +traditional non-DL gold standard registration approaches, including Optical +Flow and Elastix. Registration improvements could be translated to more +accurate and precise clinical quantification of cardiac ejection fraction, +demonstrating a potential for translation. + +
+
+ comment: Our data-driven and anatomically constrained DLIR method's source + code will be publicly available at https://github.com/kamruleee51/DdC-AC-DLIR +
+
+
+
+
+ + ♻ ☆ Are Deep Neural Networks SMARTer than Second Graders? CVPR 2023 + + +
+ Recent times have witnessed an increasing number of applications of deep +neural networks towards solving tasks that require superior cognitive +abilities, e.g., playing Go, generating art, ChatGPT, etc. Such a dramatic +progress raises the question: how generalizable are neural networks in solving +problems that demand broad skills? To answer this question, we propose SMART: a +Simple Multimodal Algorithmic Reasoning Task and the associated SMART-101 +dataset, for evaluating the abstraction, deduction, and generalization +abilities of neural networks in solving visuo-linguistic puzzles designed +specifically for children in the 6--8 age group. Our dataset consists of 101 +unique puzzles; each puzzle comprises a picture and a question, and their +solution needs a mix of several elementary skills, including arithmetic, +algebra, and spatial reasoning, among others. To scale our dataset towards +training deep neural networks, we programmatically generate entirely new +instances for each puzzle, while retaining their solution algorithm. To +benchmark performances on SMART-101, we propose a vision and language +meta-learning model using varied state-of-the-art backbones. Our experiments +reveal that while powerful deep models offer reasonable performances on puzzles +in a supervised setting, they are not better than random accuracy when analyzed +for generalization. We also evaluate the recent ChatGPT and other large +language models on a subset of SMART-101 and find that while these models show +convincing reasoning abilities, the answers are often incorrect. + +
+
+ comment: Extended version of CVPR 2023 paper. For the SMART-101 dataset, see + http://smartdataset.github.io/smart101 +
+
+
+
+
+ + ♻ ☆ Blending-NeRF: Text-Driven Localized Editing in Neural Radiance Fields ICCV 2023 + + +
+ Text-driven localized editing of 3D objects is particularly difficult as +locally mixing the original 3D object with the intended new object and style +effects without distorting the object's form is not a straightforward process. +To address this issue, we propose a novel NeRF-based model, Blending-NeRF, +which consists of two NeRF networks: pretrained NeRF and editable NeRF. +Additionally, we introduce new blending operations that allow Blending-NeRF to +properly edit target regions which are localized by text. By using a pretrained +vision-language aligned model, CLIP, we guide Blending-NeRF to add new objects +with varying colors and densities, modify textures, and remove parts of the +original object. Our extensive experiments demonstrate that Blending-NeRF +produces naturally and locally edited 3D objects from various text prompts. Our +project page is available at https://seokhunchoi.github.io/Blending-NeRF/ + +
+
+ comment: Accepted to ICCV 2023. The first two authors contributed equally to + this work +
+
+
+
+
+ + ♻ ☆ Reliable Joint Segmentation of Retinal Edema Lesions in OCT Images + + +
+ Joint segmentation of retinal edema lesions from OCT images must contend with complicated pathological features, such as blurred boundaries, severe scale differences between symptoms, and background noise interference, while also producing results that are reliable. In this paper, we propose a novel reliable multi-scale wavelet-enhanced transformer network, which can provide accurate segmentation results with a reliability assessment. Specifically, to improve the model's ability to learn the complex pathological features of retinal edema lesions in OCT images, we develop a novel segmentation backbone that integrates a wavelet-enhanced feature extractor network with our newly designed multi-scale transformer module. Meanwhile, to make the segmentation results more reliable, a novel uncertainty segmentation head based on subjective logic evidential theory is introduced to generate the final segmentation results together with a corresponding overall uncertainty evaluation score map. We conduct comprehensive experiments on the public AI-Challenge 2018 database for retinal edema lesion segmentation, and the results show that our proposed method achieves better segmentation accuracy with a high degree of reliability compared to other state-of-the-art segmentation approaches. The code will be released at: https://github.com/LooKing9218/ReliableRESeg. + 
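An uncertainty head based on subjective logic can be sketched as follows: per-pixel non-negative evidence defines a Dirichlet distribution whose expected probabilities give the segmentation and whose total evidence gives an uncertainty map. This is the standard evidential deep learning formulation, assumed here rather than taken from the paper's code.

```python
import torch
import torch.nn.functional as F

def evidential_outputs(logits):
    """logits: (batch, num_classes, H, W) -> (segmentation probs, uncertainty map)."""
    evidence = F.softplus(logits)              # non-negative evidence per class
    alpha = evidence + 1.0                     # Dirichlet concentration parameters
    strength = alpha.sum(dim=1, keepdim=True)  # total evidence per pixel
    probs = alpha / strength                   # expected segmentation probabilities
    k = logits.shape[1]
    uncertainty = k / strength                 # subjective-logic uncertainty mass
    return probs, uncertainty.squeeze(1)

probs, uncertainty_map = evidential_outputs(torch.randn(2, 3, 64, 64))
print(probs.shape, uncertainty_map.shape)
```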
+
+
+
+
+ + ♻ ☆ Task-Oriented Communication for Edge Video Analytics + + +
+ With the development of artificial intelligence (AI) techniques and the +increasing popularity of camera-equipped devices, many edge video analytics +applications are emerging, calling for the deployment of computation-intensive +AI models at the network edge. Edge inference is a promising solution to move +the computation-intensive workloads from low-end devices to a powerful edge +server for video analytics, but the device-server communications will remain a +bottleneck due to the limited bandwidth. This paper proposes a task-oriented +communication framework for edge video analytics, where multiple devices +collect the visual sensory data and transmit the informative features to an +edge server for processing. To enable low-latency inference, this framework +removes video redundancy in spatial and temporal domains and transmits minimal +information that is essential for the downstream task, rather than +reconstructing the videos at the edge server. Specifically, it extracts compact +task-relevant features based on the deterministic information bottleneck (IB) +principle, which characterizes a tradeoff between the informativeness of the +features and the communication cost. As the features of consecutive frames are +temporally correlated, we propose a temporal entropy model (TEM) to reduce the +bitrate by taking the previous features as side information in feature +encoding. To further improve the inference performance, we build a +spatial-temporal fusion module at the server to integrate features of the +current and previous frames for joint inference. Extensive experiments on video +analytics tasks evidence that the proposed framework effectively encodes +task-relevant information of video data and achieves a better rate-performance +tradeoff than existing methods. + +
+
+ comment: This paper was accepted to IEEE Transactions on Wireless + Communications (TWC) +
+
+
+
+
+ + ♻ ☆ Masked Autoencoders in 3D Point Cloud Representation Learning + + +
+ Transformer-based Self-supervised Representation Learning methods learn +generic features from unlabeled datasets for providing useful network +initialization parameters for downstream tasks. Recently, self-supervised +learning based upon masking local surface patches for 3D point cloud data has +been under-explored. In this paper, we propose masked Autoencoders in 3D point +cloud representation learning (abbreviated as MAE3D), a novel autoencoding +paradigm for self-supervised learning. We first split the input point cloud +into patches and mask a portion of them, then use our Patch Embedding Module to +extract the features of unmasked patches. Secondly, we employ patch-wise MAE3D +Transformers to learn both local features of point cloud patches and high-level +contextual relationships between patches and complete the latent +representations of masked patches. We use our Point Cloud Reconstruction Module +with multi-task loss to complete the incomplete point cloud as a result. We +conduct self-supervised pre-training on ShapeNet55 with the point cloud +completion pre-text task and fine-tune the pre-trained model on ModelNet40 and +ScanObjectNN (PB\_T50\_RS, the hardest variant). Comprehensive experiments +demonstrate that the local features extracted by our MAE3D from point cloud +patches are beneficial for downstream classification tasks, soundly +outperforming state-of-the-art methods ($93.4\%$ and $86.2\%$ classification +accuracy, respectively). + +
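The patch-splitting and masking step can be illustrated as below. Real implementations typically group points with farthest point sampling plus kNN; this sketch simplifies that to random grouping, and the patch sizes and mask ratio are assumptions.

```python
import torch

def mask_point_patches(points, num_patches=64, patch_size=32, mask_ratio=0.6):
    """Group a point cloud (N, 3) into patches and split them into visible
    and masked sets, as in masked autoencoding for point clouds."""
    n = points.shape[0]
    idx = torch.randperm(n)[: num_patches * patch_size]
    patches = points[idx].view(num_patches, patch_size, 3)
    num_masked = int(mask_ratio * num_patches)
    perm = torch.randperm(num_patches)
    visible = patches[perm[num_masked:]]   # fed to the encoder
    masked = patches[perm[:num_masked]]    # reconstruction targets
    return visible, masked

visible, masked = mask_point_patches(torch.rand(4096, 3))
print(visible.shape, masked.shape)
```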
+
+ comment: Accepted to IEEE Transactions on Multimedia +
+
+
+
+
+ + ♻ ☆ ARNOLD: A Benchmark for Language-Grounded Task Learning With Continuous + States in Realistic 3D Scenes ICCV 2023 + + +
+ Understanding the continuous states of objects is essential for task learning +and planning in the real world. However, most existing task learning benchmarks +assume discrete (e.g., binary) object goal states, which poses challenges for +the learning of complex tasks and transferring learned policy from simulated +environments to the real world. Furthermore, state discretization limits a +robot's ability to follow human instructions based on the grounding of actions +and states. To tackle these challenges, we present ARNOLD, a benchmark that +evaluates language-grounded task learning with continuous states in realistic +3D scenes. ARNOLD is comprised of 8 language-conditioned tasks that involve +understanding object states and learning policies for continuous goals. To +promote language-instructed learning, we provide expert demonstrations with +template-generated language descriptions. We assess task performance by +utilizing the latest language-conditioned policy learning models. Our results +indicate that current models for language-conditioned manipulations continue to +experience significant challenges in novel goal-state generalizations, scene +generalizations, and object generalizations. These findings highlight the need +to develop new algorithms that address this gap and underscore the potential +for further research in this area. Project website: +https://arnold-benchmark.github.io. + +
+
+ comment: The first two authors contributed equally; 20 pages; 17 figures; + project availalbe: https://arnold-benchmark.github.io/ ICCV 2023 +
+
+
+
+
+ + ♻ ☆ Document Understanding Dataset and Evaluation (DUDE) ICCV 2023 + + +
+ We call on the Document AI (DocAI) community to reevaluate current +methodologies and embrace the challenge of creating more practically-oriented +benchmarks. Document Understanding Dataset and Evaluation (DUDE) seeks to +remediate the halted research progress in understanding visually-rich documents +(VRDs). We present a new dataset with novelties related to types of questions, +answers, and document layouts based on multi-industry, multi-domain, and +multi-page VRDs of various origins, and dates. Moreover, we are pushing the +boundaries of current methods by creating multi-task and multi-domain +evaluation setups that more accurately simulate real-world situations where +powerful generalization and adaptation under low-resource settings are desired. +DUDE aims to set a new standard as a more practical, long-standing benchmark +for the community, and we hope that it will lead to future extensions and +contributions that address real-world challenges. Finally, our work illustrates +the importance of finding more efficient ways to model language, images, and +layout in DocAI. + +
+
+ comment: Accepted at ICCV 2023 +
+
+
+
+
+ + ♻ ☆ MC-Blur: A Comprehensive Benchmark for Image Deblurring + + +
+ Blur artifacts can seriously degrade the visual quality of images, and +numerous deblurring methods have been proposed for specific scenarios. However, +in most real-world images, blur is caused by different factors, e.g., motion +and defocus. In this paper, we address how different deblurring methods perform +in the case of multiple types of blur. For in-depth performance evaluation, we +construct a new large-scale multi-cause image deblurring dataset (called +MC-Blur), including real-world and synthesized blurry images with mixed factors +of blurs. The images in the proposed MC-Blur dataset are collected using +different techniques: averaging sharp images captured by a 1000-fps high-speed +camera, convolving Ultra-High-Definition (UHD) sharp images with large-size +kernels, adding defocus to images, and real-world blurry images captured by +various camera models. Based on the MC-Blur dataset, we conduct extensive +benchmarking studies to compare SOTA methods in different scenarios, analyze +their efficiency, and investigate the built dataset's capacity. These +benchmarking results provide a comprehensive overview of the advantages and +limitations of current deblurring methods, and reveal the advances of our +dataset. + +
+
+ comment: To appear in IEEE TCSVT +
+
+
+
+
+ + ♻ ☆ End2End Multi-View Feature Matching with Differentiable Pose + Optimization ICCV 2023 + + +
+ Erroneous feature matches have severe impact on subsequent camera pose +estimation and often require additional, time-costly measures, like RANSAC, for +outlier rejection. Our method tackles this challenge by addressing feature +matching and pose optimization jointly. To this end, we propose a graph +attention network to predict image correspondences along with confidence +weights. The resulting matches serve as weighted constraints in a +differentiable pose estimation. Training feature matching with gradients from +pose optimization naturally learns to down-weight outliers and boosts pose +estimation on image pairs compared to SuperGlue by 6.7% on ScanNet. At the same +time, it reduces the pose estimation time by over 50% and renders RANSAC +iterations unnecessary. Moreover, we integrate information from multiple views +by spanning the graph across multiple frames to predict the matches all at +once. Multi-view matching combined with end-to-end training improves the pose +estimation metrics on Matterport3D by 18.5% compared to SuperGlue. + +
+
+ comment: ICCV 2023, project page: + https://barbararoessle.github.io/e2e_multi_view_matching , video: + https://youtu.be/uuLb6GfM9Cg +
+
+
+
+
+ + ♻ ☆ DWRSeg: Rethinking Efficient Acquisition of Multi-scale Contextual + Information for Real-time Semantic Segmentation + + +
+ Many current works directly adopt multi-rate depth-wise dilated convolutions to capture multi-scale contextual information simultaneously from one input feature map, thus improving the feature extraction efficiency for real-time semantic segmentation. However, this design may make multi-scale contextual information difficult to access because of an unreasonable structure and hyperparameters. To lower the difficulty of extracting multi-scale contextual information, we propose a highly efficient multi-scale feature extraction method, which decomposes the original single-step approach into two steps: region residualization and semantic residualization. In this method, the multi-rate depth-wise dilated convolutions play a simpler role in feature extraction: in the second step they perform simple semantic-based morphological filtering, each with one desired receptive field, on the concise region-form feature maps provided by the first step, which improves their efficiency. Moreover, the dilation rates and the capacity of the dilated convolutions for each network stage are elaborated to fully utilize all of the achievable region-form feature maps. Accordingly, we design a novel Dilation-wise Residual (DWR) module and a Simple Inverted Residual (SIR) module for the high-level and low-level parts of the network, respectively, and form a powerful DWR Segmentation (DWRSeg) network. Extensive experiments on the Cityscapes and CamVid datasets demonstrate the effectiveness of our method, which achieves a state-of-the-art trade-off between accuracy and inference speed while also being lighter weight. Without pretraining or resorting to any training trick, we achieve an mIoU of 72.7% on the Cityscapes test set at a speed of 319.5 FPS on one NVIDIA GeForce GTX 1080 Ti card, exceeding the latest methods by 69.5 FPS and 0.8% mIoU. The code and trained models are publicly available. + 
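A rough PyTorch sketch of the two-step idea (region residualization followed by multi-rate depth-wise dilated filtering and a residual fusion). Channel counts, dilation rates, and the fusion layer are illustrative assumptions rather than the paper's exact DWR module.

```python
import torch
import torch.nn as nn

class DWRBlock(nn.Module):
    """Two-step multi-scale extraction: a plain 3x3 conv first produces concise
    region features, then depth-wise dilated convolutions with different rates
    filter them, and the result is added back residually."""
    def __init__(self, channels=64, rates=(1, 3, 5)):
        super().__init__()
        self.region = nn.Sequential(
            nn.Conv2d(channels, channels, 3, padding=1, bias=False),
            nn.BatchNorm2d(channels), nn.ReLU(inplace=True))
        self.semantic = nn.ModuleList([
            nn.Conv2d(channels, channels, 3, padding=r, dilation=r,
                      groups=channels, bias=False) for r in rates])
        self.fuse = nn.Conv2d(channels * len(rates), channels, 1, bias=False)

    def forward(self, x):
        region = self.region(x)                            # step 1: region residualization
        multi = [conv(region) for conv in self.semantic]   # step 2: semantic residualization
        return x + self.fuse(torch.cat(multi, dim=1))

out = DWRBlock()(torch.rand(1, 64, 32, 32))
print(out.shape)   # torch.Size([1, 64, 32, 32])
```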
+
+
+
+
+ + ♻ ☆ ClipSitu: Effectively Leveraging CLIP for Conditional Predictions in + Situation Recognition + + +
+ Situation Recognition is the task of generating a structured summary of what is happening in an image using an activity verb and the semantic roles played by actors and objects. In this task, the same activity verb can describe a diverse set of situations, and the same actor or object category can play a diverse set of semantic roles depending on the situation depicted in the image. Hence a situation recognition model needs to understand the context of the image and the visual-linguistic meaning of semantic roles. Therefore, we leverage the CLIP foundational model that has learned the context of images via language descriptions. We show that deeper-and-wider multi-layer perceptron (MLP) blocks obtain noteworthy results for the situation recognition task by using CLIP image and text embedding features, and even outperform the state-of-the-art CoFormer, a Transformer-based model, thanks to the external implicit visual-linguistic knowledge encapsulated by CLIP and the expressive power of modern MLP block designs. Motivated by this, we design a cross-attention-based Transformer using CLIP visual tokens that models the relation between textual roles and visual entities. Our cross-attention-based Transformer, ClipSitu XTF, outperforms the existing state of the art by a large margin of 14.1% on semantic role labelling (value) top-1 accuracy on the imSitu dataset. Similarly, ClipSitu XTF obtains state-of-the-art situation localization performance. We will make the code publicly available. + 
+
+ comment: State-of-the-art results on Grounded Situation Recognition +
+
+
+
+
+ + ♻ ☆ SegmentAnything helps microscopy images based automatic and quantitative + organoid detection and analysis SP + + +
+ Organoids are self-organized 3D cell clusters that closely mimic the architecture and function of in vivo tissues and organs. Quantification of organoid morphology helps in studying organ development, drug discovery, and toxicity assessment. Recent microscopy techniques provide a potent tool to acquire organoid morphology features, but manual image analysis remains a labor- and time-intensive process. Thus, this paper proposes a comprehensive pipeline for microscopy analysis that leverages SegmentAnything to precisely demarcate individual organoids. Additionally, we introduce a set of morphological properties, including perimeter, area, radius, non-smoothness, and non-circularity, allowing researchers to analyze organoid structures quantitatively and automatically. To validate the effectiveness of our approach, we conducted tests on bright-field images of human induced pluripotent stem cell (iPSC)-derived neural-epithelial (NE) organoids. The results obtained from our automatic pipeline closely align with manual organoid detection and measurement, showcasing the capability of our proposed method in accelerating organoid morphology analysis. + 
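Once masks are obtained, the listed morphological properties can be computed per organoid from the binary segmentation. A hedged example using scikit-image; the paper's exact definitions of non-smoothness and non-circularity may differ from the ones used here.

```python
import numpy as np
from skimage.measure import label, regionprops

def organoid_morphology(mask):
    """Per-object area, perimeter, equivalent radius, and a simple
    non-circularity score (1 - 4*pi*A/P^2) from a binary mask."""
    stats = []
    for region in regionprops(label(mask.astype(int))):
        area, perim = float(region.area), float(region.perimeter)
        radius = np.sqrt(area / np.pi)
        non_circularity = 1.0 - 4.0 * np.pi * area / max(perim ** 2, 1e-6)
        stats.append({"area": area, "perimeter": perim,
                      "radius": radius, "non_circularity": non_circularity})
    return stats

mask = np.zeros((64, 64), dtype=bool)
mask[16:48, 16:48] = True          # toy "organoid"
print(organoid_morphology(mask))
```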
+
+ comment: submitted to SPIE: Medical Imaging 2024 +
+
+
+
+
+ + ♻ ☆ ExpCLIP: Bridging Text and Facial Expressions via Semantic Alignment + + +
+ The objective of stylized speech-driven facial animation is to create +animations that encapsulate specific emotional expressions. Existing methods +often depend on pre-established emotional labels or facial expression +templates, which may limit the necessary flexibility for accurately conveying +user intent. In this research, we introduce a technique that enables the +control of arbitrary styles by leveraging natural language as emotion prompts. +This technique presents benefits in terms of both flexibility and +user-friendliness. To realize this objective, we initially construct a +Text-Expression Alignment Dataset (TEAD), wherein each facial expression is +paired with several prompt-like descriptions.We propose an innovative automatic +annotation method, supported by Large Language Models (LLMs), to expedite the +dataset construction, thereby eliminating the substantial expense of manual +annotation. Following this, we utilize TEAD to train a CLIP-based model, termed +ExpCLIP, which encodes text and facial expressions into semantically aligned +style embeddings. The embeddings are subsequently integrated into the facial +animation generator to yield expressive and controllable facial animations. +Given the limited diversity of facial emotions in existing speech-driven facial +animation training data, we further introduce an effective Expression Prompt +Augmentation (EPA) mechanism to enable the animation generator to support +unprecedented richness in style control. Comprehensive experiments illustrate +that our method accomplishes expressive facial animation generation and offers +enhanced flexibility in effectively conveying the desired style. + +
+
+
+
+
+ + ♻ ☆ Multimodal Optimal Transport-based Co-Attention Transformer with Global + Structure Consistency for Survival Prediction ICCV 2023 + + +
+ Survival prediction is a complicated ordinal regression task that aims to +predict the ranking risk of death, which generally benefits from the +integration of histology and genomic data. Despite the progress in joint +learning from pathology and genomics, existing methods still suffer from +challenging issues: 1) Due to the large size of pathological images, it is +difficult to effectively represent the gigapixel whole slide images (WSIs). 2) +Interactions within tumor microenvironment (TME) in histology are essential for +survival analysis. Although current approaches attempt to model these +interactions via co-attention between histology and genomic data, they focus on +only dense local similarity across modalities, which fails to capture global +consistency between potential structures, i.e. TME-related interactions of +histology and co-expression of genomic data. To address these challenges, we +propose a Multimodal Optimal Transport-based Co-Attention Transformer framework +with global structure consistency, in which optimal transport (OT) is applied +to match patches of a WSI and genes embeddings for selecting informative +patches to represent the gigapixel WSI. More importantly, OT-based co-attention +provides a global awareness to effectively capture structural interactions +within TME for survival prediction. To overcome high computational complexity +of OT, we propose a robust and efficient implementation over micro-batch of WSI +patches by approximating the original OT with unbalanced mini-batch OT. +Extensive experiments show the superiority of our method on five benchmark +datasets compared to the state-of-the-art methods. The code is released. + +
+
+ comment: 11 pages, 4 figures, accepted by ICCV 2023 +
+
+
+
+
+ + ♻ ☆ VIGC: Visual Instruction Generation and Correction + + +
+ The integration of visual encoders and large language models (LLMs) has driven recent progress in multimodal large language models (MLLMs). However, the scarcity of high-quality instruction-tuning data for vision-language tasks remains a challenge. The current leading paradigm, such as LLaVA, relies on the language-only GPT-4 to generate data, which requires pre-annotated image captions and detection bounding boxes and struggles to understand image details. A practical solution to this problem would be to utilize the available MLLMs themselves to generate instruction data for vision-language tasks. However, it is worth noting that the currently accessible MLLMs are not as powerful as their LLM counterparts, as they tend to produce inadequate responses and generate false information. To address this issue, this paper proposes the Visual Instruction Generation and Correction (VIGC) framework that enables multimodal large language models to generate instruction-tuning data and progressively enhance its quality on-the-fly. Specifically, Visual Instruction Generation (VIG) guides the vision-language model to generate diverse instruction-tuning data. To ensure generation quality, Visual Instruction Correction (VIC) adopts an iterative update mechanism to correct any inaccuracies in data produced by VIG, effectively reducing the risk of hallucination. Leveraging the diverse, high-quality data generated by VIGC, we finetune mainstream models and validate data quality based on various evaluations. Experimental results demonstrate that VIGC not only compensates for the shortcomings of language-only data generation methods, but also effectively enhances the benchmark performance. The models, datasets, and code are available at https://opendatalab.github.io/VIGC. + 
+
+ comment: Project Website: https://opendatalab.github.io/VIGC, Code and + Pretrained Model: https://github.com/opendatalab/VIGC, Dataset: + https://opendatalab.com/OpenDataLab/VIGC-InstData +
+
+
+
+
+ + ♻ ☆ MLLM-DataEngine: An Iterative Refinement Approach for MLLM + + +
+ Despite the great advances of Multimodal Large Language Models (MLLMs) in both instruction dataset building and benchmarking, the independence of training and evaluation makes it hard for current MLLMs to further improve under the guidance of evaluation results at a relatively low human cost. In this paper, we propose MLLM-DataEngine, a novel closed-loop system that bridges data generation, model training, and evaluation. Within each loop iteration, the MLLM-DataEngine first analyzes the weaknesses of the model based on the evaluation results, then generates a proper incremental dataset for the next training iteration, enhancing the model's capability iteratively. Compared with previous data collection methods, which are separate from benchmarking, the data generated by MLLM-DataEngine shows better targeting, quality, and correctness. For targeting, we propose an Adaptive Bad-case Sampling module, which adjusts the ratio of different types of data within each incremental dataset based on the benchmarking results. For quality, we resort to GPT-4 to generate high-quality data of each given data type. For correctness, prompt design is critical to the quality of the generated data. Rather than relying on hand-crafted prompts as in previous work, we propose an Interactive Prompt Optimization strategy, which optimizes the prompt through multi-round interaction between humans and GPT and greatly improves the correctness of the generated data. Through extensive experiments, we find that MLLM-DataEngine can boost MLLM capability in a targeted and automatic manner, with only minimal human participation. We hope it can serve as a general solution for building future MLLMs. The MLLM-DataEngine has been open-sourced and is now available at https://github.com/opendatalab/MLLM-DataEngine. + 
+
+ comment: Code and models are available at + https://github.com/opendatalab/MLLM-DataEngine +
+
+
+
+
+ + ♻ ☆ Stochastic Segmentation with Conditional Categorical Diffusion Models ICCV 2023 + + +
+ Semantic segmentation has made significant progress in recent years thanks to
+deep neural networks, but the common objective of generating a single
+segmentation output that accurately matches the image's content may not be
+suitable for safety-critical domains such as medical diagnostics and autonomous
+driving. Instead, multiple possible correct segmentation maps may be required
+to reflect the true distribution of annotation maps. In this context,
+stochastic semantic segmentation methods must learn to predict conditional
+distributions of labels given the image, but this is challenging due to the
+typically multimodal distributions, high-dimensional output spaces, and limited
+annotation data. To address these challenges, we propose a conditional
+categorical diffusion model (CCDM) for semantic segmentation based on Denoising
+Diffusion Probabilistic Models. Our model is conditioned on the input image,
+enabling it to generate multiple segmentation label maps that account for the
+aleatoric uncertainty arising from divergent ground truth annotations. Our
+experimental results show that CCDM achieves state-of-the-art performance on
+LIDC, a stochastic semantic segmentation dataset, and outperforms established
+baselines on the classical segmentation dataset Cityscapes.
+
+
+
+ comment: Accepted at ICCV 2023. Code available at + https://github.com/LarsDoorenbos/ccdm-stochastic-segmentation +
+
+
+
+
+ + ♻ ☆ Any-Size-Diffusion: Toward Efficient Text-Driven Synthesis for Any-Size + HD Images + + +
+ Stable diffusion, a generative model used in text-to-image synthesis, +frequently encounters resolution-induced composition problems when generating +images of varying sizes. This issue primarily stems from the model being +trained on pairs of single-scale images and their corresponding text +descriptions. Moreover, direct training on images of unlimited sizes is +unfeasible, as it would require an immense number of text-image pairs and +entail substantial computational expenses. To overcome these challenges, we +propose a two-stage pipeline named Any-Size-Diffusion (ASD), designed to +efficiently generate well-composed images of any size, while minimizing the +need for high-memory GPU resources. Specifically, the initial stage, dubbed Any +Ratio Adaptability Diffusion (ARAD), leverages a selected set of images with a +restricted range of ratios to optimize the text-conditional diffusion model, +thereby improving its ability to adjust composition to accommodate diverse +image sizes. To support the creation of images at any desired size, we further +introduce a technique called Fast Seamless Tiled Diffusion (FSTD) at the +subsequent stage. This method allows for the rapid enlargement of the ASD +output to any high-resolution size, avoiding seaming artifacts or memory +overloads. Experimental results on the LAION-COCO and MM-CelebA-HQ benchmarks +demonstrate that ASD can produce well-structured images of arbitrary sizes, +cutting down the inference time by 2x compared to the traditional tiled +algorithm. + +
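+ The tiling idea behind the second stage can be pictured with a small helper
+ that splits a large canvas into overlapping windows whose seams can later be
+ blended; the tile size and overlap below are illustrative defaults, not the
+ paper's settings, and the diffusion/blending steps themselves are omitted.
+
+ def tile_coords(height, width, tile=512, overlap=64):
+     stride = tile - overlap
+     ys = list(range(0, max(height - tile, 0) + 1, stride)) or [0]
+     xs = list(range(0, max(width - tile, 0) + 1, stride)) or [0]
+     # Make sure the last tiles reach the image border.
+     if ys[-1] + tile < height: ys.append(height - tile)
+     if xs[-1] + tile < width: xs.append(width - tile)
+     return [(y, x, y + tile, x + tile) for y in ys for x in xs]
+
+ print(tile_coords(768, 1280))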
+
+
+
+
+ + ♻ ☆ Interpreting and Correcting Medical Image Classification with PIP-Net ECAI 2023 + + +
+ Part-prototype models are explainable-by-design image classifiers, and a +promising alternative to black box AI. This paper explores the applicability +and potential of interpretable machine learning, in particular PIP-Net, for +automated diagnosis support on real-world medical imaging data. PIP-Net learns +human-understandable prototypical image parts and we evaluate its accuracy and +interpretability for fracture detection and skin cancer diagnosis. We find that +PIP-Net's decision making process is in line with medical classification +standards, while only provided with image-level class labels. Because of +PIP-Net's unsupervised pretraining of prototypes, data quality problems such as +undesired text in an X-ray or labelling errors can be easily identified. +Additionally, we are the first to show that humans can manually correct the +reasoning of PIP-Net by directly disabling undesired prototypes. We conclude +that part-prototype models are promising for medical applications due to their +interpretability and potential for advanced model debugging. + +
+
+ comment: Accepted to the International Workshop on Explainable and + Interpretable Machine Learning (XI-ML), co-located with ECAI 2023 +
+
+
+
+
+ + ♻ ☆ Weakly Supervised Visual Question Answer Generation + + +
+ Growing interest in conversational agents has promoted two-way human-computer
+communication, and asking and answering visual questions has become an active
+area of research in AI. Thus, generating visual question-answer pairs becomes
+an important and challenging task. To address this issue, we propose a
+weakly-supervised visual question answer generation method that generates
+relevant question-answer pairs for a given input image and its associated
+caption. Most prior works are supervised and depend on annotated
+question-answer datasets. In our work, we present a weakly supervised method
+that synthetically generates question-answer pairs procedurally from visual
+information and captions. The proposed method first extracts a list of answer
+words, then performs nearest question generation, using the caption and answer
+word to generate a synthetic question. Next, the relevant question generator
+converts the nearest question into a relevant natural-language question via
+dependency parsing and in-order tree traversal; finally, a ViLBERT model is
+fine-tuned with the generated question-answer pairs. We perform an exhaustive
+experimental analysis on the VQA dataset and find that our model significantly
+outperforms SOTA methods on BLEU scores. We also report results with respect to
+baseline models and an ablation study.
+
+
+
+
+
+
+ + ♻ ☆ Lung Nodule Segmentation and Uncertain Region Prediction with an + Uncertainty-Aware Attention Mechanism MICCAI 2022 + + +
+ Radiologists possess diverse training and clinical experiences, leading to
+variations in the segmentation annotations of lung nodules and resulting in
+segmentation uncertainty. Conventional methods typically select a single
+annotation as the learning target or attempt to learn a latent space comprising
+multiple annotations. However, these approaches fail to leverage the valuable
+information inherent in the consensus and disagreements among the multiple
+annotations. In this paper, we propose an Uncertainty-Aware Attention Mechanism
+(UAAM) that utilizes consensus and disagreements among multiple annotations to
+facilitate better segmentation. To this end, we introduce the Multi-Confidence
+Mask (MCM), which combines a Low-Confidence (LC) Mask and a High-Confidence
+(HC) Mask. The LC mask indicates regions with low segmentation confidence,
+where radiologists may have different segmentation choices. Following UAAM, we
+further design an Uncertainty-Guide Multi-Confidence Segmentation Network
+(UGMCS-Net), which contains three modules: a Feature Extracting Module that
+captures a general feature of a lung nodule, an Uncertainty-Aware Module that
+produces three features for the annotations' union, intersection, and
+annotation set, and an Intersection-Union Constraining Module that uses
+distances between the three features to balance the predictions of the final
+segmentation and the MCM. To comprehensively demonstrate the performance of our
+method, we propose a Complex Nodule Validation on LIDC-IDRI, which tests
+UGMCS-Net's segmentation performance on lung nodules that are difficult to
+segment using common methods. Experimental results demonstrate that our method
+can significantly improve the segmentation performance on nodules that are
+difficult to segment using conventional methods.
+
+
+
+ comment: 10 pages, 10 figures. We have reported a preliminary version of this + work in MICCAI 2022 +
+
+
+
+
+ + ♻ ☆ Understanding Video Scenes through Text: Insights from Text-based Video + Question Answering + + +
+ Researchers have extensively studied the field of vision and language, +discovering that both visual and textual content is crucial for understanding +scenes effectively. Particularly, comprehending text in videos holds great +significance, requiring both scene text understanding and temporal reasoning. +This paper focuses on exploring two recently introduced datasets, NewsVideoQA +and M4-ViteVQA, which aim to address video question answering based on textual +content. The NewsVideoQA dataset contains question-answer pairs related to the +text in news videos, while M4-ViteVQA comprises question-answer pairs from +diverse categories like vlogging, traveling, and shopping. We provide an +analysis of the formulation of these datasets on various levels, exploring the +degree of visual understanding and multi-frame comprehension required for +answering the questions. Additionally, the study includes experimentation with +BERT-QA, a text-only model, which demonstrates comparable performance to the +original methods on both datasets, indicating the shortcomings in the +formulation of these datasets. Furthermore, we also look into the domain +adaptation aspect by examining the effectiveness of training on M4-ViteVQA and +evaluating on NewsVideoQA and vice-versa, thereby shedding light on the +challenges and potential benefits of out-of-domain training. + +
+
+
+
+
+ + ♻ ☆ Exploring Semantic Consistency in Unpaired Image Translation to Generate + Data for Surgical Applications + + +
+ In surgical computer vision applications, obtaining labeled training data is +challenging due to data-privacy concerns and the need for expert annotation. +Unpaired image-to-image translation techniques have been explored to +automatically generate large annotated datasets by translating synthetic images +to the realistic domain. However, preserving the structure and semantic +consistency between the input and translated images presents significant +challenges, mainly when there is a distributional mismatch in the semantic +characteristics of the domains. This study empirically investigates unpaired +image translation methods for generating suitable data in surgical +applications, explicitly focusing on semantic consistency. We extensively +evaluate various state-of-the-art image translation models on two challenging +surgical datasets and downstream semantic segmentation tasks. We find that a +simple combination of structural-similarity loss and contrastive learning +yields the most promising results. Quantitatively, we show that the data +generated with this approach yields higher semantic consistency and can be used +more effectively as training data. + +
+
+
+
+
+ + ♻ ☆ SeiT: Storage-Efficient Vision Training with Tokens Using 1% of Pixel + Storage ICCV 2023 + + +
+ We need billion-scale images to achieve more generalizable and +ground-breaking vision models, as well as massive dataset storage to ship the +images (e.g., the LAION-4B dataset needs 240TB storage space). However, it has +become challenging to deal with unlimited dataset storage with limited storage +infrastructure. A number of storage-efficient training methods have been +proposed to tackle the problem, but they are rarely scalable or suffer from +severe damage to performance. In this paper, we propose a storage-efficient +training strategy for vision classifiers for large-scale datasets (e.g., +ImageNet) that only uses 1024 tokens per instance without using the raw level +pixels; our token storage only needs <1% of the original JPEG-compressed raw +pixels. We also propose token augmentations and a Stem-adaptor module to make +our approach able to use the same architecture as pixel-based approaches with +only minimal modifications on the stem layer and the carefully tuned +optimization settings. Our experimental results on ImageNet-1k show that our +method significantly outperforms other storage-efficient training methods with +a large gap. We further show the effectiveness of our method in other practical +scenarios, storage-efficient pre-training, and continual learning. Code is +available at https://github.com/naver-ai/seit + +
+
+ comment: ICCV 2023; First two authors contributed equally; code url: + https://github.com/naver-ai/seit; 17 pages, 1.2MB +
+
+
+
+
+ + ♻ ☆ Boosting Adversarial Transferability with Learnable Patch-wise Masks + + +
+ Adversarial examples have attracted widespread attention in security-critical +applications because of their transferability across different models. Although +many methods have been proposed to boost adversarial transferability, a gap +still exists between capabilities and practical demand. In this paper, we argue +that the model-specific discriminative regions are a key factor causing +overfitting to the source model, and thus reducing the transferability to the +target model. For that, a patch-wise mask is utilized to prune the +model-specific regions when calculating adversarial perturbations. To +accurately localize these regions, we present a learnable approach to +automatically optimize the mask. Specifically, we simulate the target models in +our framework, and adjust the patch-wise mask according to the feedback of the +simulated models. To improve the efficiency, the differential evolutionary (DE) +algorithm is utilized to search for patch-wise masks for a specific image. +During iterative attacks, the learned masks are applied to the image to drop +out the patches related to model-specific regions, thus making the gradients +more generic and improving the adversarial transferability. The proposed +approach is a preprocessing method and can be integrated with existing methods +to further boost the transferability. Extensive experiments on the ImageNet +dataset demonstrate the effectiveness of our method. We incorporate the +proposed approach with existing methods to perform ensemble attacks and achieve +an average success rate of 93.01% against seven advanced defense methods, which +can effectively enhance the state-of-the-art transfer-based attack performance. + +
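+ A toy illustration of the core masking operation: a binary patch-wise mask is
+ upsampled to the pixel grid and used to drop patches of the image before the
+ gradient step of an iterative attack. The differential-evolution search for
+ the mask is omitted, and all shapes and values are made up for the example.
+
+ import numpy as np
+
+ def apply_patch_mask(image, mask, patch=16):
+     # image: (H, W, C) array; mask: (H//patch, W//patch) array of 0/1 entries.
+     up = np.kron(mask, np.ones((patch, patch)))[..., None]  # upsample to pixels
+     return image * up
+
+ img = np.random.rand(64, 64, 3)
+ m = (np.random.rand(4, 4) > 0.3).astype(float)
+ print(apply_patch_mask(img, m).shape)  # -> (64, 64, 3)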
+
+
+
+
+ + ♻ ☆ DeePoint: Visual Pointing Recognition and Direction Estimation ICCV 2023 + + +
+ In this paper, we realize automatic visual recognition and direction +estimation of pointing. We introduce the first neural pointing understanding +method based on two key contributions. The first is the introduction of a +first-of-its-kind large-scale dataset for pointing recognition and direction +estimation, which we refer to as the DP Dataset. DP Dataset consists of more +than 2 million frames of 33 people pointing in various styles annotated for +each frame with pointing timings and 3D directions. The second is DeePoint, a +novel deep network model for joint recognition and 3D direction estimation of +pointing. DeePoint is a Transformer-based network which fully leverages the +spatio-temporal coordination of the body parts, not just the hands. Through +extensive experiments, we demonstrate the accuracy and efficiency of DeePoint. +We believe DP Dataset and DeePoint will serve as a sound foundation for visual +human intention understanding. + +
+
+ comment: to be published in ICCV 2023 +
+
+
+
+
+ + ♻ ☆ TextManiA: Enriching Visual Feature by Text-driven Manifold Augmentation ICCV 2023 + + +
+ We propose TextManiA, a text-driven manifold augmentation method that +semantically enriches visual feature spaces, regardless of class distribution. +TextManiA augments visual data with intra-class semantic perturbation by +exploiting easy-to-understand visually mimetic words, i.e., attributes. This +work is built on an interesting hypothesis that general language models, e.g., +BERT and GPT, encompass visual information to some extent, even without +training on visual training data. Given the hypothesis, TextManiA transfers +pre-trained text representation obtained from a well-established large language +encoder to a target visual feature space being learned. Our extensive analysis +hints that the language encoder indeed encompasses visual information at least +useful to augment visual representation. Our experiments demonstrate that +TextManiA is particularly powerful in scarce samples with class imbalance as +well as even distribution. We also show compatibility with the label mix-based +approaches in evenly distributed scarce data. + +
+
+ comment: Accepted at ICCV 2023. [Project Pages] https://textmania.github.io/ +
+
+
+
+
+ + ♻ ☆ SYNAuG: Exploiting Synthetic Data for Data Imbalance Problems + + +
+ We live in an era of data floods, and deep neural networks play a pivotal +role in this moment. Natural data inherently exhibits several challenges such +as long-tailed distribution and model fairness, where data imbalance is at the +center of fundamental issues. This imbalance poses a risk of deep neural +networks producing biased predictions, leading to potentially severe ethical +and social problems. To address these problems, we leverage the recent +generative models advanced in generating high-quality images. In this work, we +propose SYNAuG, which utilizes synthetic data to uniformize the given imbalance +distribution followed by a simple post-calibration step considering the domain +gap between real and synthetic data. This straightforward approach yields +impressive performance on datasets for distinctive data imbalance problems such +as CIFAR100-LT, ImageNet100-LT, UTKFace, and Waterbirds, surpassing the +performance of existing task-specific methods. While we do not claim that our +approach serves as a complete solution to the problem of data imbalance, we +argue that supplementing the existing data with synthetic data proves to be an +effective and crucial step in addressing data imbalance concerns. + +
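+ A simple way to picture the "uniformize with synthetic data" step: count the
+ real samples per class and top every class up to the size of the largest one.
+ How the synthetic images are generated and the post-calibration step are
+ outside this sketch.
+
+ from collections import Counter
+
+ def synthetic_budget(labels):
+     # Returns how many synthetic samples each class needs to match the head class.
+     counts = Counter(labels)
+     target = max(counts.values())
+     return {c: target - n for c, n in counts.items()}
+
+ print(synthetic_budget(["cat"] * 100 + ["dog"] * 25 + ["bird"] * 5))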
+
+
+
+
+ + ♻ ☆ Self-supervised Learning to Bring Dual Reversed Rolling Shutter Images + Alive SC + + +
+ Modern consumer cameras usually employ the rolling shutter (RS) mechanism,
+where images are captured by scanning scenes row-by-row, yielding RS
+distortions for dynamic scenes. To correct RS distortions, existing methods
+adopt a fully supervised learning scheme, where high framerate global shutter
+(GS) images should be collected as ground-truth supervision. In this paper, we
+propose a Self-supervised learning framework for Dual reversed RS distortions
+Correction (SelfDRSC), where a DRSC network can be learned to generate a high
+framerate GS video based only on dual RS images with reversed distortions. In
+particular, a bidirectional distortion warping module is proposed for
+reconstructing dual reversed RS images, and then a self-supervised loss can be
+deployed to train the DRSC network by enhancing the cycle consistency between
+input and reconstructed dual reversed RS images. Besides the start and end RS
+scanning times, GS images at arbitrary intermediate scanning times can also be
+supervised in SelfDRSC, thus enabling the learned DRSC network to generate a
+high framerate GS video. Moreover, a simple yet effective self-distillation
+strategy is introduced in the self-supervised loss for mitigating boundary
+artifacts in generated GS images. On the synthetic dataset, SelfDRSC achieves
+better or comparable quantitative metrics in comparison to state-of-the-art
+methods trained in a fully supervised manner. On real-world RS cases, our
+SelfDRSC can produce high framerate GS videos with finer correction textures
+and better temporal consistency. The source code and trained models are made
+publicly available at https://github.com/shangwei5/SelfDRSC. We also provide an
+implementation in HUAWEI Mindspore at
+https://github.com/Hunter-Will/SelfDRSC-mindspore.
+
+
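+ One plausible form of the cycle-consistency objective sketched above:
+ reconstruct the two input reversed rolling-shutter images from the predicted
+ GS frames and penalize the difference. The bidirectional warping module is
+ assumed to exist elsewhere, and the L1 penalty is an illustrative choice.
+
+ import torch.nn.functional as F
+
+ def self_supervised_loss(rs_top2bottom, rs_bottom2top, recon_t2b, recon_b2t):
+     # All tensors: (B, C, H, W); recon_* come from warping the predicted GS frames.
+     return F.l1_loss(recon_t2b, rs_top2bottom) + F.l1_loss(recon_b2t, rs_bottom2top)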
+
+ comment: 16 pages, 16 figures, available at + https://github.com/shangwei5/SelfDRSC +
+
+
+
+
+ + ♻ ☆ Noise-Tolerant Learning for Audio-Visual Action Recognition + + +
+ Recently, video recognition has been advancing with the help of multi-modal
+learning, which focuses on integrating distinct modalities to improve the
+performance or robustness of the model. Although various multi-modal learning
+methods have been proposed and offer remarkable recognition results, almost all
+of these methods rely on high-quality manual annotations and assume that
+modalities among multi-modal data provide semantically relevant information.
+Unfortunately, the widely used video datasets are usually coarse-annotated or
+collected from the Internet. Thus, they inevitably contain a portion of noisy
+labels and noisy correspondences. To address this challenge, we use the
+audio-visual action recognition task as a proxy and propose a noise-tolerant
+learning framework to find anti-interference model parameters against both
+noisy labels and noisy correspondence. Specifically, our method consists of two
+phases that aim to rectify noise by the inherent correlation between
+modalities. First, a noise-tolerant contrastive training phase is performed to
+make the model immune to possible noisy-labeled data. To alleviate the
+influence of noisy correspondence, we propose a cross-modal noise estimation
+component to adjust the consistency between different modalities. As the noisy
+correspondence exists at the instance level, we further propose a
+category-level contrastive loss to reduce its interference. Second, in the
+hybrid-supervised training phase, we calculate the distance metric among
+features to obtain corrected labels, which are used as complementary
+supervision to guide the training. Extensive experiments on a wide range of
+noise levels demonstrate that our method significantly improves the robustness
+of the action recognition model and surpasses the baselines by a clear margin.
+
+
+
+ comment: This work has been submitted to the IEEE for possible publication. + Copyright may be transferred without notice, after which this version may no + longer be accessible +
+
+
+
+
+ + ♻ ☆ The Change You Want to See (Now in 3D) + + +
+ The goal of this paper is to detect what has changed, if anything, between +two "in the wild" images of the same 3D scene acquired from different camera +positions and at different temporal instances. The open-set nature of this +problem, occlusions/dis-occlusions due to the shift in viewpoint, and the lack +of suitable training datasets, presents substantial challenges in devising a +solution. + To address this problem, we contribute a change detection model that is +trained entirely on synthetic data and is class-agnostic, yet it is performant +out-of-the-box on real world images without requiring fine-tuning. Our solution +entails a "register and difference" approach that leverages self-supervised +frozen embeddings and feature differences, which allows the model to generalise +to a wide variety of scenes and domains. The model is able to operate directly +on two RGB images, without requiring access to ground truth camera intrinsics, +extrinsics, depth maps, point clouds, or additional before-after images. +Finally, we collect and release a new evaluation dataset consisting of +real-world image pairs with human-annotated differences and demonstrate the +efficacy of our method. The code, datasets and pre-trained model can be found +at: https://github.com/ragavsachdeva/CYWS-3D + +
+
+
+
+
+ + ♻ ☆ High Frequency, High Accuracy Pointing onboard Nanosats using + Neuromorphic Event Sensing and Piezoelectric Actuation + + +
+ As satellites become smaller, the ability to maintain stable pointing +decreases as external forces acting on the satellite come into play. At the +same time, reaction wheels used in the attitude determination and control +system (ADCS) introduce high frequency jitter which can disrupt pointing +stability. For space domain awareness (SDA) tasks that track objects tens of +thousands of kilometres away, the pointing accuracy offered by current +nanosats, typically in the range of 10 to 100 arcseconds, is not sufficient. In +this work, we develop a novel payload that utilises a neuromorphic event sensor +(for high frequency and highly accurate relative attitude estimation) paired in +a closed loop with a piezoelectric stage (for active attitude corrections) to +provide highly stable sensor-specific pointing. Event sensors are especially +suited for space applications due to their desirable characteristics of low +power consumption, asynchronous operation, and high dynamic range. We use the +event sensor to first estimate a reference background star field from which +instantaneous relative attitude is estimated at high frequency. The +piezoelectric stage works in a closed control loop with the event sensor to +perform attitude corrections based on the discrepancy between the current and +desired attitude. Results in a controlled setting show that we can achieve a +pointing accuracy in the range of 1-5 arcseconds using our novel payload at an +operating frequency of up to 50Hz using a prototype built from +commercial-off-the-shelf components. Further details can be found at +https://ylatif.github.io/ultrafinestabilisation + +
+
+
+
+
+ + ♻ ☆ Multi-stage Factorized Spatio-Temporal Representation for RGB-D Action + and Gesture Recognition ACM MM'23 + + +
+ RGB-D action and gesture recognition remain an interesting topic in +human-centered scene understanding, primarily due to the multiple granularities +and large variation in human motion. Although many RGB-D based action and +gesture recognition approaches have demonstrated remarkable results by +utilizing highly integrated spatio-temporal representations across multiple +modalities (i.e., RGB and depth data), they still encounter several challenges. +Firstly, vanilla 3D convolution makes it hard to capture fine-grained motion +differences between local clips under different modalities. Secondly, the +intricate nature of highly integrated spatio-temporal modeling can lead to +optimization difficulties. Thirdly, duplicate and unnecessary information can +add complexity and complicate entangled spatio-temporal modeling. To address +the above issues, we propose an innovative heuristic architecture called +Multi-stage Factorized Spatio-Temporal (MFST) for RGB-D action and gesture +recognition. The proposed MFST model comprises a 3D Central Difference +Convolution Stem (CDC-Stem) module and multiple factorized spatio-temporal +stages. The CDC-Stem enriches fine-grained temporal perception, and the +multiple hierarchical spatio-temporal stages construct dimension-independent +higher-order semantic primitives. Specifically, the CDC-Stem module captures +bottom-level spatio-temporal features and passes them successively to the +following spatio-temporal factored stages to capture the hierarchical spatial +and temporal features through the Multi- Scale Convolution and Transformer +(MSC-Trans) hybrid block and Weight-shared Multi-Scale Transformer (WMS-Trans) +block. The seamless integration of these innovative designs results in a robust +spatio-temporal representation that outperforms state-of-the-art approaches on +RGB-D action and gesture recognition datasets. + +
+
+ comment: ACM MM'23 +
+
+
+
+
+ + ♻ ☆ Recursive Cross-View: Use Only 2D Detectors to Achieve 3D Object + Detection without 3D Annotations + + +
+ Heavily relying on 3D annotations limits the real-world application of 3D
+object detection. In this paper, we propose a method that does not demand any
+3D annotation, while being able to predict full-oriented 3D bounding boxes. Our
+method, called Recursive Cross-View (RCV), transforms 3D detection into several
+2D detection tasks, which only consume some 2D labels, based on the three-view
+principle. We propose a recursive paradigm, in which instance segmentation and
+3D bounding box generation by Cross-View are implemented recursively until
+convergence. Specifically, a frustum is proposed via a 2D detector, followed by
+the recursive paradigm that finally outputs a full-oriented 3D box, class, and
+score. To show that our method can be quickly applied to new tasks in
+real-world scenarios, we conduct three experiments, namely indoor 3D human
+detection, full-oriented 3D hand detection, and real-time detection on a real
+3D sensor. RCV achieves decent performance in these experiments. Once trained,
+our method can be viewed as a 3D annotation tool. Consequently, we construct
+two 3D labeled datasets, namely '3D_HUMAN' and 'D_HAND', based on RCV, which
+could be used to pre-train other 3D detectors. Furthermore, evaluated on the
+SUN RGB-D benchmark, our method achieves performance comparable to some fully
+supervised 3D learning methods. RCV is the first 3D detection method that does
+not consume 3D labels and yields full-oriented 3D boxes on point clouds.
+
+
+
+ comment: 10 pages, 7 figures +
+
+
+
+
+ + ♻ ☆ Commonsense Knowledge Assisted Deep Learning with Application to + Size-Related Fine-Grained Object Detection + + +
+ In this paper, we focus on a scenario where a single image contains objects +of the same category but varying sizes, and we propose a lightweight approach +that can not only recognize their category labels but also their real sizes. +Our approach utilizes commonsense knowledge to assist a deep neural network +(DNN) based coarse-grained object detector to achieve accurate size-related +fine-grained detection. Specifically, we introduce a commonsense knowledge +inference module (CKIM) that maps the coarse-grained labels produced by the DL +detector to size-related fine-grained labels. Experimental results demonstrate +that our approach achieves accurate fine-grained detections with a reduced +amount of annotated data, and smaller model size, compared with baseline +methods. Our code is available at: https://github.com/ZJLAB-AMMI/CKIM. + +
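+ An illustrative, hypothetical commonsense rule of the kind such a mapping
+ module might encode: combine a coarse detector label with an estimated
+ physical size to produce a size-related fine-grained label. The thresholds and
+ label names are invented for the example and are not taken from the paper.
+
+ def refine_label(coarse_label, est_height_m):
+     # Each rule: (maximum height in metres, fine-grained label).
+     rules = {"dog": [(0.3, "small dog"), (0.6, "medium dog"), (float("inf"), "large dog")]}
+     for max_h, fine in rules.get(coarse_label, []):
+         if est_height_m <= max_h:
+             return fine
+     return coarse_label
+
+ print(refine_label("dog", 0.45))  # -> "medium dog"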
+
+ comment: 10 pages +
+
+
+
+
+ + ♻ ☆ NICE: CVPR 2023 Challenge on Zero-shot Image Captioning + + +
+ In this report, we introduce NICE (New frontiers for zero-shot Image +Captioning Evaluation) project and share the results and outcomes of 2023 +challenge. This project is designed to challenge the computer vision community +to develop robust image captioning models that advance the state-of-the-art +both in terms of accuracy and fairness. Through the challenge, the image +captioning models were tested using a new evaluation dataset that includes a +large variety of visual concepts from many domains. There was no specific +training data provided for the challenge, and therefore the challenge entries +were required to adapt to new types of image descriptions that had not been +seen during training. This report includes information on the newly proposed +NICE dataset, evaluation methods, challenge results, and technical details of +top-ranking entries. We expect that the outcomes of the challenge will +contribute to the improvement of AI models on various vision-language tasks. + +
+
+ comment: Tech report, project page https://nice.lgresearch.ai/ +
+
+
+
+
+ + ♻ ☆ Cross-Modality Neuroimage Synthesis: A Survey + + +
+ The existence of completely aligned and paired multi-modal neuroimaging data
+has proved its effectiveness in the diagnosis of brain diseases. However,
+collecting the full set of well-aligned and paired data is expensive or even
+impractical, since the practical difficulties may include high cost, long
+acquisition times, image corruption, and privacy issues. A realistic solution
+is to explore either unsupervised or semi-supervised learning to synthesize the
+absent neuroimaging data. In this paper, we are the first to comprehensively
+approach the cross-modality neuroimage synthesis task from different
+perspectives, which include the level of supervision (especially
+weakly-supervised and unsupervised), loss functions, evaluation metrics, the
+range of modality synthesis, datasets (aligned, private and public) and
+synthesis-based downstream tasks. To begin with, we highlight several open
+challenges for cross-modality neuroimage synthesis. Then we summarize the
+architectures of cross-modality synthesis under various levels of supervision.
+In addition, we provide an in-depth analysis of how cross-modality neuroimage
+synthesis can improve the performance of different downstream tasks. Finally,
+we re-evaluate the open challenges and point out future directions for the
+remaining challenges. All resources are available at
+https://github.com/M-3LAB/awesome-multimodal-brain-image-systhesis
+
+
+
+
+
+
+ + ♻ ☆ GIFD: A Generative Gradient Inversion Method with Feature Domain + Optimization ICCV 2023 + + +
+ Federated Learning (FL) has recently emerged as a promising distributed +machine learning framework to preserve clients' privacy, by allowing multiple +clients to upload the gradients calculated from their local data to a central +server. Recent studies find that the exchanged gradients also take the risk of +privacy leakage, e.g., an attacker can invert the shared gradients and recover +sensitive data against an FL system by leveraging pre-trained generative +adversarial networks (GAN) as prior knowledge. However, performing gradient +inversion attacks in the latent space of the GAN model limits their expression +ability and generalizability. To tackle these challenges, we propose +\textbf{G}radient \textbf{I}nversion over \textbf{F}eature \textbf{D}omains +(GIFD), which disassembles the GAN model and searches the feature domains of +the intermediate layers. Instead of optimizing only over the initial latent +code, we progressively change the optimized layer, from the initial latent +space to intermediate layers closer to the output images. In addition, we +design a regularizer to avoid unreal image generation by adding a small ${l_1}$ +ball constraint to the searching range. We also extend GIFD to the +out-of-distribution (OOD) setting, which weakens the assumption that the +training sets of GANs and FL tasks obey the same data distribution. Extensive +experiments demonstrate that our method can achieve pixel-level reconstruction +and is superior to the existing methods. Notably, GIFD also shows great +generalizability under different defense strategy settings and batch sizes. + +
+
+ comment: ICCV 2023 +
+
+
+
+
+ + ♻ ☆ Over-Sampling Strategy in Feature Space for Graphs based + Class-imbalanced Bot Detection + + +
+ The presence of a large number of bots in Online Social Networks (OSN) leads +to undesirable social effects. Graph neural networks (GNNs) are effective in +detecting bots as they utilize user interactions. However, class-imbalanced +issues can affect bot detection performance. To address this, we propose an +over-sampling strategy for GNNs (OS-GNN) that generates samples for the +minority class without edge synthesis. First, node features are mapped to a +feature space through neighborhood aggregation. Then, we generate samples for +the minority class in the feature space. Finally, the augmented features are +used to train the classifiers. This framework is general and can be easily +extended into different GNN architectures. The proposed framework is evaluated +using three real-world bot detection benchmark datasets, and it consistently +exhibits superiority over the baselines. + +
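+ A minimal sketch of over-sampling the minority class in feature space without
+ synthesizing edges: after neighborhood aggregation, new minority samples are
+ obtained by SMOTE-style interpolation between existing minority feature
+ vectors. The aggregation step is assumed to have been run already, and the
+ interpolation scheme is an illustrative choice rather than the paper's exact one.
+
+ import numpy as np
+
+ def oversample_features(minority_feats, n_new, rng=np.random.default_rng(0)):
+     # minority_feats: (N, D) aggregated features of minority-class nodes.
+     idx_a = rng.integers(0, len(minority_feats), size=n_new)
+     idx_b = rng.integers(0, len(minority_feats), size=n_new)
+     lam = rng.random((n_new, 1))
+     return minority_feats[idx_a] + lam * (minority_feats[idx_b] - minority_feats[idx_a])
+
+ print(oversample_features(np.random.rand(10, 8), 5).shape)  # -> (5, 8)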
+
+ comment: 5 pages, 4 figures +
+
+
+
+
+ + ♻ ☆ From Hope to Safety: Unlearning Biases of Deep Models by Enforcing the + Right Reasons in Latent Space + + +
+ Deep Neural Networks are prone to learning spurious correlations embedded in +the training data, leading to potentially biased predictions. This poses risks +when deploying these models for high-stake decision-making, such as in medical +applications. Current methods for post-hoc model correction either require +input-level annotations, which are only possible for spatially localized +biases, or augment the latent feature space, thereby hoping to enforce the +right reasons. We present a novel method ensuring the right reasons on the +concept level by reducing the model's sensitivity towards biases through the +gradient. When modeling biases via Concept Activation Vectors, we highlight the +importance of choosing robust directions, as traditional regression-based +approaches such as Support Vector Machines tend to result in diverging +directions. We effectively mitigate biases in controlled and real-world +settings on the ISIC, Bone Age, ImageNet and CelebA datasets using VGG, ResNet +and EfficientNet architectures. + +
+
+
+
+
+ + ♻ ☆ Deep DA for Ordinal Regression of Pain Intensity Estimation Using + Weakly-Labeled Videos + + +
+ Automatic estimation of pain intensity from facial expressions in videos has
+immense potential in health care applications. However, domain adaptation (DA)
+is needed to alleviate the problem of domain shifts that typically occur
+between video data captured in source and target domains. Given the laborious
+task of collecting and annotating videos, and the subjective bias due to
+ambiguity among adjacent intensity levels, weakly-supervised learning (WSL) is
+gaining attention in such applications. Yet, most state-of-the-art WSL models
+are typically formulated as regression problems, and do not leverage the
+ordinal relation between intensity levels, nor the temporal coherence of
+multiple consecutive frames. This paper introduces a new deep learning model
+for weakly-supervised DA with ordinal regression (WSDA-OR), where videos in the
+target domain have coarse labels provided on a periodic basis. The WSDA-OR
+model enforces ordinal relationships among the intensity levels assigned to the
+target sequences, and associates multiple relevant frames to sequence-level
+labels (instead of a single frame). In particular, it learns discriminant and
+domain-invariant feature representations by integrating multiple instance
+learning with deep adversarial DA, where soft Gaussian labels are used to
+efficiently represent the weak ordinal sequence-level labels from the target
+domain. The proposed approach was validated on the RECOLA video dataset as the
+fully-labeled source domain, and UNBC-McMaster video data as the weakly-labeled
+target domain. We have also validated WSDA-OR on the BIOVID and Fatigue
+(private) datasets for sequence-level estimation. Experimental results indicate
+that our approach can provide a significant improvement over state-of-the-art
+models, allowing it to achieve greater localization accuracy.
+
+
+
+ comment: due to multiple submission +
+
+
+
+
+ + ♻ ☆ ImageBind-LLM: Multi-modality Instruction Tuning + + +
+ We present ImageBind-LLM, a multi-modality instruction tuning method of large +language models (LLMs) via ImageBind. Existing works mainly focus on language +and image instruction tuning, different from which, our ImageBind-LLM can +respond to multi-modality conditions, including audio, 3D point clouds, video, +and their embedding-space arithmetic by only image-text alignment training. +During training, we adopt a learnable bind network to align the embedding space +between LLaMA and ImageBind's image encoder. Then, the image features +transformed by the bind network are added to word tokens of all layers in +LLaMA, which progressively injects visual instructions via an attention-free +and zero-initialized gating mechanism. Aided by the joint embedding of +ImageBind, the simple image-text training enables our model to exhibit superior +multi-modality instruction-following capabilities. During inference, the +multi-modality inputs are fed into the corresponding ImageBind encoders, and +processed by a proposed visual cache model for further cross-modal embedding +enhancement. The training-free cache model retrieves from three million image +features extracted by ImageBind, which effectively mitigates the +training-inference modality discrepancy. Notably, with our approach, +ImageBind-LLM can respond to instructions of diverse modalities and demonstrate +significant language generation quality. Code is released at +https://github.com/OpenGVLab/LLaMA-Adapter. + +
+
+ comment: Code is available at https://github.com/OpenGVLab/LLaMA-Adapter +
+
+
+
+
+ + ♻ ☆ GTNet:Guided Transformer Network for Detecting Human-Object Interactions SP + + +
+ The human-object interaction (HOI) detection task refers to localizing +humans, localizing objects, and predicting the interactions between each +human-object pair. HOI is considered one of the fundamental steps in truly +understanding complex visual scenes. For detecting HOI, it is important to +utilize relative spatial configurations and object semantics to find salient +spatial regions of images that highlight the interactions between human object +pairs. This issue is addressed by the novel self-attention based guided +transformer network, GTNet. GTNet encodes this spatial contextual information +in human and object visual features via self-attention while achieving state of +the art results on both the V-COCO and HICO-DET datasets. Code will be made +available online. + +
+
+ comment: accepted for presentation in Pattern Recognition and Tracking XXXIV + at SPIE commerce+ defence Program +
+
+
+
+
+ + ♻ ☆ Weakly Supervised Learning for Facial Behavior Analysis : A Review + + +
+ In recent years, there has been a shift in facial behavior analysis from
+laboratory-controlled conditions to challenging in-the-wild conditions due to
+the superior performance of deep learning based approaches in many real-world
+applications. However, the performance of deep learning approaches relies on
+the amount of training data. One of the major problems with data acquisition is
+the requirement of annotations for a large amount of training data. The
+labeling process for huge training data demands a lot of human support with
+strong domain expertise in facial expressions or action units, which is
+difficult to obtain in real-time environments. Moreover, the labeling process
+is highly vulnerable to the ambiguity of expressions or action units,
+especially for intensities, due to the bias induced by the domain experts.
+Therefore, there is an imperative need to address the problem of facial
+behavior analysis with weak annotations. In this paper, we provide a
+comprehensive review of weakly supervised learning (WSL) approaches for facial
+behavior analysis with both categorical as well as dimensional labels, along
+with the challenges and potential research directions associated with it.
+First, we introduce various types of weak annotations in the context of facial
+behavior analysis and the corresponding challenges associated with them. We
+then systematically review the existing state-of-the-art approaches and provide
+a taxonomy of these approaches along with their insights and limitations. In
+addition, widely used datasets in the reviewed literature and the performance
+of these approaches along with evaluation principles are summarized. Finally,
+we discuss the remaining challenges and opportunities along with the potential
+research directions in order to apply facial behavior analysis with weak labels
+in real-life situations.
+
+
+
+
+
+
+ + ♻ ☆ Multimodal Transformer for Material Segmentation + + +
+ Leveraging information across diverse modalities is known to enhance +performance on multimodal segmentation tasks. However, effectively fusing +information from different modalities remains challenging due to the unique +characteristics of each modality. In this paper, we propose a novel fusion +strategy that can effectively fuse information from different combinations of +four different modalities: RGB, Angle of Linear Polarization (AoLP), Degree of +Linear Polarization (DoLP) and Near-Infrared (NIR). We also propose a new model +named Multi-Modal Segmentation Transformer (MMSFormer) that incorporates the +proposed fusion strategy to perform multimodal material segmentation. MMSFormer +achieves 52.05% mIoU outperforming the current state-of-the-art on Multimodal +Material Segmentation (MCubeS) dataset. For instance, our method provides +significant improvement in detecting gravel (+10.4%) and human (+9.1%) classes. +Ablation studies show that different modules in the fusion block are crucial +for overall model performance. Furthermore, our ablation studies also highlight +the capacity of different input modalities to improve performance in the +identification of different types of materials. The code and pretrained models +will be made available at https://github.com/csiplab/MMSFormer. + +
+
+ comment: 9 pages, 3 figures +
+
+
+
+
+ + ♻ ☆ Perpetual Humanoid Control for Real-time Simulated Avatars ICCV 2023 + + +
+ We present a physics-based humanoid controller that achieves high-fidelity +motion imitation and fault-tolerant behavior in the presence of noisy input +(e.g. pose estimates from video or generated from language) and unexpected +falls. Our controller scales up to learning ten thousand motion clips without +using any external stabilizing forces and learns to naturally recover from +fail-state. Given reference motion, our controller can perpetually control +simulated avatars without requiring resets. At its core, we propose the +progressive multiplicative control policy (PMCP), which dynamically allocates +new network capacity to learn harder and harder motion sequences. PMCP allows +efficient scaling for learning from large-scale motion databases and adding new +tasks, such as fail-state recovery, without catastrophic forgetting. We +demonstrate the effectiveness of our controller by using it to imitate noisy +poses from video-based pose estimators and language-based motion generators in +a live and real-time multi-person avatar use case. + +
+
+ comment: ICCV 2023. Project page: https://zhengyiluo.github.io/PHC/ +
+
+
+
+
+ + ♻ ☆ PatchRefineNet: Improving Binary Segmentation by Incorporating Signals + from Optimal Patch-wise Binarization + + +
+ The purpose of binary segmentation models is to determine which pixels belong +to an object of interest (e.g., which pixels in an image are part of roads). +The models assign a logit score (i.e., probability) to each pixel and these are +converted into predictions by thresholding (i.e., each pixel with logit score +$\geq \tau$ is predicted to be part of a road). However, a common phenomenon in +current and former state-of-the-art segmentation models is spatial bias -- in +some patches, the logit scores are consistently biased upwards and in others +they are consistently biased downwards. These biases cause false positives and +false negatives in the final predictions. In this paper, we propose +PatchRefineNet (PRN), a small network that sits on top of a base segmentation +model and learns to correct its patch-specific biases. Across a wide variety of +base models, PRN consistently helps them improve mIoU by 2-3\%. One of the key +ideas behind PRN is the addition of a novel supervision signal during training. +Given the logit scores produced by the base segmentation model, each pixel is +given a pseudo-label that is obtained by optimally thresholding the logit +scores in each image patch. Incorporating these pseudo-labels into the loss +function of PRN helps correct systematic biases and reduce false +positives/negatives. Although we mainly focus on binary segmentation, we also +show how PRN can be extended to saliency detection and few-shot segmentation. +We also discuss how the ideas can be extended to multiclass segmentation. + +
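+ A small sketch of the pseudo-label construction described above: for each
+ image patch, pick the threshold on the base model's scores that maximizes IoU
+ against the ground truth, then binarize that patch with its own threshold. The
+ patch size and candidate thresholds are arbitrary choices for the example.
+
+ import numpy as np
+
+ def patchwise_pseudo_labels(scores, gt, patch=64, taus=np.linspace(0.1, 0.9, 9)):
+     # scores: (H, W) probability map from the base model; gt: (H, W) boolean mask.
+     out = np.zeros_like(gt, dtype=np.uint8)
+     H, W = gt.shape
+     for y in range(0, H, patch):
+         for x in range(0, W, patch):
+             s, g = scores[y:y+patch, x:x+patch], gt[y:y+patch, x:x+patch]
+             best_tau, best_iou = 0.5, -1.0
+             for t in taus:
+                 p = s >= t
+                 inter, union = np.logical_and(p, g).sum(), np.logical_or(p, g).sum()
+                 iou = inter / union if union else 1.0
+                 if iou > best_iou:
+                     best_tau, best_iou = t, iou
+             out[y:y+patch, x:x+patch] = (s >= best_tau).astype(np.uint8)
+     return out
+
+ rng = np.random.default_rng(0)
+ print(patchwise_pseudo_labels(rng.random((128, 128)), rng.random((128, 128)) > 0.5).shape)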
+
+ comment: 16 pages, 12 figures, 7 tables (Added supplementary material) +
+
+
+
+
+
+
+
+ + Information Retrieval 8 + +
+
+
+ + ☆ D2WFP: A Novel Protocol for Forensically Identifying, Extracting, and + Analysing Deep and Dark Web Browsing Activities + + +
+ The use of the unindexed web, commonly known as the deep web and dark web, to
+commit or facilitate criminal activity has drastically increased over the past
+decade. The dark web is an infamously dangerous place where all kinds of
+criminal activities take place [1-2]. Despite advances in web forensics
+techniques, tools, and methodologies, few studies have formally tackled deep
+and dark web forensics and the technical differences in terms of investigative
+techniques and artefact identification and extraction. This research proposes a
+novel and comprehensive protocol to guide and assist digital forensics
+professionals in investigating crimes committed on or via the deep and dark
+web. The protocol, named D2WFP, establishes a new sequential approach for
+performing investigative activities by observing the order of volatility and
+implementing a systemic approach covering all browsing-related hives and
+artefacts, which ultimately improves accuracy and effectiveness. Rigorous
+quantitative and qualitative research has been conducted by assessing D2WFP
+following a scientifically sound and comprehensive process in different
+scenarios, and the obtained results show an apparent increase in the number of
+artefacts recovered when adopting D2WFP, which outperforms current industry and
+open-source browsing forensics tools. The second contribution of D2WFP is the
+robust formulation of artefact correlation and cross-validation within D2WFP,
+which enables digital forensics professionals to better document and structure
+their analysis of host-based deep and dark web browsing artefacts.
+
+
+
+
+
+
+ + ☆ Re-formalization of Individual Fairness + + +
+ The notion of individual fairness is a formalization of an ethical principle,
+"Treating like cases alike," which has been argued for by, among others,
+Aristotle. In a fairness-aware machine learning context, Dwork et al. first
+formalized the notion. In their formalization, a similar pair of data in an
+unfair space should be mapped to similar positions in a fair space. We propose
+to re-formalize individual fairness as statistical independence conditioned on
+individuals. This re-formalization has the following merits. First, our
+formalization is compatible with that of Dwork et al. Second, our formalization
+makes it possible to combine individual fairness with the fairness notions of
+equalized odds and sufficiency, as well as statistical parity. Third, although
+their formalization implicitly assumes a pre-processing approach for making
+fair predictions, ours is applicable to in-processing and post-processing
+approaches as well.
+
+
+
+ comment: Published at the 6th FAccTRec Workshop: Responsible Recommendation +
+
+
+
+
+ + ☆ Towards Content-based Pixel Retrieval in Revisited Oxford and Paris + + +
+ This paper introduces the first two pixel retrieval benchmarks. Pixel +retrieval is segmented instance retrieval. Like semantic segmentation extends +classification to the pixel level, pixel retrieval is an extension of image +retrieval and offers information about which pixels are related to the query +object. In addition to retrieving images for the given query, it helps users +quickly identify the query object in true positive images and exclude false +positive images by denoting the correlated pixels. Our user study results show +pixel-level annotation can significantly improve the user experience. + Compared with semantic and instance segmentation, pixel retrieval requires a +fine-grained recognition capability for variable-granularity targets. To this +end, we propose pixel retrieval benchmarks named PROxford and PRParis, which +are based on the widely used image retrieval datasets, ROxford and RParis. +Three professional annotators label 5,942 images with two rounds of +double-checking and refinement. Furthermore, we conduct extensive experiments +and analysis on the SOTA methods in image search, image matching, detection, +segmentation, and dense matching using our pixel retrieval benchmarks. Results +show that the pixel retrieval task is challenging to these approaches and +distinctive from existing problems, suggesting that further research can +advance the content-based pixel-retrieval and thus user search experience. The +datasets can be downloaded from +\href{https://github.com/anguoyuan/Pixel_retrieval-Segmented_instance_retrieval}{this +link}. + +
+
+
+
+
+ + ☆ Formalizing Multimedia Recommendation through Multimodal Deep Learning + + +
+ Recommender systems (RSs) offer personalized navigation experiences on online +platforms, but recommendation remains a challenging task, particularly in +specific scenarios and domains. Multimodality can help tap into richer +information sources and construct more refined user/item profiles for +recommendations. However, existing literature lacks a shared and universal +schema for modeling and solving the recommendation problem through the lens of +multimodality. This work aims to formalize a general multimodal schema for +multimedia recommendation. It provides a comprehensive literature review of +multimodal approaches for multimedia recommendation from the last eight years, +outlines the theoretical foundations of a multimodal pipeline, and demonstrates +its rationale by applying it to selected state-of-the-art approaches. The work +also conducts a benchmarking analysis of recent algorithms for multimedia +recommendation within Elliot, a rigorous framework for evaluating recommender +systems. The main aim is to provide guidelines for designing and implementing +the next generation of multimodal approaches in multimedia recommendation. + +
+
+
+
+
+ + ☆ Generating Natural Language Queries for More Effective Systematic Review + Screening Prioritisation SIGIR + + +
+ Screening prioritisation in medical systematic reviews aims to rank the set of
+documents retrieved by complex Boolean queries. The goal is to prioritise the
+most important documents so that subsequent review steps can be carried out
+more efficiently and effectively. The current state of the art uses the final
+title of the review to rank documents using BERT-based neural rankers. However,
+the final title is only formulated at the end of the review process, which
+makes this approach impractical as it relies on ex post facto information. At
+the time of screening, only a rough working title is available, with which the
+BERT-based ranker performs significantly worse than with the final title. In
+this paper, we explore alternative sources of queries for screening
+prioritisation, such as the Boolean query used to retrieve the set of documents
+to be screened, and queries generated by instruction-based generative large
+language models such as ChatGPT and Alpaca. Our best approach is not only
+practical based on the information available at screening time, but also
+similar in effectiveness to using the final title.
+
+
+
+ comment: Preprints for Accepted paper in SIGIR-AP-2023 +
+
+
+
+
+ + ☆ Generalized Rainbow Differential Privacy + + +
+ We study a new framework for designing differentially private (DP) mechanisms +via randomized graph colorings, called rainbow differential privacy. In this +framework, datasets are nodes in a graph, and two neighboring datasets are +connected by an edge. Each dataset in the graph has a preferential ordering for +the possible outputs of the mechanism, and these orderings are called rainbows. +Different rainbows partition the graph of connected datasets into different +regions. We show that if a DP mechanism at the boundary of such regions is +fixed and it behaves identically for all same-rainbow boundary datasets, then a +unique optimal $(\epsilon,\delta)$-DP mechanism exists (as long as the boundary +condition is valid) and can be expressed in closed-form. Our proof technique is +based on an interesting relationship between dominance ordering and DP, which +applies to any finite number of colors and for $(\epsilon,\delta)$-DP, +improving upon previous results that only apply to at most three colors and for +$\epsilon$-DP. We justify the homogeneous boundary condition assumption by +giving an example with non-homogeneous boundary condition, for which there +exists no optimal DP mechanism. + +
+
+ comment: arXiv admin note: text overlap with arXiv:2202.03974 +
+
+
+
+
+ + ☆ Stringesthesia: Dynamically Shifting Musical Agency Between Audience and + Performer Based on Trust in an Interactive and Improvised Performance + + +
+ This paper introduces Stringesthesia, an interactive and improvised
+performance paradigm. Stringesthesia uses real-time neuroimaging to connect
+performers and audiences, enabling direct access to the performer's mental
+state and determining audience participation during the performance. Functional
+near-infrared spectroscopy, or fNIRS, a noninvasive neuroimaging tool, was used
+to assess metabolic activity of brain areas collectively associated with a
+metric we call trust. A visualization representing the real-time measurement of
+the performer's level of trust was projected behind the performer and used to
+dynamically restrict or promote audience participation. Throughout the paper we
+discuss prior work that heavily influenced our design, conceptual and
+methodological issues with using fNIRS technology, system architecture, and
+feedback from the audience and performer.
+
+
+
+
+
+
+ + ♻ ☆ Financial News Analytics Using Fine-Tuned Llama 2 GPT Model + + +
+ This paper considers the possibility of fine-tuning the Llama 2 GPT large
+language model (LLM) for the multitask analysis of financial news. For
+fine-tuning, a PEFT/LoRA based approach was used. In the study, the model was
+fine-tuned for the following tasks: analysing a text from financial market
+perspectives, highlighting the main points of a text, summarizing a text, and
+extracting named entities with appropriate sentiments. The obtained results
+show that the fine-tuned Llama 2 model can perform multitask financial news
+analysis with a specified response structure: part of the response can be
+structured text, while another part can be in JSON format for further
+processing. Extracted sentiments for named entities can be considered as
+predictive features in supervised machine learning models with quantitative
+target variables.
+
+</p>
+
+
+
+
+
+
+
+ + Machine Learning 139 + +
+
+
+ + ☆ Robot Parkour Learning + + +
+ Parkour is a grand challenge for legged locomotion that requires robots to +overcome various obstacles rapidly in complex environments. Existing methods +can generate either diverse but blind locomotion skills or vision-based but +specialized skills by using reference animal data or complex rewards. However, +autonomous parkour requires robots to learn generalizable skills that are both +vision-based and diverse to perceive and react to various scenarios. In this +work, we propose a system for learning a single end-to-end vision-based parkour +policy of diverse parkour skills using a simple reward without any reference +motion data. We develop a reinforcement learning method inspired by direct +collocation to generate parkour skills, including climbing over high obstacles, +leaping over large gaps, crawling beneath low barriers, squeezing through thin +slits, and running. We distill these skills into a single vision-based parkour +policy and transfer it to a quadrupedal robot using its egocentric depth +camera. We demonstrate that our system can empower two different low-cost +robots to autonomously select and execute appropriate parkour skills to +traverse challenging real-world environments. + +
+
+ comment: CoRL 2023 (Oral). Project website at https://robot-parkour.github.io +
+
+
+
+
+ + ☆ Hypothesis Search: Inductive Reasoning with Language Models + + +
+ Inductive reasoning is a core problem-solving capacity: humans can identify
+underlying principles from a few examples, which can then be robustly
+generalized to novel scenarios. Recent work has evaluated large language models
+(LLMs) on inductive reasoning tasks by directly prompting them, which yields
+"in-context learning." This can work well for straightforward inductive tasks,
+but performs very poorly on more complex tasks such as the Abstraction and
+Reasoning Corpus (ARC). In this work, we propose to improve the inductive
+reasoning ability of LLMs by generating explicit hypotheses at multiple levels
+of abstraction: we prompt the LLM to propose multiple abstract hypotheses about
+the problem, in natural language, and then implement the natural language
+hypotheses as concrete Python programs. These programs can be directly verified
+by running them on the observed examples and generalized to novel inputs.
+Because of the prohibitive cost of generation with state-of-the-art LLMs, we
+consider a middle step to filter the set of hypotheses that will be implemented
+into programs: we either ask the LLM to summarize the hypotheses into a smaller
+set, or ask human annotators to select a subset of them. We verify our
+pipeline's effectiveness on the ARC visual inductive reasoning benchmark, its
+variant 1D-ARC, and the string transformation dataset SyGuS. On a random
+40-problem subset of ARC, our automated pipeline using LLM summaries achieves
+27.5% accuracy, significantly outperforming the direct prompting baseline
+(accuracy of 12.5%). With the minimal human input of selecting from
+LLM-generated candidates, the performance is boosted to 37.5%. (And we argue
+this is a lower bound on the performance of our approach without filtering.)
+Our ablation studies show that abstract hypothesis generation and concrete
+program representations are both beneficial for LLMs to perform inductive
+reasoning tasks.
+
+</p>
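+ To make the verification step concrete, a minimal, hypothetical sketch of running
+LLM-proposed programs against the observed examples is shown below (the candidate
+programs, the transform function name, and the toy task are illustrative assumptions,
+not the paper's prompts or pipeline):
+def verify_program(program_src, examples):
+    """Run a candidate program (generated from a natural-language hypothesis)
+    against observed input/output examples; keep it only if every example passes."""
+    namespace = {}
+    exec(program_src, namespace)          # hypothetical: each program defines `transform`
+    f = namespace["transform"]
+    return all(f(x) == y for x, y in examples)
+
+# Toy string-transformation task with two assumed LLM-proposed candidates
+examples = [("abc", "ABC"), ("hello", "HELLO")]
+candidates = [
+    "def transform(s):\n    return s[::-1]",      # hypothesis: reverse the string
+    "def transform(s):\n    return s.upper()",    # hypothesis: uppercase the string
+]
+survivors = [c for c in candidates if verify_program(c, examples)]
+print(len(survivors))   # 1 -- only the uppercasing hypothesis fits the examples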
+
+
+
+
+ + ☆ On the quality of randomized approximations of Tukey's depth + + +
+ Tukey's depth (or halfspace depth) is a widely used measure of centrality for +multivariate data. However, exact computation of Tukey's depth is known to be a +hard problem in high dimensions. As a remedy, randomized approximations of +Tukey's depth have been proposed. In this paper we explore when such randomized +algorithms return a good approximation of Tukey's depth. We study the case when +the data are sampled from a log-concave isotropic distribution. We prove that, +if one requires that the algorithm runs in polynomial time in the dimension, +the randomized algorithm correctly approximates the maximal depth $1/2$ and +depths close to zero. On the other hand, for any point of intermediate depth, +any good approximation requires exponential complexity. + +
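+ As a rough illustration of the kind of randomized approximation analysed here, one
+common scheme evaluates the one-dimensional depth along randomly sampled directions and
+takes the minimum; the sketch below is generic and not the paper's specific algorithm
+(the Gaussian direction sampling and the number of directions are assumptions):
+import numpy as np
+
+def approx_tukey_depth(point, data, n_directions=1000, rng=None):
+    """Random-direction approximation of Tukey's halfspace depth: for each random
+    unit vector u, measure the fraction of data on the 'point side' of the
+    hyperplane through `point`, then take the minimum over sampled directions."""
+    rng = rng or np.random.default_rng(0)
+    d = data.shape[1]
+    depth = 1.0
+    for _ in range(n_directions):
+        u = rng.normal(size=d)
+        u /= np.linalg.norm(u)
+        proj = (data - point) @ u
+        depth = min(depth, np.mean(proj >= 0))
+    return depth
+
+data = np.random.default_rng(1).normal(size=(2000, 5))
+print(approx_tukey_depth(np.zeros(5), data))      # close to 1/2 at the centre
+print(approx_tukey_depth(np.full(5, 4.0), data))  # close to 0 far from the data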
+
+
+
+
+ + ☆ Dynamic Handover: Throw and Catch with Bimanual Hands + + +
+ Humans throw and catch objects all the time. However, such a seemingly common
+skill introduces many challenges for robots: they need to operate such dynamic
+actions at high speed, collaborate precisely, and interact with diverse
+objects. In this paper, we design a system with two multi-finger hands attached
+to robot arms to solve this problem. We train our system using Multi-Agent
+Reinforcement Learning in simulation and perform Sim2Real transfer to deploy on
+the real robots. To overcome the Sim2Real gap, we provide multiple novel
+algorithm designs, including learning a trajectory prediction model for the
+object. Such a model helps the robot catcher maintain a real-time estimate of
+where the object is heading and react accordingly. We conduct our experiments
+with multiple objects in the real-world system, and show significant
+improvements over multiple baselines. Our project page is available at
+\url{https://binghao-huang.github.io/dynamic_handover/}.
+
+</p>
+
+ comment: Accepted at CoRL 2023. + https://binghao-huang.github.io/dynamic_handover/ +
+
+
+
+
+ + ☆ A Novel Supervised Deep Learning Solution to Detect Distributed Denial + of Service (DDoS) attacks on Edge Systems using Convolutional Neural Networks + (CNN) + + +
+ Cybersecurity attacks are becoming increasingly sophisticated and pose a
+growing threat to individuals and to the private and public sectors.
+Distributed Denial of Service attacks are one of the most harmful of these
+threats in today's internet, disrupting the availability of essential services.
+This project presents a novel deep learning-based approach for detecting DDoS
+attacks in network traffic using the industry-recognized DDoS evaluation
+dataset from the University of New Brunswick, which contains packet captures
+from real-time DDoS attacks, creating a broader and more applicable model for
+the real world. The algorithm employed in this study exploits the properties of
+Convolutional Neural Networks (CNN) and common deep learning algorithms to
+build a novel mitigation technique that classifies benign and malicious
+traffic. The proposed model preprocesses the data by extracting packet flows
+and normalizing them to a fixed length, which is fed into a custom architecture
+containing layers regulating node dropout, normalization, and a sigmoid
+activation function to output a binary classification. This allows the model to
+process the flows effectively and look for the nodes that contribute to DDoS
+attacks while dropping the "noise" or distractors. The results of this study
+demonstrate the effectiveness of the proposed algorithm in detecting DDoS
+attacks, achieving an accuracy of 0.9883 on 2000 unseen flows in network
+traffic, while being scalable to any network environment.
+
+</p>
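+ A minimal sketch of such a fixed-length flow classifier in PyTorch follows; the layer
+sizes, flow length, and dropout rate are assumptions for illustration, not the paper's
+exact architecture:
+import torch
+import torch.nn as nn
+
+class FlowCNN(nn.Module):
+    """Binary benign/DDoS classifier over fixed-length packet-flow features."""
+    def __init__(self, flow_len=100, n_features=1):
+        super().__init__()
+        self.net = nn.Sequential(
+            nn.Conv1d(n_features, 32, kernel_size=5, padding=2),
+            nn.BatchNorm1d(32),          # normalisation layer
+            nn.ReLU(),
+            nn.MaxPool1d(2),
+            nn.Conv1d(32, 64, kernel_size=5, padding=2),
+            nn.ReLU(),
+            nn.AdaptiveAvgPool1d(1),
+            nn.Flatten(),
+            nn.Dropout(0.3),             # node dropout to suppress distractor features
+            nn.Linear(64, 1),
+            nn.Sigmoid(),                # binary classification output
+        )
+
+    def forward(self, x):
+        return self.net(x)
+
+model = FlowCNN()
+flows = torch.randn(8, 1, 100)   # 8 normalised flows of fixed length 100
+scores = model(flows)            # probability of each flow being malicious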
+
+
+
+
+ + ☆ Boundary Peeling: Outlier Detection Method Using One-Class Peeling + + +
+ Unsupervised outlier detection constitutes a crucial phase within data +analysis and remains a dynamic realm of research. A good outlier detection +algorithm should be computationally efficient, robust to tuning parameter +selection, and perform consistently well across diverse underlying data +distributions. We introduce One-Class Boundary Peeling, an unsupervised outlier +detection algorithm. One-class Boundary Peeling uses the average signed +distance from iteratively-peeled, flexible boundaries generated by one-class +support vector machines. One-class Boundary Peeling has robust hyperparameter +settings and, for increased flexibility, can be cast as an ensemble method. In +synthetic data simulations One-Class Boundary Peeling outperforms all state of +the art methods when no outliers are present while maintaining comparable or +superior performance in the presence of outliers, as compared to benchmark +methods. One-Class Boundary Peeling performs competitively in terms of correct +classification, AUC, and processing time using common benchmark data sets. + +
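+ A simplified sketch in the spirit of the described procedure, built on scikit-learn's
+one-class SVM (the peel fraction, number of peels, and kernel settings are assumptions,
+not the authors' defaults):
+import numpy as np
+from sklearn.svm import OneClassSVM
+
+def boundary_peeling_scores(X, n_peels=5, peel_frac=0.05):
+    """Average signed distance to iteratively peeled one-class SVM boundaries.
+    Lower (more negative) averages indicate likely outliers."""
+    remaining = np.arange(len(X))
+    scores = np.zeros(len(X))
+    for _ in range(n_peels):
+        svm = OneClassSVM(kernel="rbf", gamma="scale", nu=0.1).fit(X[remaining])
+        scores += svm.decision_function(X)           # signed distance for all points
+        d = svm.decision_function(X[remaining])
+        keep = d > np.quantile(d, peel_frac)         # peel off the outermost points
+        remaining = remaining[keep]
+    return scores / n_peels
+
+X = np.vstack([np.random.randn(200, 2), np.random.randn(5, 2) + 6.0])
+scores = boundary_peeling_scores(X)
+outliers = np.argsort(scores)[:5]                    # indices of the most outlying points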
+
+
+
+
+ + ☆ Privacy Side Channels in Machine Learning Systems + + +
+ Most current approaches for protecting privacy in machine learning (ML) +assume that models exist in a vacuum, when in reality, ML models are part of +larger systems that include components for training data filtering, output +monitoring, and more. In this work, we introduce privacy side channels: attacks +that exploit these system-level components to extract private information at +far higher rates than is otherwise possible for standalone models. We propose +four categories of side channels that span the entire ML lifecycle (training +data filtering, input preprocessing, output post-processing, and query +filtering) and allow for either enhanced membership inference attacks or even +novel threats such as extracting users' test queries. For example, we show that +deduplicating training data before applying differentially-private training +creates a side-channel that completely invalidates any provable privacy +guarantees. Moreover, we show that systems which block language models from +regenerating training data can be exploited to allow exact reconstruction of +private keys contained in the training set -- even if the model did not +memorize these keys. Taken together, our results demonstrate the need for a +holistic, end-to-end privacy analysis of machine learning. + +
+
+
+
+
+ + ☆ Memory Injections: Correcting Multi-Hop Reasoning Failures during + Inference in Transformer-Based Language Models + + +
+ Answering multi-hop reasoning questions requires retrieving and synthesizing +information from diverse sources. Large Language Models (LLMs) struggle to +perform such reasoning consistently. Here we propose an approach to pinpoint +and rectify multi-hop reasoning failures through targeted memory injections on +LLM attention heads. First, we analyze the per-layer activations of GPT-2 +models in response to single and multi-hop prompts. We then propose a mechanism +that allows users to inject pertinent prompt-specific information, which we +refer to as "memories," at critical LLM locations during inference. By thus +enabling the LLM to incorporate additional relevant information during +inference, we enhance the quality of multi-hop prompt completions. We show +empirically that a simple, efficient, and targeted memory injection into a key +attention layer can often increase the probability of the desired next token in +multi-hop tasks, by up to 424%. + +
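+ A hypothetical sketch of injecting a "memory" during inference via a forward hook on a
+GPT-2 block is shown below; the injection layer, scaling factor, and the use of a mean
+token embedding as the memory vector are assumptions for illustration, not the paper's
+exact mechanism:
+import torch
+from transformers import GPT2LMHeadModel, GPT2Tokenizer
+
+tok = GPT2Tokenizer.from_pretrained("gpt2")
+model = GPT2LMHeadModel.from_pretrained("gpt2").eval()
+
+def make_injection_hook(memory_text, scale=4.0):
+    """Add the embedding of a relevant 'memory' phrase onto the hidden states of
+    one block during inference (a simplified take on targeted memory injection)."""
+    ids = tok(memory_text, return_tensors="pt").input_ids
+    memory = model.transformer.wte(ids).mean(dim=1)        # (1, hidden_size)
+    def hook(module, inputs, output):
+        hidden = output[0] + scale * memory                # broadcast over positions
+        return (hidden,) + output[1:]
+    return hook
+
+layer = 6                                                  # assumed injection site
+handle = model.transformer.h[layer].register_forward_hook(make_injection_hook("Paris"))
+enc = tok("The capital of France is", return_tensors="pt")
+out = model.generate(**enc, max_new_tokens=5)
+handle.remove()
+print(tok.decode(out[0]))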
+
+
+
+
+ + ☆ Quantitative Analysis of Forecasting Models:In the Aspect of Online + Political Bias ICML + + +
+ Understanding and mitigating political bias in online social media platforms
+are crucial tasks to combat misinformation and echo chamber effects. However,
+characterizing political bias temporally using computational methods presents
+challenges due to the high frequency of noise in social media datasets. While
+existing research has explored various approaches to political bias
+characterization, the ability to forecast political bias and anticipate how
+political conversations might evolve in the near future has not been
+extensively studied. In this paper, we propose a heuristic approach to classify
+social media posts into five distinct political leaning categories. Since there
+is a lack of prior work on forecasting political bias, we conduct an in-depth
+analysis of existing baseline models to identify which model is best suited to
+forecasting political leaning time series. Our approach involves applying
+existing time series forecasting models to two social media datasets with
+different political ideologies, specifically Twitter and Gab. Through our
+experiments and analyses, we seek to shed light on the challenges and
+opportunities in forecasting political bias in social media platforms.
+Ultimately, our work aims to pave the way for developing more effective
+strategies to mitigate the negative impact of political bias in the digital
+realm.
+
+</p>
+
+ comment: This is a preprint version of a paper that is accepted to be + presented as a poster at the ICMLA conference on December 15-17 2023 +
+
+
+
+
+ + ☆ Mind the Uncertainty: Risk-Aware and Actively Exploring Model-Based + Reinforcement Learning + + +
+ We introduce a simple but effective method for managing risk in model-based
+reinforcement learning with trajectory sampling. The method combines
+probabilistic safety constraints with a balance between optimism in the face of
+epistemic uncertainty and pessimism in the face of aleatoric uncertainty, as
+estimated by an ensemble of stochastic neural networks. Various experiments
+indicate that the separation of uncertainties is essential to performing well
+with data-driven MPC approaches in uncertain and safety-critical control
+environments.
+
+</p>
+
+
+
+
+ + ☆ Anisotropic Diffusion Stencils: From Simple Derivations over Stability + Estimates to ResNet Implementations + + +
+ Anisotropic diffusion processes with a diffusion tensor are important in +image analysis, physics, and engineering. However, their numerical +approximation has a strong impact on dissipative artefacts and deviations from +rotation invariance. In this work, we study a large family of finite difference +discretisations on a 3 x 3 stencil. We derive it by splitting 2-D anisotropic +diffusion into four 1-D diffusions. The resulting stencil class involves one +free parameter and covers a wide range of existing discretisations. It +comprises the full stencil family of Weickert et al. (2013) and shows that +their two parameters contain redundancy. Furthermore, we establish a bound on +the spectral norm of the matrix corresponding to the stencil. This gives time +step size limits that guarantee stability of an explicit scheme in the +Euclidean norm. Our directional splitting also allows a very natural +translation of the explicit scheme into ResNet blocks. Employing neural network +libraries enables simple and highly efficient parallel implementations on GPUs. + +
+
+
+
+
+ + ☆ ITI-GEN: Inclusive Text-to-Image Generation ICCV 2023 + + +
+ Text-to-image generative models often reflect the biases of the training +data, leading to unequal representations of underrepresented groups. This study +investigates inclusive text-to-image generative models that generate images +based on human-written prompts and ensure the resulting images are uniformly +distributed across attributes of interest. Unfortunately, directly expressing +the desired attributes in the prompt often leads to sub-optimal results due to +linguistic ambiguity or model misrepresentation. Hence, this paper proposes a +drastically different approach that adheres to the maxim that "a picture is +worth a thousand words". We show that, for some attributes, images can +represent concepts more expressively than text. For instance, categories of +skin tones are typically hard to specify by text but can be easily represented +by example images. Building upon these insights, we propose a novel approach, +ITI-GEN, that leverages readily available reference images for Inclusive +Text-to-Image GENeration. The key idea is learning a set of prompt embeddings +to generate images that can effectively represent all desired attribute +categories. More importantly, ITI-GEN requires no model fine-tuning, making it +computationally efficient to augment existing text-to-image models. Extensive +experiments demonstrate that ITI-GEN largely improves over state-of-the-art +models to generate inclusive images from a prompt. Project page: +https://czhang0528.github.io/iti-gen. + +
+
+ comment: Accepted to ICCV 2023 (Oral Presentation) +
+
+
+
+
+ + ☆ Distance-Aware eXplanation Based Learning ICTAI 2023 + + +
+ eXplanation Based Learning (XBL) is an interactive learning approach that +provides a transparent method of training deep learning models by interacting +with their explanations. XBL augments loss functions to penalize a model based +on deviation of its explanations from user annotation of image features. The +literature on XBL mostly depends on the intersection of visual model +explanations and image feature annotations. We present a method to add a +distance-aware explanation loss to categorical losses that trains a learner to +focus on important regions of a training dataset. Distance is an appropriate +approach for calculating explanation loss since visual model explanations such +as Gradient-weighted Class Activation Mapping (Grad-CAMs) are not strictly +bounded as annotations and their intersections may not provide complete +information on the deviation of a model's focus from relevant image regions. In +addition to assessing our model using existing metrics, we propose an +interpretability metric for evaluating visual feature-attribution based model +explanations that is more informative of the model's performance than existing +metrics. We demonstrate performance of our proposed method on three image +classification tasks. + +
+
+ comment: Accepted at the 35th IEEE International Conference on Tools with + Artificial Intelligence, ICTAI 2023 +
+
+
+
+
+ + ☆ Advancing Federated Learning in 6G: A Trusted Architecture with + Graph-based Analysis + + +
+ Integrating native AI support into the network architecture is an essential +objective of 6G. Federated Learning (FL) emerges as a potential paradigm, +facilitating decentralized AI model training across a diverse range of devices +under the coordination of a central server. However, several challenges hinder +its wide application in the 6G context, such as malicious attacks and privacy +snooping on local model updates, and centralization pitfalls. This work +proposes a trusted architecture for supporting FL, which utilizes Distributed +Ledger Technology (DLT) and Graph Neural Network (GNN), including three key +features. First, a pre-processing layer employing homomorphic encryption is +incorporated to securely aggregate local models, preserving the privacy of +individual models. Second, given the distributed nature and graph structure +between clients and nodes in the pre-processing layer, GNN is leveraged to +identify abnormal local models, enhancing system security. Third, DLT is +utilized to decentralize the system by selecting one of the candidates to +perform the central server's functions. Additionally, DLT ensures reliable data +management by recording data exchanges in an immutable and transparent ledger. +The feasibility of the novel architecture is validated through simulations, +demonstrating improved performance in anomalous model detection and global +model accuracy compared to relevant baselines. + +
+
+
+
+
+ + ☆ Re-formalization of Individual Fairness + + +
+ The notion of individual fairness is a formalization of the ethical principle
+"Treating like cases alike," which has been advocated by philosophers such as
+Aristotle. In a fairness-aware machine learning context, Dwork et al. first
+formalized the notion. In their formalization, a similar pair of data points in
+an unfair space should be mapped to similar positions in a fair space. We
+propose to re-formalize individual fairness as statistical independence
+conditioned on individuals. This re-formalization has the following merits.
+First, our formalization is compatible with that of Dwork et al. Second, our
+formalization makes it possible to combine individual fairness with the
+fairness notions of equalized odds or sufficiency, as well as statistical
+parity. Third, while their formalization implicitly assumes a pre-processing
+approach for making fair predictions, our formalization is applicable to
+in-processing or post-processing approaches.
+
+</p>
+
+ comment: Published at the 6th FAccTRec Workshop: Responsible Recommendation +
+
+
+
+
+ + ☆ NExT-GPT: Any-to-Any Multimodal LLM + + +
+ While Multimodal Large Language Models (MM-LLMs) have recently made exciting
+strides, they mostly fall prey to the limitation of input-side-only multimodal
+understanding, without the ability to produce content in multiple modalities.
+As we humans always perceive the world and communicate with people through
+various modalities, developing any-to-any MM-LLMs capable of accepting and
+delivering content in any modality becomes essential to human-level AI. To fill
+the gap, we present an end-to-end general-purpose any-to-any MM-LLM system,
+NExT-GPT. We connect an LLM with multimodal adaptors and different diffusion
+decoders, enabling NExT-GPT to perceive inputs and generate outputs in
+arbitrary combinations of text, images, videos, and audio. By leveraging
+existing well-trained, highly performing encoders and decoders, NExT-GPT is
+tuned with only a small number of parameters (1%) in certain projection layers,
+which not only enables low-cost training but also facilitates convenient
+expansion to more potential modalities. Moreover, we introduce
+modality-switching instruction tuning (MosIT) and manually curate a
+high-quality dataset for MosIT, based on which NExT-GPT is empowered with
+complex cross-modal semantic understanding and content generation. Overall, our
+research showcases the promising possibility of building an AI agent capable of
+modeling universal modalities, paving the way for more human-like AI research
+in the community.
+
+</p>
+
+ comment: work in progress +
+
+
+
+
+ + ☆ Stream-based Active Learning by Exploiting Temporal Properties in + Perception with Temporal Predicted Loss + + +
+ Active learning (AL) reduces the amount of labeled data needed to train a
+machine learning model by intelligently choosing which instances to label.
+Classic pool-based AL requires all data to be present in a datacenter, which
+can be challenging with the increasing amounts of data needed in deep learning.
+However, AL on mobile devices and robots, like autonomous cars, can filter the
+data from perception sensor streams before it reaches the datacenter. In this
+work, we exploited the temporal properties of such image streams and proposed
+the novel temporal predicted loss (TPL) method. To evaluate the stream-based
+setting properly, we introduced the GTA V streets and the A2D2 streets datasets
+and made both publicly available. Our experiments showed that our approach
+significantly improves the diversity of the selection while being an
+uncertainty-based method. As pool-based approaches are more common in
+perception applications, we derived a concept for comparing pool-based and
+stream-based AL, where TPL outperformed state-of-the-art pool- and stream-based
+approaches for different models. TPL required 2.5 percentage points (pp) less
+data while being significantly faster than pool-based methods.
+
+</p>
+
+
+
+
+ + ☆ Optimize Weight Rounding via Signed Gradient Descent for the + Quantization of LLMs + + +
+ Large Language Models (LLMs) have proven their exceptional capabilities in +performing language-related tasks. However, their deployment poses significant +challenges due to their considerable memory and storage requirements. In +response to this issue, weight-only quantization, particularly 3 and 4-bit +weight-only quantization, has emerged as one of the most viable solutions. As +the number of bits decreases, the quantization grid broadens, thus emphasizing +the importance of up and down rounding. While previous studies have +demonstrated that fine-tuning up and down rounding with the addition of +perturbations can enhance accuracy in some scenarios, our study is driven by +the precise and limited boundary of these perturbations, where only the +threshold for altering the rounding value is of significance. Consequently, we +propose a concise and highly effective approach for optimizing the weight +rounding task. Our method, named SignRound, involves lightweight block-wise +tuning using signed gradient descent, enabling us to achieve outstanding +results within 400 steps. SignRound outperforms the established baseline of +rounding-to-nearest (RTN) and competes impressively against recent methods, +without introducing additional inference overhead. The source code will be +publicly available at https://github.com/intel/neural-compressor soon. + +
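+ A minimal sketch of tuning up/down rounding with signed gradient descent follows; the
+4-bit grid, straight-through estimator, learning rate, and calibration setup are
+assumptions made for illustration, and the released repository contains the actual
+method:
+import torch
+
+class RoundSTE(torch.autograd.Function):
+    """Rounding with a straight-through gradient so the perturbation stays trainable."""
+    @staticmethod
+    def forward(ctx, x):
+        return torch.round(x)
+    @staticmethod
+    def backward(ctx, grad):
+        return grad
+
+def quantize(weight, scale, v, n_bits=4):
+    qmax = 2 ** (n_bits - 1) - 1
+    q = torch.clamp(RoundSTE.apply(weight / scale + v), -qmax - 1, qmax)
+    return q * scale
+
+def signround_block(weight, calib_x, steps=400, lr=2.5e-3):
+    """Tune up/down rounding for one linear block via signed gradient descent."""
+    scale = weight.abs().max() / 7.0
+    v = torch.zeros_like(weight, requires_grad=True)   # rounding perturbation
+    target = calib_x @ weight.t()
+    for _ in range(steps):
+        loss = ((calib_x @ quantize(weight, scale, v).t() - target) ** 2).mean()
+        grad, = torch.autograd.grad(loss, v)
+        with torch.no_grad():
+            v -= lr * grad.sign()      # signed gradient step
+            v.clamp_(-0.5, 0.5)        # only the rounding threshold may move
+    return quantize(weight, scale, v).detach()
+
+w_q = signround_block(torch.randn(64, 128), torch.randn(32, 128))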
+
+
+
+
+ + ☆ Share Your Representation Only: Guaranteed Improvement of the + Privacy-Utility Tradeoff in Federated Learning ICLR 2023 + + +
+ Repeated parameter sharing in federated learning causes significant +information leakage about private data, thus defeating its main purpose: data +privacy. Mitigating the risk of this information leakage, using state of the +art differentially private algorithms, also does not come for free. Randomized +mechanisms can prevent convergence of models on learning even the useful +representation functions, especially if there is more disagreement between +local models on the classification functions (due to data heterogeneity). In +this paper, we consider a representation federated learning objective that +encourages various parties to collaboratively refine the consensus part of the +model, with differential privacy guarantees, while separately allowing +sufficient freedom for local personalization (without releasing it). We prove +that in the linear representation setting, while the objective is non-convex, +our proposed new algorithm \DPFEDREP\ converges to a ball centered around the +\emph{global optimal} solution at a linear rate, and the radius of the ball is +proportional to the reciprocal of the privacy budget. With this novel utility +analysis, we improve the SOTA utility-privacy trade-off for this problem by a +factor of $\sqrt{d}$, where $d$ is the input dimension. We empirically evaluate +our method with the image classification task on CIFAR10, CIFAR100, and EMNIST, +and observe a significant performance improvement over the prior work under the +same small privacy budget. The code can be found in this link: +https://github.com/shenzebang/CENTAUR-Privacy-Federated-Representation-Learning. + +
+
+ comment: ICLR 2023 revised +
+
+
+
+
+ + ☆ Learning Semantic Segmentation with Query Points Supervision on Aerial + Images ICCV 2023 + + +
+ Semantic segmentation is crucial in remote sensing, where high-resolution +satellite images are segmented into meaningful regions. Recent advancements in +deep learning have significantly improved satellite image segmentation. +However, most of these methods are typically trained in fully supervised +settings that require high-quality pixel-level annotations, which are expensive +and time-consuming to obtain. In this work, we present a weakly supervised +learning algorithm to train semantic segmentation algorithms that only rely on +query point annotations instead of full mask labels. Our proposed approach +performs accurate semantic segmentation and improves efficiency by +significantly reducing the cost and time required for manual annotation. +Specifically, we generate superpixels and extend the query point labels into +those superpixels that group similar meaningful semantics. Then, we train +semantic segmentation models, supervised with images partially labeled with the +superpixels pseudo-labels. We benchmark our weakly supervised training approach +on an aerial image dataset and different semantic segmentation architectures, +showing that we can reach competitive performance compared to fully supervised +training while reducing the annotation effort. + +
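+ A small sketch of extending query-point labels to superpixels with SLIC is shown below;
+the superpixel algorithm and its parameters are assumptions made for illustration, not
+necessarily those used in the paper:
+import numpy as np
+from skimage.segmentation import slic
+
+def expand_point_labels(image, points, n_segments=500):
+    """Spread sparse query-point labels to the superpixels that contain them.
+    `points` is a list of (row, col, class_id); unlabelled pixels stay at -1."""
+    segments = slic(image, n_segments=n_segments, compactness=10, start_label=0)
+    pseudo = np.full(segments.shape, -1, dtype=np.int32)
+    for r, c, cls in points:
+        pseudo[segments == segments[r, c]] = cls
+    return pseudo
+
+image = np.random.rand(256, 256, 3)            # stand-in for an aerial image tile
+points = [(40, 60, 1), (200, 180, 0)]          # two query-point annotations
+pseudo_labels = expand_point_labels(image, points)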
+
+ comment: Paper presented at the LXCV workshop at ICCV 2023 +
+
+
+
+
+ + ☆ Learning Objective-Specific Active Learning Strategies with Attentive + Neural Processes ECML 2023 + + +
+ Pool-based active learning (AL) is a promising technology for increasing the
+data-efficiency of machine learning models. However, surveys show that the
+performance of recent AL methods is very sensitive to the choice of dataset and
+training setting, making them unsuitable for general application. To tackle
+this problem, the field of Learning Active Learning (LAL) suggests learning the
+active learning strategy itself, allowing it to adapt to the given setting. In
+this work, we propose a novel LAL method for classification that exploits
+symmetry and independence properties of the active learning problem with an
+Attentive Conditional Neural Process model. Our approach is based on learning
+from a myopic oracle, which gives our model the ability to adapt to
+non-standard objectives, such as those that do not equally weight the error on
+all data points. We experimentally verify that our Neural Process model
+outperforms a variety of baselines in these settings. Finally, our experiments
+show that our model exhibits a tendency towards improved stability to changing
+datasets. However, performance is sensitive to the choice of classifier and
+more work is necessary to reduce the performance gap with the myopic oracle and
+to improve scalability. We present our work as a proof-of-concept for LAL on
+non-standard objectives and hope our analysis and modelling considerations
+inspire future LAL work.
+
+</p>
+
+ comment: Accepted at ECML 2023 +
+
+
+
+
+ + ☆ Machine learning the dimension of a Fano variety + + +
+ Fano varieties are basic building blocks in geometry - they are `atomic +pieces' of mathematical shapes. Recent progress in the classification of Fano +varieties involves analysing an invariant called the quantum period. This is a +sequence of integers which gives a numerical fingerprint for a Fano variety. It +is conjectured that a Fano variety is uniquely determined by its quantum +period. If this is true, one should be able to recover geometric properties of +a Fano variety directly from its quantum period. We apply machine learning to +the question: does the quantum period of X know the dimension of X? Note that +there is as yet no theoretical understanding of this. We show that a simple +feed-forward neural network can determine the dimension of X with 98% accuracy. +Building on this, we establish rigorous asymptotics for the quantum periods of +a class of Fano varieties. These asymptotics determine the dimension of X from +its quantum period. Our results demonstrate that machine learning can pick out +structure from complex mathematical data in situations where we lack +theoretical understanding. They also give positive evidence for the conjecture +that the quantum period of a Fano variety determines that variety. + +
+
+ comment: 28 pages, 5 tables, 23 figures. This version of the article has been + accepted for publication, after peer review but is not the Version of Record + and does not reflect post-acceptance improvements, or any corrections +
+
+
+
+
+ + ☆ Unveiling the Sentinels: Assessing AI Performance in Cybersecurity Peer + Review + + +
+ Peer review is the method employed by the scientific community for evaluating
+research advancements. In the field of cybersecurity, the practice of
+double-blind peer review is the de-facto standard. This paper touches on the
+holy grail of peer reviewing and aims to shed light on the performance of AI in
+reviewing for academic security conferences. Specifically, we investigate the
+predictability of reviewing outcomes by comparing the results obtained from
+human reviewers and machine-learning models. To facilitate our study, we
+construct a comprehensive dataset by collecting thousands of papers from
+renowned computer science conferences and the arXiv preprint website. Based on
+the collected data, we evaluate the prediction capabilities of ChatGPT and a
+two-stage classification approach based on the Doc2Vec model with various
+classifiers. In our experimental evaluation of review outcome prediction, the
+Doc2Vec-based approach performs significantly better than ChatGPT and achieves
+an accuracy of over 90%. While analyzing the experimental results, we identify
+the potential advantages and limitations of the tested ML models. We explore
+areas within the paper-reviewing process that can benefit from automated
+support approaches, while also recognizing the irreplaceable role of human
+intellect in certain aspects that cannot be matched by state-of-the-art AI
+techniques.
+
+</p>
+
+
+
+
+ + ☆ Diffusion-Based Co-Speech Gesture Generation Using Joint Text and Audio + Representation + + +
+ This paper describes a system developed for the GENEA (Generation and +Evaluation of Non-verbal Behaviour for Embodied Agents) Challenge 2023. Our +solution builds on an existing diffusion-based motion synthesis model. We +propose a contrastive speech and motion pretraining (CSMP) module, which learns +a joint embedding for speech and gesture with the aim to learn a semantic +coupling between these modalities. The output of the CSMP module is used as a +conditioning signal in the diffusion-based gesture synthesis model in order to +achieve semantically-aware co-speech gesture generation. Our entry achieved +highest human-likeness and highest speech appropriateness rating among the +submitted entries. This indicates that our system is a promising approach to +achieve human-like co-speech gestures in agents that carry semantic meaning. + +
+
+
+
+
+ + ☆ Pushing Mixture of Experts to the Limit: Extremely Parameter Efficient + MoE for Instruction Tuning + + +
+ The Mixture of Experts (MoE) is a widely known neural architecture where an
+ensemble of specialized sub-models optimizes overall performance at a constant
+computational cost. However, conventional MoEs pose challenges at scale due to
+the need to store all experts in memory. In this paper, we push MoE to the
+limit. We propose extremely parameter-efficient MoE by uniquely combining the
+MoE architecture with lightweight experts. Our MoE architecture outperforms
+standard parameter-efficient fine-tuning (PEFT) methods and is on par with full
+fine-tuning while only updating the lightweight experts -- less than 1% of an
+11B parameter model. Furthermore, our method generalizes to unseen tasks as it
+does not depend on any prior task knowledge. Our research underscores the
+versatility of the mixture of experts architecture, showcasing its ability to
+deliver robust performance even when subjected to rigorous parameter
+constraints. The code used in all the experiments is publicly available here:
+https://github.com/for-ai/parameter-efficient-moe.
+
+</p>
+
+
+
+
+ + ☆ Quantized Fourier and Polynomial Features for more Expressive Tensor + Network Models + + +
+ In the context of kernel machines, polynomial and Fourier features are
+commonly used to provide a nonlinear extension to linear models by mapping the
+data to a higher-dimensional space. Unless one considers the dual formulation
+of the learning problem, which renders exact large-scale learning unfeasible,
+the exponential growth of the number of model parameters with the
+dimensionality of the data, caused by the tensor-product structure of these
+features, makes it prohibitive to tackle high-dimensional problems. One
+possible approach to circumvent this exponential scaling is to exploit the
+tensor structure present in the features by constraining the model weights to
+be an underparametrized tensor network. In this paper we quantize, i.e. further
+tensorize, polynomial and Fourier features. Based on this feature quantization
+we propose to quantize the associated model weights, yielding quantized models.
+We show that, for the same number of model parameters, the resulting quantized
+models have a higher bound on the VC-dimension than their non-quantized
+counterparts, at no additional computational cost while learning from identical
+features. We verify experimentally how this additional tensorization
+regularizes the learning problem by prioritizing the most salient features in
+the data and how it provides models with increased generalization capabilities.
+We finally benchmark our approach on a large regression task, achieving
+state-of-the-art results on a laptop computer.
+
+</p>
+
+
+
+
+ + ☆ A parameterised model for link prediction using node centrality and + similarity measure based on graph embedding + + +
+ Link prediction is a key aspect of graph machine learning, with applications +as diverse as disease prediction, social network recommendations, and drug +discovery. It involves predicting new links that may form between network +nodes. Despite the clear importance of link prediction, existing models have +significant shortcomings. Graph Convolutional Networks, for instance, have been +proven to be highly efficient for link prediction on a variety of datasets. +However, they encounter severe limitations when applied to short-path networks +and ego networks, resulting in poor performance. This presents a critical +problem space that this work aims to address. In this paper, we present the +Node Centrality and Similarity Based Parameterised Model (NCSM), a novel method +for link prediction tasks. NCSM uniquely integrates node centrality and +similarity measures as edge features in a customised Graph Neural Network (GNN) +layer, effectively leveraging the topological information of large networks. +This model represents the first parameterised GNN-based link prediction model +that considers topological information. The proposed model was evaluated on +five benchmark graph datasets, each comprising thousands of nodes and edges. +Experimental results highlight NCSM's superiority over existing +state-of-the-art models like Graph Convolutional Networks and Variational Graph +Autoencoder, as it outperforms them across various metrics and datasets. This +exceptional performance can be attributed to NCSM's innovative integration of +node centrality, similarity measures, and its efficient use of topological +information. + +
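+ As an illustration of combining node centrality and a similarity measure into edge
+features, a small NetworkX sketch follows; the specific centrality and similarity
+choices here are assumptions and not necessarily those adopted by NCSM:
+import networkx as nx
+
+G = nx.karate_club_graph()
+centrality = nx.degree_centrality(G)                         # per-node centrality
+jaccard = {(u, v): s for u, v, s in nx.jaccard_coefficient(G, G.edges())}
+
+# Edge feature vector: centrality of both endpoints plus their Jaccard similarity,
+# which could then be attached to a GNN layer as edge attributes.
+edge_features = {
+    (u, v): [centrality[u], centrality[v], jaccard[(u, v)]]
+    for u, v in G.edges()
+}
+print(list(edge_features.items())[0])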
+
+
+
+
+ + ☆ Neuromorphic Auditory Perception by Neural Spiketrum + + +
+ Neuromorphic computing holds the promise to achieve the energy efficiency and +robust learning performance of biological neural systems. To realize the +promised brain-like intelligence, it needs to solve the challenges of the +neuromorphic hardware architecture design of biological neural substrate and +the hardware amicable algorithms with spike-based encoding and learning. Here +we introduce a neural spike coding model termed spiketrum, to characterize and +transform the time-varying analog signals, typically auditory signals, into +computationally efficient spatiotemporal spike patterns. It minimizes the +information loss occurring at the analog-to-spike transformation and possesses +informational robustness to neural fluctuations and spike losses. The model +provides a sparse and efficient coding scheme with precisely controllable spike +rate that facilitates training of spiking neural networks in various auditory +perception tasks. We further investigate the algorithm-hardware co-designs +through a neuromorphic cochlear prototype which demonstrates that our approach +can provide a systematic solution for spike-based artificial intelligence by +fully exploiting its advantages with spike-based computation. + +
+
+ comment: This work has been submitted to the IEEE for possible publication +
+
+
+
+
+ + ☆ Learning noise-induced transitions by multi-scaling reservoir computing + + +
+ Noise is usually regarded as adversarial to extract the effective dynamics +from time series, such that the conventional data-driven approaches usually aim +at learning the dynamics by mitigating the noisy effect. However, noise can +have a functional role of driving transitions between stable states underlying +many natural and engineered stochastic dynamics. To capture such stochastic +transitions from data, we find that leveraging a machine learning model, +reservoir computing as a type of recurrent neural network, can learn +noise-induced transitions. We develop a concise training protocol for tuning +hyperparameters, with a focus on a pivotal hyperparameter controlling the time +scale of the reservoir dynamics. The trained model generates accurate +statistics of transition time and the number of transitions. The approach is +applicable to a wide class of systems, including a bistable system under a +double-well potential, with either white noise or colored noise. It is also +aware of the asymmetry of the double-well potential, the rotational dynamics +caused by non-detailed balance, and transitions in multi-stable systems. For +the experimental data of protein folding, it learns the transition time between +folded states, providing a possibility of predicting transition statistics from +a small dataset. The results demonstrate the capability of machine-learning +methods in capturing noise-induced phenomena. + +
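+ A compact echo-state-network sketch on a noisy double-well time series is given below,
+with the leak rate standing in for the time-scale hyperparameter emphasised above; all
+hyperparameters and the reservoir construction are illustrative assumptions:
+import numpy as np
+
+rng = np.random.default_rng(0)
+
+def esn_fit_predict(u, washout=200, n_res=300, leak=0.2, ridge=1e-6):
+    """Minimal echo state network: `leak` controls the reservoir time scale."""
+    w_in = rng.uniform(-0.5, 0.5, (n_res, 1))
+    w = rng.uniform(-0.5, 0.5, (n_res, n_res))
+    w *= 0.9 / np.max(np.abs(np.linalg.eigvals(w)))   # keep spectral radius below 1
+    x = np.zeros(n_res)
+    states = []
+    for u_t in u[:-1]:
+        x = (1 - leak) * x + leak * np.tanh(w_in @ [u_t] + w @ x)
+        states.append(x.copy())
+    states = np.array(states)[washout:]
+    targets = u[1 + washout:]
+    # Ridge-regression readout predicting the next observation
+    w_out = np.linalg.solve(states.T @ states + ridge * np.eye(n_res),
+                            states.T @ targets)
+    return states @ w_out, targets
+
+# Noisy bistable (double-well) time series as training data
+dt, steps, u = 0.01, 5000, np.zeros(5000)
+for t in range(steps - 1):
+    u[t + 1] = u[t] + dt * (u[t] - u[t] ** 3) + np.sqrt(dt) * 0.5 * rng.normal()
+pred, target = esn_fit_predict(u)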
+
+
+
+
+ + ☆ Physics-informed reinforcement learning via probabilistic co-adjustment + functions + + +
+ Reinforcement learning of real-world tasks is very data inefficient, and
+extensive simulation-based modelling has become the dominant approach for
+training systems. However, in human-robot interaction and many other real-world
+settings, there is no appropriate one-model-for-all due to differences in
+individual instances of the system (e.g. different people) or necessary
+oversimplifications in the simulation models. This leaves two options: 1.
+learning the individual system's dynamics approximately from data, which
+requires data-intensive training, or 2. using a complete digital twin of the
+instances, which may not be realisable in many cases. We introduce two
+approaches, co-kriging adjustment (CKA) and ridge regression adjustment (RRA),
+as novel ways to combine the advantages of both. Our adjustment methods are
+based on an auto-regressive AR1 co-kriging model that we integrate with GP
+priors. This yields a data- and simulation-efficient way of using simplistic
+simulation models (e.g., a simple two-link model) and rapidly adapting them to
+individual instances (e.g., the biomechanics of individual people). Using CKA
+and RRA, we obtain more accurate uncertainty quantification of the entire
+system's dynamics than pure GP-based and AR1 methods. We demonstrate the
+efficiency of co-kriging adjustment with an interpretable reinforcement
+learning control example, learning to control a biomechanical human arm using
+only a two-link arm simulation model (offline part) and CKA derived from a
+small amount of interaction data (on-the-fly online). Our method unlocks an
+efficient and uncertainty-aware way to implement reinforcement learning methods
+in real-world complex systems for which only imperfect simulation models exist.
+
+</p>
+
+
+
+
+ + ☆ Practical Homomorphic Aggregation for Byzantine ML + + +
+ Due to the large-scale availability of data, machine learning (ML) algorithms +are being deployed in distributed topologies, where different nodes collaborate +to train ML models over their individual data by exchanging model-related +information (e.g., gradients) with a central server. However, distributed +learning schemes are notably vulnerable to two threats. First, Byzantine nodes +can single-handedly corrupt the learning by sending incorrect information to +the server, e.g., erroneous gradients. The standard approach to mitigate such +behavior is to use a non-linear robust aggregation method at the server. +Second, the server can violate the privacy of the nodes. Recent attacks have +shown that exchanging (unencrypted) gradients enables a curious server to +recover the totality of the nodes' data. The use of homomorphic encryption +(HE), a gold standard security primitive, has extensively been studied as a +privacy-preserving solution to distributed learning in non-Byzantine scenarios. +However, due to HE's large computational demand especially for high-dimensional +ML models, there has not yet been any attempt to design purely homomorphic +operators for non-linear robust aggregators. In this work, we present SABLE, +the first completely homomorphic and Byzantine robust distributed learning +algorithm. SABLE essentially relies on a novel plaintext encoding method that +enables us to implement the robust aggregator over batching-friendly BGV. +Moreover, this encoding scheme also accelerates state-of-the-art homomorphic +sorting with larger security margins and smaller ciphertext size. We perform +extensive experiments on image classification tasks and show that our algorithm +achieves practical execution times while matching the ML performance of its +non-private counterpart. + +
+
+
+
+
+ + ☆ Career Path Recommendations for Long-term Income Maximization: A + Reinforcement Learning Approach RecSys + + +
+ This study explores the potential of reinforcement learning algorithms to +enhance career planning processes. Leveraging data from Randstad The +Netherlands, the study simulates the Dutch job market and develops strategies +to optimize employees' long-term income. By formulating career planning as a +Markov Decision Process (MDP) and utilizing machine learning algorithms such as +Sarsa, Q-Learning, and A2C, we learn optimal policies that recommend career +paths with high-income occupations and industries. The results demonstrate +significant improvements in employees' income trajectories, with RL models, +particularly Q-Learning and Sarsa, achieving an average increase of 5% compared +to observed career paths. The study acknowledges limitations, including narrow +job filtering, simplifications in the environment formulation, and assumptions +regarding employment continuity and zero application costs. Future research can +explore additional objectives beyond income optimization and address these +limitations to further enhance career planning processes. + +
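+ A toy tabular Q-learning sketch of the income-maximisation idea follows; the states,
+transition probabilities, and salaries are invented for illustration and are unrelated
+to the Randstad data or the paper's MDP formulation:
+import numpy as np
+
+rng = np.random.default_rng(0)
+n_states, n_actions, gamma, alpha = 5, 2, 0.95, 0.1
+income = np.array([30, 38, 47, 60, 80])          # hypothetical salary per career level
+
+def step(s, a):
+    # Action 0: stay in the current role; action 1: attempt a switch upwards.
+    if a == 1 and rng.random() < 0.6:            # switching succeeds 60% of the time
+        s = min(s + 1, n_states - 1)
+    return s, income[s]
+
+Q = np.zeros((n_states, n_actions))
+for _ in range(20000):
+    s = rng.integers(n_states)
+    a = rng.integers(n_actions) if rng.random() < 0.1 else int(Q[s].argmax())
+    s2, r = step(s, a)
+    Q[s, a] += alpha * (r + gamma * Q[s2].max() - Q[s, a])
+
+print(Q.argmax(axis=1))   # recommended action per career state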
+
+ comment: accepted for publication at RecSys in HR '23 (at the 17th ACM + Conference on Recommender Systems) +
+
+
+
+
+ + ☆ Data-Driven Model Reduction and Nonlinear Model Predictive Control of an + Air Separation Unit by Applied Koopman Theory + + +
+ Achieving real-time capability is an essential prerequisite for the +industrial implementation of nonlinear model predictive control (NMPC). +Data-driven model reduction offers a way to obtain low-order control models +from complex digital twins. In particular, data-driven approaches require +little expert knowledge of the particular process and its model, and provide +reduced models of a well-defined generic structure. Herein, we apply our +recently proposed data-driven reduction strategy based on Koopman theory +[Schulze et al. (2022), Comput. Chem. Eng.] to generate a low-order control +model of an air separation unit (ASU). The reduced Koopman model combines +autoencoders and linear latent dynamics and is constructed using machine +learning. Further, we present an NMPC implementation that uses derivative +computation tailored to the fixed block structure of reduced Koopman models. +Our reduction approach with tailored NMPC implementation enables real-time NMPC +of an ASU at an average CPU time decrease by 98 %. + +
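+ A minimal sketch of the generic ingredients, an autoencoder with linear latent
+dynamics trained by gradient descent, is given below; the network sizes and latent
+dimension are assumptions, and the actual ASU model and tailored NMPC implementation
+are described in the paper:
+import torch
+import torch.nn as nn
+
+class KoopmanModel(nn.Module):
+    """Autoencoder with linear latent dynamics z_{k+1} = A z_k + B u_k,
+    a generic stand-in for a reduced Koopman control model."""
+    def __init__(self, n_x, n_u, n_z=8):
+        super().__init__()
+        self.encoder = nn.Sequential(nn.Linear(n_x, 64), nn.Tanh(), nn.Linear(64, n_z))
+        self.decoder = nn.Sequential(nn.Linear(n_z, 64), nn.Tanh(), nn.Linear(64, n_x))
+        self.A = nn.Linear(n_z, n_z, bias=False)
+        self.B = nn.Linear(n_u, n_z, bias=False)
+
+    def forward(self, x, u):
+        z = self.encoder(x)
+        z_next = self.A(z) + self.B(u)      # linear one-step prediction in latent space
+        return self.decoder(z_next), self.decoder(z)
+
+model = KoopmanModel(n_x=20, n_u=3)
+x, u, x_next = torch.randn(16, 20), torch.randn(16, 3), torch.randn(16, 20)
+pred_next, recon = model(x, u)
+loss = nn.functional.mse_loss(pred_next, x_next) + nn.functional.mse_loss(recon, x)
+loss.backward()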
+
+
+
+
+ + ☆ Feature-based Transferable Disruption Prediction for future tokamaks + using domain adaptation + + +
+ The high acquisition cost and the significant demand for disruptive discharges
+for data-driven disruption prediction models in future tokamaks pose an
+inherent contradiction in disruption prediction research. In this paper, we
+demonstrate a novel approach to predicting disruptions in a future tokamak
+using only a few discharges, based on a domain adaptation algorithm called
+CORAL. It is the first attempt at applying domain adaptation to the disruption
+prediction task. In this approach, a few data from the future tokamak (target
+domain) and a large amount of data from the existing tokamak (source domain)
+are aligned to train a machine learning model on the existing tokamak. To
+simulate the existing and future tokamak case, we selected J-TEXT as the
+existing tokamak and EAST as the future tokamak. To simulate the lack of
+disruptive data in the future tokamak, we only selected 100 non-disruptive
+discharges and 10 disruptive discharges from EAST as the target domain training
+data. We improved CORAL to make it more suitable for the disruption prediction
+task, calling the result supervised CORAL. Compared to a model trained by
+mixing data from the two tokamaks, the supervised CORAL model enhances
+disruption prediction performance for future tokamaks (AUC value from 0.764 to
+0.890). Through interpretable analysis, we found that supervised CORAL shifts
+the data distribution to be more similar to that of the future tokamak. An
+assessment method for evaluating whether a model has learned a trend of similar
+features is designed based on SHAP analysis. It demonstrates that the
+supervised CORAL model exhibits more similarities to the model trained on the
+large EAST dataset. FTDP provides a lightweight, interpretable, and data-frugal
+way to predict disruptions by aligning features, using only a small amount of
+data from the future tokamak.
+
+</p>
+
+ comment: 15 pages, 9 figures +
+
+
+
+
+ + ☆ EDAC: Efficient Deployment of Audio Classification Models For COVID-19 + Detection + + +
+ The global spread of COVID-19 had severe consequences for public health and
+the world economy. The quick onset of the pandemic highlighted the potential
+benefits of cheap and deployable pre-screening methods to monitor the
+prevalence of the disease in a population. Various researchers made use of
+machine learning methods in an attempt to detect COVID-19. The solutions
+leverage various input features, such as CT scans or cough audio signals, with
+state-of-the-art results arising from deep neural network architectures.
+However, larger models require more compute, a pertinent consideration when
+deploying to the edge. To address this, we first recreated two models that use
+cough audio recordings to detect COVID-19. Through applying network pruning and
+quantisation, we were able to compress these two architectures without reducing
+the models' predictive performance. Specifically, we achieved a 105.76x and a
+19.34x reduction in the compressed model file size, with corresponding 1.37x
+and 1.71x reductions in the inference times of the two models.
+
+</p>
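+ A generic sketch of the two compression steps, magnitude pruning followed by dynamic
+int8 quantisation in PyTorch, is shown below; the toy network and pruning amount are
+assumptions and do not reproduce the recreated cough-audio models:
+import torch
+import torch.nn as nn
+import torch.nn.utils.prune as prune
+
+# Small stand-in for an audio classifier head
+model = nn.Sequential(nn.Linear(128, 256), nn.ReLU(), nn.Linear(256, 2))
+
+for module in model:
+    if isinstance(module, nn.Linear):
+        prune.l1_unstructured(module, name="weight", amount=0.8)  # drop 80% of weights
+        prune.remove(module, "weight")                            # make pruning permanent
+
+quantized = torch.quantization.quantize_dynamic(model, {nn.Linear}, dtype=torch.qint8)
+print(quantized(torch.randn(1, 128)))   # smaller, faster model with the same interface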
+
+
+
+
+ + ☆ Neural Discovery of Permutation Subgroups + + +
+ We consider the problem of discovering subgroup $H$ of permutation group +$S_{n}$. Unlike the traditional $H$-invariant networks wherein $H$ is assumed +to be known, we present a method to discover the underlying subgroup, given +that it satisfies certain conditions. Our results show that one could discover +any subgroup of type $S_{k} (k \leq n)$ by learning an $S_{n}$-invariant +function and a linear transformation. We also prove similar results for cyclic +and dihedral subgroups. Finally, we provide a general theorem that can be +extended to discover other subgroups of $S_{n}$. We also demonstrate the +applicability of our results through numerical experiments on image-digit sum +and symmetric polynomial regression tasks. + +
+
+
+
+
+ + ☆ Learning Geometric Representations of Objects via Interaction + + +
+ We address the problem of learning representations from observations of a +scene involving an agent and an external object the agent interacts with. To +this end, we propose a representation learning framework extracting the +location in physical space of both the agent and the object from unstructured +observations of arbitrary nature. Our framework relies on the actions performed +by the agent as the only source of supervision, while assuming that the object +is displaced by the agent via unknown dynamics. We provide a theoretical +foundation and formally prove that an ideal learner is guaranteed to infer an +isometric representation, disentangling the agent from the object and correctly +extracting their locations. We evaluate empirically our framework on a variety +of scenarios, showing that it outperforms vision-based approaches such as a +state-of-the-art keypoint extractor. We moreover demonstrate how the extracted +representations enable the agent to solve downstream tasks via reinforcement +learning in an efficient manner. + +
+
+
+
+
+ + ☆ A DRL-based Reflection Enhancement Method for RIS-assisted + Multi-receiver Communications + + +
+ In reconfigurable intelligent surface (RIS)-assisted wireless communication +systems, the pointing accuracy and intensity of reflections depend crucially on +the 'profile,' representing the amplitude/phase state information of all +elements in a RIS array. The superposition of multiple single-reflection +profiles enables multi-reflection for distributed users. However, the +optimization challenges from periodic element arrangements in single-reflection +and multi-reflection profiles are understudied. The combination of periodical +single-reflection profiles leads to amplitude/phase counteractions, affecting +the performance of each reflection beam. This paper focuses on a +dual-reflection optimization scenario and investigates the far-field +performance deterioration caused by the misalignment of overlapped profiles. To +address this issue, we introduce a novel deep reinforcement learning +(DRL)-based optimization method. Comparative experiments against random and +exhaustive searches demonstrate that our proposed DRL method outperforms both +alternatives, achieving the shortest optimization time. Remarkably, our +approach achieves a 1.2 dB gain in the reflection peak gain and a broader beam +without any hardware modifications. + +
+
+ comment: 6 pages, 6 figures. This paper has been accepted for presentation at + the VTC2023-Fall +
+
+
+
+
+ + ☆ PAg-NeRF: Towards fast and efficient end-to-end panoptic 3D + representations for agricultural robotics + + +
+ Precise scene understanding is key for most robot monitoring and intervention +tasks in agriculture. In this work we present PAg-NeRF which is a novel +NeRF-based system that enables 3D panoptic scene understanding. Our +representation is trained using an image sequence with noisy robot odometry +poses and automatic panoptic predictions with inconsistent IDs between frames. +Despite this noisy input, our system is able to output scene geometry, +photo-realistic renders and 3D consistent panoptic representations with +consistent instance IDs. We evaluate this novel system in a very challenging +horticultural scenario and in doing so demonstrate an end-to-end trainable +system that can make use of noisy robot poses rather than precise poses that +have to be pre-calculated. Compared to a baseline approach the peak signal to +noise ratio is improved from 21.34dB to 23.37dB while the panoptic quality +improves from 56.65% to 70.08%. Furthermore, our approach is faster and can be +tuned to improve inference time by more than a factor of 2 while being memory +efficient with approximately 12 times fewer parameters. + +
+
+
+
+
+ + ☆ Stochastic Gradient Descent-like relaxation is equivalent to Glauber + dynamics in discrete optimization and inference problems + + +
+ Is Stochastic Gradient Descent (SGD) substantially different from Glauber +dynamics? This is a fundamental question at the time of understanding the most +used training algorithm in the field of Machine Learning, but it received no +answer until now. Here we show that in discrete optimization and inference +problems, the dynamics of an SGD-like algorithm resemble very closely that of +Metropolis Monte Carlo with a properly chosen temperature, which depends on the +mini-batch size. This quantitative matching holds both at equilibrium and in +the out-of-equilibrium regime, despite the two algorithms having fundamental +differences (e.g.\ SGD does not satisfy detailed balance). Such equivalence +allows us to use results about performances and limits of Monte Carlo +algorithms to optimize the mini-batch size in the SGD-like algorithm and make +it efficient at recovering the signal in hard inference problems. + +
+
+ comment: 14 pages, 5 figures +
+
+
+
+
+ + ☆ Neural Koopman prior for data assimilation + + +
+ With the increasing availability of large-scale datasets, computational power
+and tools like automatic differentiation and expressive neural network
+architectures, sequential data are now often treated in a data-driven way, with
+a dynamical model trained from the observation data. While neural networks are
+often seen as uninterpretable black-box architectures, they can still benefit
+from physical priors on the data and from mathematical knowledge. In this
+paper, we use a neural network architecture which leverages the long-known
+Koopman operator theory to embed dynamical systems in latent spaces where their
+dynamics can be described linearly, enabling a number of appealing features. We
+introduce methods that make it possible to train such a model for long-term
+continuous reconstruction, even in difficult contexts where the data arrive as
+irregularly sampled time series. The potential for self-supervised learning is
+also demonstrated, as we show the promising use of trained dynamical models as
+priors for variational data assimilation techniques, with applications to e.g.
+time series interpolation and forecasting.
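A minimal sketch of the general Koopman-autoencoder idea (not the paper's architecture or training scheme): an encoder maps states into a latent space, a learned linear operator advances the latent state one step, and the loss combines reconstruction, latent linearity, and one-step prediction. All layer sizes and the placeholder data are assumptions.

```python
import torch
import torch.nn as nn

class KoopmanAE(nn.Module):
    """Encoder phi, decoder psi, and a linear operator K so that phi(x_{t+1}) ~ K phi(x_t)."""
    def __init__(self, x_dim=3, z_dim=8):
        super().__init__()
        self.enc = nn.Sequential(nn.Linear(x_dim, 64), nn.Tanh(), nn.Linear(64, z_dim))
        self.dec = nn.Sequential(nn.Linear(z_dim, 64), nn.Tanh(), nn.Linear(64, x_dim))
        self.K = nn.Linear(z_dim, z_dim, bias=False)   # linear latent dynamics

    def loss(self, x_t, x_next):
        z_t, z_next = self.enc(x_t), self.enc(x_next)
        recon = ((self.dec(z_t) - x_t) ** 2).mean()              # autoencoding
        linear = ((self.K(z_t) - z_next) ** 2).mean()            # latent linearity
        pred = ((self.dec(self.K(z_t)) - x_next) ** 2).mean()    # one-step prediction
        return recon + linear + pred

model = KoopmanAE()
opt = torch.optim.Adam(model.parameters(), lr=1e-3)
x = torch.randn(256, 3)
x_next = x + 0.01 * torch.randn_like(x)          # placeholder consecutive states
for _ in range(5):
    opt.zero_grad()
    l = model.loss(x, x_next)
    l.backward()
    opt.step()
```

Because the latent dynamics are a single linear map, long-horizon rollouts reduce to repeated matrix multiplication, which is what makes such models attractive as priors for data assimilation.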
+
+
+
+
+ + ☆ Fully-Connected Spatial-Temporal Graph for Multivariate Time Series Data + + +
+ Multivariate Time-Series (MTS) data is crucial in various application fields. +With its sequential and multi-source (multiple sensors) properties, MTS data +inherently exhibits Spatial-Temporal (ST) dependencies, involving temporal +correlations between timestamps and spatial correlations between sensors in +each timestamp. To effectively leverage this information, Graph Neural +Network-based methods (GNNs) have been widely adopted. However, existing +approaches separately capture spatial dependency and temporal dependency and +fail to capture the correlations between Different sEnsors at Different +Timestamps (DEDT). Overlooking such correlations hinders the comprehensive +modelling of ST dependencies within MTS data, thus restricting existing GNNs +from learning effective representations. To address this limitation, we propose +a novel method called Fully-Connected Spatial-Temporal Graph Neural Network +(FC-STGNN), including two key components namely FC graph construction and FC +graph convolution. For graph construction, we design a decay graph to connect +sensors across all timestamps based on their temporal distances, enabling us to +fully model the ST dependencies by considering the correlations between DEDT. +Further, we devise FC graph convolution with a moving-pooling GNN layer to +effectively capture the ST dependencies for learning effective representations. +Extensive experiments show the effectiveness of FC-STGNN on multiple MTS +datasets compared to SOTA methods. + +
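The following is a small sketch of the "decay graph" construction idea described above: every (sensor, timestamp) pair is connected to every other, with edge weights down-weighted by temporal distance. The exponential decay and all sizes are illustrative assumptions, not the paper's exact formulation.

```python
import numpy as np

def decay_adjacency(num_sensors, num_timestamps, alpha=0.5):
    """Fully connected spatial-temporal adjacency over (sensor, timestamp) nodes,
    with weights that decay with the temporal distance between timestamps."""
    N = num_sensors * num_timestamps
    A = np.ones((N, N))
    for t1 in range(num_timestamps):
        for t2 in range(num_timestamps):
            w = np.exp(-alpha * abs(t1 - t2))   # assumed exponential decay in |t1 - t2|
            A[t1 * num_sensors:(t1 + 1) * num_sensors,
              t2 * num_sensors:(t2 + 1) * num_sensors] *= w
    return A

A = decay_adjacency(num_sensors=4, num_timestamps=3)
print(A.shape)  # (12, 12): every sensor at every timestamp connects to every other
```

A graph convolution over this adjacency can then mix information across different sensors at different timestamps in one step, which is the DEDT correlation the abstract highlights.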
+
+ comment: 9 pages, 8 figures +
+
+
+
+
+ + ☆ Discrete Denoising Diffusion Approach to Integer Factorization ICANN 2023 + + +
+ Integer factorization is a famous computational problem for which it is
+unknown whether it can be solved in polynomial time. With the rise of deep
+neural networks, it is natural to ask whether they can facilitate faster
+factorization. We present an approach to factorization utilizing deep neural
+networks and discrete denoising diffusion that works by iteratively correcting
+errors in a partially correct solution. To this end, we develop a new seq2seq
+neural network architecture, employ a relaxed categorical distribution, and
+adapt the reverse diffusion process to cope better with inaccuracies in the
+denoising step. The approach is able to find factors for integers of up to 56
+bits. Our analysis indicates that investment in training leads to an
+exponential decrease in the number of sampling steps required at inference to
+achieve a given success rate, thus counteracting the exponential run-time
+increase with bit length.
+
+ comment: International Conference on Artificial Neural Networks ICANN 2023 +
+
+
+
+
+ + ☆ The fine print on tempered posteriors + + +
+ We conduct a detailed investigation of tempered posteriors and uncover a +number of crucial and previously undiscussed points. Contrary to previous +results, we first show that for realistic models and datasets and the tightly +controlled case of the Laplace approximation to the posterior, stochasticity +does not in general improve test accuracy. The coldest temperature is often +optimal. One might think that Bayesian models with some stochasticity can at +least obtain improvements in terms of calibration. However, we show empirically +that when gains are obtained this comes at the cost of degradation in test +accuracy. We then discuss how targeting Frequentist metrics using Bayesian +models provides a simple explanation of the need for a temperature parameter +$\lambda$ in the optimization objective. Contrary to prior works, we finally +show through a PAC-Bayesian analysis that the temperature $\lambda$ cannot be +seen as simply fixing a misspecified prior or likelihood. + +
+
+
+
+
+ + ☆ Can you text what is happening? Integrating pre-trained language + encoders into trajectory prediction models for autonomous driving + + +
+ In autonomous driving tasks, scene understanding is the first step towards +predicting the future behavior of the surrounding traffic participants. Yet, +how to represent a given scene and extract its features are still open research +questions. In this study, we propose a novel text-based representation of +traffic scenes and process it with a pre-trained language encoder. + First, we show that text-based representations, combined with classical +rasterized image representations, lead to descriptive scene embeddings. Second, +we benchmark our predictions on the nuScenes dataset and show significant +improvements compared to baselines. Third, we show in an ablation study that a +joint encoder of text and rasterized images outperforms the individual encoders +confirming that both representations have their complementary strengths. + +
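As a rough illustration of the text-based scene representation idea, the sketch below serializes a toy scene dictionary into a sentence and embeds it with an off-the-shelf pre-trained encoder. The textual template, the choice of DistilBERT, and mean pooling are assumptions for illustration; they are not the paper's exact representation or encoder.

```python
import torch
from transformers import AutoTokenizer, AutoModel

def scene_to_text(scene):
    # Invented template; the paper's actual textual scene description may differ.
    parts = [f"ego vehicle speed {scene['ego_speed']:.1f} m/s"]
    for a in scene["agents"]:
        parts.append(f"{a['type']} at {a['x']:.0f} m ahead, {a['y']:.0f} m left, "
                     f"speed {a['v']:.1f} m/s")
    return "; ".join(parts)

tok = AutoTokenizer.from_pretrained("distilbert-base-uncased")
enc = AutoModel.from_pretrained("distilbert-base-uncased")

scene = {"ego_speed": 8.4,
         "agents": [{"type": "car", "x": 12, "y": -3, "v": 7.0},
                    {"type": "pedestrian", "x": 25, "y": 2, "v": 1.2}]}
inputs = tok(scene_to_text(scene), return_tensors="pt")
with torch.no_grad():
    emb = enc(**inputs).last_hidden_state.mean(dim=1)   # mean-pooled scene embedding
print(emb.shape)  # torch.Size([1, 768]); fused downstream with a rasterized-image encoder
```

The joint text-plus-raster encoder the abstract describes would concatenate or cross-attend this embedding with features from the rasterized scene image before the trajectory decoder.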
+
+
+
+
+ + ☆ Class-Incremental Grouping Network for Continual Audio-Visual Learning ICCV 2023 + + +
+ Continual learning is a challenging problem in which models need to be +trained on non-stationary data across sequential tasks for class-incremental +learning. While previous methods have focused on using either regularization or +rehearsal-based frameworks to alleviate catastrophic forgetting in image +classification, they are limited to a single modality and cannot learn compact +class-aware cross-modal representations for continual audio-visual learning. To +address this gap, we propose a novel class-incremental grouping network (CIGN) +that can learn category-wise semantic features to achieve continual +audio-visual learning. Our CIGN leverages learnable audio-visual class tokens +and audio-visual grouping to continually aggregate class-aware features. +Additionally, it utilizes class tokens distillation and continual grouping to +prevent forgetting parameters learned from previous tasks, thereby improving +the model's ability to capture discriminative audio-visual categories. We +conduct extensive experiments on VGGSound-Instruments, VGGSound-100, and +VGG-Sound Sources benchmarks. Our experimental results demonstrate that the +CIGN achieves state-of-the-art audio-visual class-incremental learning +performance. Code is available at https://github.com/stoneMo/CIGN. + +
+
+ comment: ICCV 2023. arXiv admin note: text overlap with arXiv:2303.17056 +
+
+
+
+
+ + ☆ Beamforming in Wireless Coded-Caching Systems + + +
+ Increased capacity in the access network poses capacity challenges on the
+transport network due to the aggregated traffic. However, there are spatial and
+temporal correlations in user data demands that could potentially be exploited.
+To that end, we investigate a wireless transport network architecture that
+integrates beamforming and coded-caching strategies. Specifically, our proposed
+design entails a server with multiple antennas that broadcasts content to cache
+nodes responsible for serving users. Traditional caching methods are limited by
+their reliance on individual cache memories, which incurs additional overhead.
+Hence, we develop an efficient genetic algorithm-based scheme for beam
+optimization in the coded-caching system. By exploiting the advantages of
+beamforming and coded-caching, the architecture achieves gains in terms of
+multicast opportunities, interference mitigation, and reduced peak backhaul
+traffic. A comparative analysis of this joint design with traditional, uncoded
+caching schemes is also conducted to assess the benefits of the proposed
+approach. Additionally, we examine the impact of various buffering and decoding
+methods on the performance of the coded-caching scheme. Our findings suggest
+that proper beamforming is useful in enhancing the effectiveness of the
+coded-caching technique, resulting in a significant reduction in peak backhaul
+traffic.
+
+ comment: Submitted to IEEE Future Networks World Forum, 2023 +
+
+
+
+
+ + ☆ CONFLATOR: Incorporating Switching Point based Rotatory Positional + Encodings for Code-Mixed Language Modeling + + +
+ The mixing of two or more languages is called Code-Mixing (CM). CM is a +social norm in multilingual societies. Neural Language Models (NLMs) like +transformers have been very effective on many NLP tasks. However, NLM for CM is +an under-explored area. Though transformers are capable and powerful, they +cannot always encode positional/sequential information since they are +non-recurrent. Therefore, to enrich word information and incorporate positional +information, positional encoding is defined. We hypothesize that Switching +Points (SPs), i.e., junctions in the text where the language switches (L1 -> L2 +or L2-> L1), pose a challenge for CM Language Models (LMs), and hence give +special emphasis to switching points in the modeling process. We experiment +with several positional encoding mechanisms and show that rotatory positional +encodings along with switching point information yield the best results. + We introduce CONFLATOR: a neural language modeling approach for code-mixed +languages. CONFLATOR tries to learn to emphasize switching points using smarter +positional encoding, both at unigram and bigram levels. CONFLATOR outperforms +the state-of-the-art on two tasks based on code-mixed Hindi and English +(Hinglish): (i) sentiment analysis and (ii) machine translation. + +
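To make the positional-encoding discussion concrete, here is a small sketch of rotary position embeddings together with one plausible way to emphasize switching points (restarting the position counter at each language switch). The emphasis scheme, dimensions, and toy language tags are assumptions for illustration, not CONFLATOR's exact mechanism.

```python
import torch

def rotary_pe(x, positions, base=10000.0):
    """Apply rotary position embeddings to x of shape (seq, dim); dim must be even."""
    seq, dim = x.shape
    half = dim // 2
    freqs = base ** (-torch.arange(half, dtype=torch.float) / half)
    angles = positions[:, None].float() * freqs[None, :]          # (seq, half)
    cos, sin = angles.cos(), angles.sin()
    x1, x2 = x[:, :half], x[:, half:]
    return torch.cat([x1 * cos - x2 * sin, x1 * sin + x2 * cos], dim=-1)

tokens = torch.randn(6, 8)
langs = ["hi", "hi", "en", "en", "hi", "en"]      # toy code-mixed language tags
switch = [0] + [int(langs[i] != langs[i - 1]) for i in range(1, len(langs))]

# Assumed emphasis: restart the rotary position counter at every switching point,
# so the encoding carries explicit information about where the language changes.
positions, p = [], 0
for s in switch:
    p = 0 if s else p + 1
    positions.append(p)

out = rotary_pe(tokens, torch.tensor(positions))
print(out.shape)  # torch.Size([6, 8])
```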
+
+
+
+
+ + ☆ UniKG: A Benchmark and Universal Embedding for Large-Scale Knowledge + Graphs + + +
+ Irregular real-world data are usually organized as heterogeneous graphs (HGs)
+consisting of multiple types of nodes and edges. To explore useful knowledge
+from real-world data, both large-scale encyclopedic HG datasets and
+corresponding effective learning methods are crucial, but they haven't been
+well investigated. In this paper, we construct a large-scale HG benchmark
+dataset named UniKG from Wikidata to facilitate knowledge mining and
+heterogeneous graph representation learning. Overall, UniKG contains more than
+77 million multi-attribute entities and 2000 diverse association types, which
+significantly surpasses the scale of existing HG datasets. To perform effective
+learning on the large-scale UniKG, two key measures are taken: (i) a semantic
+alignment strategy for multi-attribute entities, which projects the feature
+description of multi-attribute nodes into a common embedding space to
+facilitate node aggregation in a large receptive field; and (ii) a novel
+plug-and-play anisotropy propagation module (APM) to learn effective multi-hop
+anisotropy propagation kernels, which extends methods for large-scale
+homogeneous graphs to heterogeneous graphs. These two strategies enable
+efficient information propagation among a tremendous number of multi-attribute
+entities and meanwhile adaptively mine multi-attribute associations through
+multi-hop aggregation in large-scale HGs. We set up a node classification task
+on our UniKG dataset, and evaluate multiple baseline methods which are
+constructed by embedding our APM into large-scale homogeneous graph learning
+methods. Our UniKG dataset and the baseline codes have been released at
+https://github.com/Yide-Qiu/UniKG.
+
+ comment: 9 pages, 4 figures +
+
+
+
+
+ + ☆ Generalized Graphon Process: Convergence of Graph Frequencies in + Stretched Cut Distance + + +
+ Graphons have traditionally served as limit objects for dense graph
+sequences, with the cut distance serving as the metric for convergence.
+However, sparse graph sequences converge to the trivial graphon under the
+conventional definition of cut distance, which makes this framework inadequate
+for many practical applications. In this paper, we utilize the concepts of
+generalized graphons and stretched cut distance to describe the convergence of
+sparse graph sequences. Specifically, we consider a random graph process
+generated from a generalized graphon. This random graph process converges to
+the generalized graphon in stretched cut distance. We use this random graph
+process to model growing sparse graphs, and prove the convergence of the
+adjacency matrices' eigenvalues. We supplement our findings with experimental
+validation. Our results indicate the possibility of transfer learning between
+sparse graphs.
+
+
+
+
+ + ☆ A physics-informed and attention-based graph learning approach for + regional electric vehicle charging demand prediction + + +
+ Along with the proliferation of electric vehicles (EVs), optimizing the use
+of EV charging space can significantly alleviate the growing load on
+intelligent transportation systems. As the foundation to achieve such an
+optimization, a spatiotemporal method for EV charging demand prediction in
+urban areas is required. Although several solutions based on data-driven deep
+learning methods have been proposed, these performance-oriented methods may
+fail to correctly capture the inverse relationship between charging demand and
+prices. To tackle the emerging challenges of training an accurate and
+interpretable prediction model, this paper proposes a novel approach that
+enables the integration of graph and temporal attention mechanisms for feature
+extraction and the use of physics-informed meta-learning in the model
+pre-training step for knowledge transfer. Evaluation results on a dataset of
+18,013 EV charging piles in Shenzhen, China, show that the proposed approach,
+named PAG, can achieve state-of-the-art forecasting performance and can capture
+the adaptive changes in charging demand caused by price fluctuations.
+
+ comment: Preprint. This work has been submitted to the IEEE Transactions on + ITS for possible publication. Copyright may be transferred without notice, + after which this version may no longer be accessible +
+
+
+
+
+ + ☆ Examining the Effect of Pre-training on Time Series Classification + + +
+ Although the pre-training followed by fine-tuning paradigm is used +extensively in many fields, there is still some controversy surrounding the +impact of pre-training on the fine-tuning process. Currently, experimental +findings based on text and image data lack consensus. To delve deeper into the +unsupervised pre-training followed by fine-tuning paradigm, we have extended +previous research to a new modality: time series. In this study, we conducted a +thorough examination of 150 classification datasets derived from the Univariate +Time Series (UTS) and Multivariate Time Series (MTS) benchmarks. Our analysis +reveals several key conclusions. (i) Pre-training can only help improve the +optimization process for models that fit the data poorly, rather than those +that fit the data well. (ii) Pre-training does not exhibit the effect of +regularization when given sufficient training time. (iii) Pre-training can only +speed up convergence if the model has sufficient ability to fit the data. (iv) +Adding more pre-training data does not improve generalization, but it can +strengthen the advantage of pre-training on the original data volume, such as +faster convergence. (v) While both the pre-training task and the model +structure determine the effectiveness of the paradigm on a given dataset, the +model structure plays a more significant role. + +
+
+
+
+
+ + ☆ A quantum tug of war between randomness and symmetries on homogeneous + spaces + + +
+ We explore the interplay between symmetry and randomness in quantum +information. Adopting a geometric approach, we consider states as +$H$-equivalent if related by a symmetry transformation characterized by the +group $H$. We then introduce the Haar measure on the homogeneous space +$\mathbb{U}/H$, characterizing true randomness for $H$-equivalent systems. +While this mathematical machinery is well-studied by mathematicians, it has +seen limited application in quantum information: we believe our work to be the +first instance of utilizing homogeneous spaces to characterize symmetry in +quantum information. This is followed by a discussion of approximations of true +randomness, commencing with $t$-wise independent approximations and defining +$t$-designs on $\mathbb{U}/H$ and $H$-equivalent states. Transitioning further, +we explore pseudorandomness, defining pseudorandom unitaries and states within +homogeneous spaces. Finally, as a practical demonstration of our findings, we +study the expressibility of quantum machine learning ansatze in homogeneous +spaces. Our work provides a fresh perspective on the relationship between +randomness and symmetry in the quantum world. + +
+
+ comment: 9 + 1 pages, 3 figures +
+
+
+
+
+ + ☆ SparseSwin: Swin Transformer with Sparse Transformer Block + + +
+ Advances in computer vision research have established the transformer
+architecture as the state of the art in computer vision tasks. One known
+drawback of the transformer architecture is its high number of parameters,
+which can lead to a more complex and inefficient algorithm. This paper aims to
+reduce the number of parameters and, in turn, make the transformer more
+efficient. We present the Sparse Transformer (SparTa) Block, a modified
+transformer block with the addition of a sparse token converter that reduces
+the number of tokens used. We use the SparTa Block inside the Swin-T
+architecture (SparseSwin) to leverage Swin's capability to downsample its input
+and reduce the number of initial tokens to be computed. The proposed SparseSwin
+model outperforms other state-of-the-art models in image classification with
+accuracies of 86.96%, 97.43%, and 85.35% on the ImageNet100, CIFAR10, and
+CIFAR100 datasets, respectively. Despite having fewer parameters, the result
+highlights the potential of a transformer architecture using a sparse token
+converter with a limited number of tokens to optimize the use of the
+transformer and improve its performance.
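One simple way to realize a "sparse token converter" is learned cross-attention pooling from N input tokens down to a small fixed set of tokens. The sketch below illustrates that idea; the class name, the use of multi-head attention, and the token counts are assumptions, not the authors' SparTa block.

```python
import torch
import torch.nn as nn

class SparseTokenConverter(nn.Module):
    """Reduce N input tokens to a small fixed number of tokens via learned
    cross-attention pooling (a sketch of the token-reduction idea only)."""
    def __init__(self, dim, n_sparse=49):
        super().__init__()
        self.queries = nn.Parameter(torch.randn(n_sparse, dim) * 0.02)
        self.attn = nn.MultiheadAttention(dim, num_heads=4, batch_first=True)

    def forward(self, x):                     # x: (batch, N, dim)
        q = self.queries.unsqueeze(0).expand(x.size(0), -1, -1)
        out, _ = self.attn(q, x, x)           # (batch, n_sparse, dim)
        return out

x = torch.randn(2, 3136, 96)                  # e.g. 56x56 tokens from an early Swin stage
print(SparseTokenConverter(96)(x).shape)      # torch.Size([2, 49, 96])
```

Since subsequent transformer blocks operate only on the reduced token set, their quadratic attention cost drops accordingly.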
+
+
+
+
+ + ☆ Towards Federated Learning Under Resource Constraints via Layer-wise + Training and Depth Dropout + + +
+ Large machine learning models trained on diverse data have recently seen +unprecedented success. Federated learning enables training on private data that +may otherwise be inaccessible, such as domain-specific datasets decentralized +across many clients. However, federated learning can be difficult to scale to +large models when clients have limited resources. This challenge often results +in a trade-off between model size and access to diverse data. To mitigate this +issue and facilitate training of large models on edge devices, we introduce a +simple yet effective strategy, Federated Layer-wise Learning, to simultaneously +reduce per-client memory, computation, and communication costs. Clients train +just a single layer each round, reducing resource costs considerably with +minimal performance degradation. We also introduce Federated Depth Dropout, a +complementary technique that randomly drops frozen layers during training, to +further reduce resource usage. Coupling these two techniques enables us to +effectively train significantly larger models on edge devices. Specifically, we +reduce training memory usage by 5x or more in federated self-supervised +representation learning and demonstrate that performance in downstream tasks is +comparable to conventional federated self-supervised learning. + +
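A minimal sketch of the two ideas described above, under assumed names and sizes: each federated round a client trains only one designated layer (layer-wise learning), and frozen layers are randomly skipped during the forward pass (depth dropout). This is an illustration of the strategy, not the authors' implementation.

```python
import random
import torch
import torch.nn as nn

layers = nn.ModuleList([nn.Linear(32, 32) for _ in range(6)])  # stand-in for a deep backbone
head = nn.Linear(32, 10)

def client_update(active_idx, data, targets, drop_prob=0.3):
    """Train only one layer this round; randomly skip frozen layers (depth dropout)."""
    for i, layer in enumerate(layers):
        layer.requires_grad_(i == active_idx)
    opt = torch.optim.SGD(list(layers[active_idx].parameters()) + list(head.parameters()), lr=0.1)
    x = data
    for i, layer in enumerate(layers):
        if i != active_idx and random.random() < drop_prob:
            continue                          # drop a frozen layer for this forward pass
        x = torch.relu(layer(x))
    loss = nn.functional.cross_entropy(head(x), targets)
    opt.zero_grad()
    loss.backward()
    opt.step()
    return layers[active_idx].state_dict()    # only the active layer needs to be communicated

update = client_update(active_idx=2, data=torch.randn(16, 32),
                       targets=torch.randint(0, 10, (16,)))
```

Because only one layer's weights and gradients are live per round, client memory and upload size shrink roughly in proportion to model depth.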
+
+
+
+
+ + ☆ Graph Contextual Contrasting for Multivariate Time Series Classification + + +
+ Contrastive learning, as a self-supervised learning paradigm, becomes popular +for Multivariate Time-Series (MTS) classification. It ensures the consistency +across different views of unlabeled samples and then learns effective +representations for these samples. Existing contrastive learning methods mainly +focus on achieving temporal consistency with temporal augmentation and +contrasting techniques, aiming to preserve temporal patterns against +perturbations for MTS data. However, they overlook spatial consistency that +requires the stability of individual sensors and their correlations. As MTS +data typically originate from multiple sensors, ensuring spatial consistency +becomes essential for the overall performance of contrastive learning on MTS +data. Thus, we propose Graph Contextual Contrasting (GCC) for spatial +consistency across MTS data. Specifically, we propose graph augmentations +including node and edge augmentations to preserve the stability of sensors and +their correlations, followed by graph contrasting with both node- and +graph-level contrasting to extract robust sensor- and global-level features. We +further introduce multi-window temporal contrasting to ensure temporal +consistency in the data for each sensor. Extensive experiments demonstrate that +our proposed GCC achieves state-of-the-art performance on various MTS +classification tasks. + +
+
+ comment: 9 pages, 5 figures +
+
+
+
+
+ + ☆ CARE: Confidence-rich Autonomous Robot Exploration using Bayesian Kernel + Inference and Optimization + + +
+ In this paper, we consider improving the efficiency of information-based +autonomous robot exploration in unknown and complex environments. We first +utilize Gaussian process (GP) regression to learn a surrogate model to infer +the confidence-rich mutual information (CRMI) of querying control actions, then +adopt an objective function consisting of predicted CRMI values and prediction +uncertainties to conduct Bayesian optimization (BO), i.e., GP-based BO (GPBO). +The trade-off between the best action with the highest CRMI value +(exploitation) and the action with high prediction variance (exploration) can +be realized. To further improve the efficiency of GPBO, we propose a novel +lightweight information gain inference method based on Bayesian kernel +inference and optimization (BKIO), achieving an approximate logarithmic +complexity without the need for training. BKIO can also infer the CRMI and +generate the best action using BO with bounded cumulative regret, which ensures +its comparable accuracy to GPBO with much higher efficiency. Extensive +numerical and real-world experiments show the desired efficiency of our +proposed methods without losing exploration performance in different +unstructured, cluttered environments. We also provide our open-source +implementation code at https://github.com/Shepherd-Gregory/BKIO-Exploration. + +
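As a small illustration of the GP-based Bayesian optimization loop described above (the GPBO half, not the paper's BKIO method), the sketch below fits a Gaussian process to a handful of evaluated actions and picks the next action with an upper-confidence-bound rule. The `crmi` function here is a cheap placeholder for the paper's confidence-rich mutual information; the kernel and candidate set are assumptions.

```python
import numpy as np
from sklearn.gaussian_process import GaussianProcessRegressor
from sklearn.gaussian_process.kernels import RBF

rng = np.random.default_rng(1)

def crmi(action):
    """Placeholder for the (expensive) confidence-rich mutual information of an action."""
    return float(np.sin(3 * action[0]) + 0.5 * np.cos(2 * action[1]))

candidates = rng.uniform(-1, 1, size=(200, 2))                       # candidate control actions
X = candidates[:5]
y = np.array([crmi(a) for a in X])                                   # initial evaluations

gp = GaussianProcessRegressor(kernel=RBF(length_scale=0.5), alpha=1e-3)
for _ in range(15):
    gp.fit(X, y)
    mu, std = gp.predict(candidates, return_std=True)
    ucb = mu + 1.0 * std               # exploitation (mean) + exploration (uncertainty)
    best = candidates[int(np.argmax(ucb))]
    X = np.vstack([X, best])
    y = np.append(y, crmi(best))

print("best action found:", X[int(np.argmax(y))], "value:", y.max())
```

The BKIO variant in the paper replaces the GP fit with Bayesian kernel inference to avoid the training cost while keeping a similar select-evaluate-update loop.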
+
+ comment: Full version for the paper accepted by IEEE Robotics and Automation + Letters (RA-L) 2023. arXiv admin note: text overlap with arXiv:2301.00523 +
+
+
+
+
+ + ☆ Does Writing with Language Models Reduce Content Diversity? + + +
+ Large language models (LLMs) have led to a surge in collaborative writing +with model assistance. As different users incorporate suggestions from the same +model, there is a risk of decreased diversity in the produced content, +potentially limiting diverse perspectives in public discourse. In this work, we +measure the impact of co-writing on diversity via a controlled experiment, +where users write argumentative essays in three setups -- using a base LLM +(GPT3), a feedback-tuned LLM (InstructGPT), and writing without model help. We +develop a set of diversity metrics and find that writing with InstructGPT (but +not the GPT3) results in a statistically significant reduction in diversity. +Specifically, it increases the similarity between the writings of different +authors and reduces the overall lexical and content diversity. We additionally +find that this effect is mainly attributable to InstructGPT contributing less +diverse text to co-written essays. In contrast, the user-contributed text +remains unaffected by model collaboration. This suggests that the recent +improvement in generation quality from adapting models to human feedback might +come at the cost of more homogeneous and less diverse content. + +
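For readers unfamiliar with how content diversity can be quantified, the sketch below computes two common proxies over a set of essays: distinct n-gram ratio (lexical diversity) and mean pairwise TF-IDF cosine similarity (higher similarity means lower content diversity). These are standard metrics chosen for illustration, not necessarily the paper's exact metric suite.

```python
from itertools import combinations
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

def distinct_n(texts, n=2):
    """Fraction of unique n-grams across all essays (a lexical-diversity proxy)."""
    grams = []
    for t in texts:
        toks = t.lower().split()
        grams += [tuple(toks[i:i + n]) for i in range(len(toks) - n + 1)]
    return len(set(grams)) / max(len(grams), 1)

def mean_pairwise_similarity(texts):
    """Average cosine similarity between TF-IDF vectors of different essays."""
    X = TfidfVectorizer().fit_transform(texts)
    sims = cosine_similarity(X)
    pairs = list(combinations(range(len(texts)), 2))
    return sum(sims[i, j] for i, j in pairs) / len(pairs)

essays = ["social media improves access to information and civic debate",
          "social media harms attention spans and personal wellbeing",
          "social media improves access to news, friends and information"]
print(distinct_n(essays, 2), mean_pairwise_similarity(essays))
```

Comparing such scores between essays written with and without model assistance is the kind of controlled measurement the study performs.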
+
+ comment: Preprint +
+
+
+
+
+ + ☆ Data Summarization beyond Monotonicity: Non-monotone Two-Stage + Submodular Maximization + + +
+ The objective of a two-stage submodular maximization problem is to reduce the +ground set using provided training functions that are submodular, with the aim +of ensuring that optimizing new objective functions over the reduced ground set +yields results comparable to those obtained over the original ground set. This +problem has applications in various domains including data summarization. +Existing studies often assume the monotonicity of the objective function, +whereas our work pioneers the extension of this research to accommodate +non-monotone submodular functions. We have introduced the first constant-factor +approximation algorithms for this more general case. + +
+
+
+
+
+ + ☆ DePT: Decomposed Prompt Tuning for Parameter-Efficient Fine-tuning + + +
+ Prompt tuning (PT), where a small number of trainable soft (continuous)
+prompt vectors is affixed to the input of language models (LMs), has shown
+promising results across various tasks and models for parameter-efficient
+fine-tuning (PEFT). PT stands out from other PEFT approaches because it
+maintains competitive performance with fewer trainable parameters and does not
+drastically scale up its parameters as the model size expands. However, PT
+introduces additional soft prompt tokens, leading to longer input sequences,
+which significantly impacts training and inference time and memory usage due to
+the Transformer's quadratic complexity. This is particularly concerning for
+Large Language Models (LLMs) that face heavy daily querying. To address this
+issue, we propose Decomposed Prompt Tuning (DePT), which decomposes the soft
+prompt into a shorter soft prompt and a pair of low-rank matrices that are then
+optimised with two different learning rates. This allows DePT to achieve better
+performance while saving over 20% memory and time costs compared to vanilla PT
+and its variants, without changing trainable parameter sizes. Through extensive
+experiments on 23 natural language processing (NLP) and vision-language (VL)
+tasks, we demonstrate that DePT outperforms state-of-the-art PEFT approaches,
+including the full fine-tuning baseline in some scenarios. Additionally, we
+empirically show that DePT grows more efficient as the model size increases.
+Our further study reveals that DePT integrates seamlessly with
+parameter-efficient transfer learning in the few-shot learning setting and
+highlights its adaptability to various model architectures and sizes.
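A rough sketch of the decomposition idea follows: instead of a long soft prompt, keep a shorter prompt plus a low-rank pair A, B whose product is added to the frozen word embeddings, with the two parts trained at different learning rates. The dimensions, the exact placement of the low-rank update, and the learning rates are illustrative assumptions, not the authors' code.

```python
import torch
import torch.nn as nn

d_model, vocab, short_len, rank = 512, 100, 40, 8

# Shorter soft prompt + low-rank update A @ B on the frozen embedding table.
short_prompt = nn.Parameter(torch.randn(short_len, d_model) * 0.02)
A = nn.Parameter(torch.randn(vocab, rank) * 0.02)
B = nn.Parameter(torch.randn(rank, d_model) * 0.02)
embed = nn.Embedding(vocab, d_model)            # frozen backbone embedding
embed.weight.requires_grad_(False)

def build_inputs(token_ids):
    word_emb = embed(token_ids) + (A @ B)[token_ids]     # low-rank embedding update
    prompt = short_prompt.unsqueeze(0).expand(token_ids.size(0), -1, -1)
    return torch.cat([prompt, word_emb], dim=1)

opt = torch.optim.AdamW([{"params": [short_prompt], "lr": 3e-1},
                         {"params": [A, B], "lr": 1e-3}])   # two different learning rates

x = build_inputs(torch.randint(0, vocab, (4, 16)))
print(x.shape)  # (4, 40 + 16, 512): shorter prompt means a shorter sequence than vanilla PT
```

The shorter sequence is where the memory and time savings come from, while the low-rank matrices keep the total trainable parameter count comparable to vanilla PT.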
+
+ comment: Code is available at https://github.com/ZhengxiangShi/DePT +
+
+
+
+
+ + ☆ Reaction coordinate flows for model reduction of molecular kinetics + + +
+ In this work, we introduce a flow based machine learning approach, called +reaction coordinate (RC) flow, for discovery of low-dimensional kinetic models +of molecular systems. The RC flow utilizes a normalizing flow to design the +coordinate transformation and a Brownian dynamics model to approximate the +kinetics of RC, where all model parameters can be estimated in a data-driven +manner. In contrast to existing model reduction methods for molecular kinetics, +RC flow offers a trainable and tractable model of reduced kinetics in +continuous time and space due to the invertibility of the normalizing flow. +Furthermore, the Brownian dynamics-based reduced kinetic model investigated in +this work yields a readily discernible representation of metastable states +within the phase space of the molecular system. Numerical experiments +demonstrate how effectively the proposed method discovers interpretable and +accurate low-dimensional representations of given full-state kinetics from +simulations. + +
+
+
+
+
+ + ☆ Force-directed graph embedding with hops distance + + +
+ Graph embedding has become an increasingly important technique for analyzing +graph-structured data. By representing nodes in a graph as vectors in a +low-dimensional space, graph embedding enables efficient graph processing and +analysis tasks like node classification, link prediction, and visualization. In +this paper, we propose a novel force-directed graph embedding method that +utilizes the steady acceleration kinetic formula to embed nodes in a way that +preserves graph topology and structural features. Our method simulates a set of +customized attractive and repulsive forces between all node pairs with respect +to their hop distance. These forces are then used in Newton's second law to +obtain the acceleration of each node. The method is intuitive, parallelizable, +and highly scalable. We evaluate our method on several graph analysis tasks and +show that it achieves competitive performance compared to state-of-the-art +unsupervised embedding techniques. + +
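The sketch below illustrates the general force-directed idea on a toy graph: hop distances set the desired separations, pairwise forces attract or repel accordingly, and positions are updated with a damped acceleration step. The force law, damping, and step size are assumptions, not the paper's exact formulation.

```python
import numpy as np
import networkx as nx

G = nx.karate_club_graph()
n, dim = G.number_of_nodes(), 2
hops = dict(nx.all_pairs_shortest_path_length(G))     # hop distances between all node pairs

rng = np.random.default_rng(0)
pos = rng.normal(size=(n, dim))
vel = np.zeros((n, dim))

for step in range(200):
    force = np.zeros((n, dim))
    for i in range(n):
        for j in range(n):
            if i == j:
                continue
            d = pos[j] - pos[i]
            dist = np.linalg.norm(d) + 1e-9
            target = hops[i].get(j, n)                 # desired separation ~ hop distance
            # Attract if farther apart than the hop distance, repel if closer (assumed law).
            force[i] += (dist - target) * d / dist
    vel = 0.9 * vel + 0.01 * force                     # unit-mass "F = ma" update with damping
    pos += vel

print(pos[:3])   # 2-D embedding coordinates of the first three nodes
```

Because each node's update depends only on pairwise terms, the inner loops parallelize naturally, which is the scalability argument made in the abstract.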
+
+
+
+
+ + ☆ The bionic neural network for external simulation of human locomotor + system + + +
+ Muscle forces and joint kinematics estimated with musculoskeletal (MSK) +modeling techniques offer useful metrics describing movement quality. +Model-based computational MSK models can interpret the dynamic interaction +between the neural drive to muscles, muscle dynamics, body and joint +kinematics, and kinetics. Still, such a set of solutions suffers from high +computational time and muscle recruitment problems, especially in complex +modeling. In recent years, data-driven methods have emerged as a promising +alternative due to the benefits of flexibility and adaptability. However, a +large amount of labeled training data is not easy to be acquired. This paper +proposes a physics-informed deep learning method based on MSK modeling to +predict joint motion and muscle forces. The MSK model is embedded into the +neural network as an ordinary differential equation (ODE) loss function with +physiological parameters of muscle activation dynamics and muscle contraction +dynamics to be identified. These parameters are automatically estimated during +the training process which guides the prediction of muscle forces combined with +the MSK forward dynamics model. Experimental validations on two groups of data, +including one benchmark dataset and one self-collected dataset from six healthy +subjects, are performed. The results demonstrate that the proposed deep +learning method can effectively identify subject-specific MSK physiological +parameters and the trained physics-informed forward-dynamics surrogate yields +accurate motion and muscle forces predictions. + +
+
+ comment: 10 +
+
+
+
+
+ + ☆ Uncovering mesa-optimization algorithms in Transformers + + +
+ Transformers have become the dominant model in deep learning, but the reason +for their superior performance is poorly understood. Here, we hypothesize that +the strong performance of Transformers stems from an architectural bias towards +mesa-optimization, a learned process running within the forward pass of a model +consisting of the following two steps: (i) the construction of an internal +learning objective, and (ii) its corresponding solution found through +optimization. To test this hypothesis, we reverse-engineer a series of +autoregressive Transformers trained on simple sequence modeling tasks, +uncovering underlying gradient-based mesa-optimization algorithms driving the +generation of predictions. Moreover, we show that the learned forward-pass +optimization algorithm can be immediately repurposed to solve supervised +few-shot tasks, suggesting that mesa-optimization might underlie the in-context +learning capabilities of large language models. Finally, we propose a novel +self-attention layer, the mesa-layer, that explicitly and efficiently solves +optimization problems specified in context. We find that this layer can lead to +improved performance in synthetic and preliminary language modeling +experiments, adding weight to our hypothesis that mesa-optimization is an +important operation hidden within the weights of trained Transformers. + +
+
+
+
+
+ + ☆ Energy Preservation and Stability of Random Filterbanks + + +
+ What makes waveform-based deep learning so hard? Despite numerous attempts at +training convolutional neural networks (convnets) for filterbank design, they +often fail to outperform hand-crafted baselines. This is all the more +surprising because these baselines are linear time-invariant systems: as such, +their transfer functions could be accurately represented by a convnet with a +large receptive field. In this article, we elaborate on the statistical +properties of simple convnets from the mathematical perspective of random +convolutional operators. We find that FIR filterbanks with random Gaussian +weights are ill-conditioned for large filters and locally periodic input +signals, which both are typical in audio signal processing applications. +Furthermore, we observe that expected energy preservation of a random +filterbank is not sufficient for numerical stability and derive theoretical +bounds for its expected frame bounds. + +
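The conditioning issue described above can be checked empirically by computing the Littlewood-Paley sum of a random FIR filterbank on an FFT grid; its minimum and maximum give empirical frame bounds. This is a minimal numerical sketch under assumed sizes, not the paper's derivation.

```python
import numpy as np

rng = np.random.default_rng(0)
n_filters, filter_len, n_fft = 64, 256, 4096

# Random Gaussian FIR filterbank with i.i.d. weights.
w = rng.normal(size=(n_filters, filter_len)) / np.sqrt(filter_len)

# Littlewood-Paley sum: sum over filters of |W_k(omega)|^2 on an FFT frequency grid.
H = np.fft.rfft(w, n=n_fft, axis=1)
lp = np.sum(np.abs(H) ** 2, axis=0)

A, B = lp.min(), lp.max()           # empirical lower/upper frame bounds
print(f"frame bounds: A={A:.3f}, B={B:.3f}, condition number B/A={B / A:.1f}")
# Energy is preserved on average, but a large B/A signals potential numerical instability,
# which is the gap between expected energy preservation and stability noted in the abstract.
```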
+
+ comment: 4 pages, 5 figures, 1 page appendix +
+
+
+
+
+ + ☆ ChemSpaceAL: An Efficient Active Learning Methodology Applied to + Protein-Specific Molecular Generation + + +
+ The incredible capabilities of generative artificial intelligence models have +inevitably led to their application in the domain of drug discovery. It is +therefore of tremendous interest to develop methodologies that enhance the +abilities and applicability of these powerful tools. In this work, we present a +novel and efficient semi-supervised active learning methodology that allows for +the fine-tuning of a generative model with respect to an objective function by +strategically operating within a constructed representation of the sample +space. In the context of targeted molecular generation, we demonstrate the +ability to fine-tune a GPT-based molecular generator with respect to an +attractive interaction-based scoring function by strategically operating within +a chemical space proxy, thereby maximizing attractive interactions between the +generated molecules and a protein target. Importantly, our approach does not +require the individual evaluation of all data points that are used for +fine-tuning, enabling the incorporation of computationally expensive metrics. +We are hopeful that the inherent generality of this methodology ensures that it +will remain applicable as this exciting field evolves. To facilitate +implementation and reproducibility, we have made all of our software available +through the open-source ChemSpaceAL Python package. + +
+
+
+
+
+ + ☆ Effective Abnormal Activity Detection on Multivariate Time Series + Healthcare Data + + +
+ Multivariate time series (MTS) data collected from multiple sensors provide +the potential for accurate abnormal activity detection in smart healthcare +scenarios. However, anomalies exhibit diverse patterns and become unnoticeable +in MTS data. Consequently, achieving accurate anomaly detection is challenging +since we have to capture both temporal dependencies of time series and +inter-relationships among variables. To address this problem, we propose a +Residual-based Anomaly Detection approach, Rs-AD, for effective representation +learning and abnormal activity detection. We evaluate our scheme on a +real-world gait dataset and the experimental results demonstrate an F1 score of +0.839. + +
+
+ comment: Poster accepted by the 29th Annual International Conference On Mobile + Computing And Networking (ACM MobiCom 2023) +
+
+
+
+
+ + ☆ Optimizing Audio Augmentations for Contrastive Learning of + Health-Related Acoustic Signals + + +
+ Health-related acoustic signals, such as cough and breathing sounds, are +relevant for medical diagnosis and continuous health monitoring. Most existing +machine learning approaches for health acoustics are trained and evaluated on +specific tasks, limiting their generalizability across various healthcare +applications. In this paper, we leverage a self-supervised learning framework, +SimCLR with a Slowfast NFNet backbone, for contrastive learning of health +acoustics. A crucial aspect of optimizing Slowfast NFNet for this application +lies in identifying effective audio augmentations. We conduct an in-depth +analysis of various audio augmentation strategies and demonstrate that an +appropriate augmentation strategy enhances the performance of the Slowfast +NFNet audio encoder across a diverse set of health acoustic tasks. Our findings +reveal that when augmentations are combined, they can produce synergistic +effects that exceed the benefits seen when each is applied individually. + +
+
+ comment: 7 pages, 2 pages appendix, 2 figures, 5 appendix tables +
+
+
+
+
+ + ☆ The Safety Filter: A Unified View of Safety-Critical Control in + Autonomous Systems + + +
+ Recent years have seen significant progress in the realm of robot autonomy, +accompanied by the expanding reach of robotic technologies. However, the +emergence of new deployment domains brings unprecedented challenges in ensuring +safe operation of these systems, which remains as crucial as ever. While +traditional model-based safe control methods struggle with generalizability and +scalability, emerging data-driven approaches tend to lack well-understood +guarantees, which can result in unpredictable catastrophic failures. Successful +deployment of the next generation of autonomous robots will require integrating +the strengths of both paradigms. This article provides a review of safety +filter approaches, highlighting important connections between existing +techniques and proposing a unified technical framework to understand, compare, +and combine them. The new unified view exposes a shared modular structure +across a range of seemingly disparate safety filter classes and naturally +suggests directions for future progress towards more scalable synthesis, robust +monitoring, and efficient intervention. + +
+
+ comment: Accepted for publication in Annual Review of Control, Robotics, and + Autonomous Systems +
+
+
+
+
+ + ☆ PACE: Prompting and Augmentation for Calibrated Confidence Estimation + with GPT-4 in Cloud Incident Root Cause Analysis + + +
+ In recent years, the transition to cloud-based platforms in the IT sector has +emphasized the significance of cloud incident root cause analysis to ensure +service reliability and maintain customer trust. Central to this process is the +efficient determination of root causes, a task made challenging due to the +complex nature of contemporary cloud infrastructures. Despite the proliferation +of AI-driven tools for root cause identification, their applicability remains +limited by the inconsistent quality of their outputs. This paper introduces a +method for enhancing confidence estimation in root cause analysis tools by +prompting retrieval-augmented large language models (LLMs). This approach +operates in two phases. Initially, the model evaluates its confidence based on +historical incident data, considering its assessment of the evidence strength. +Subsequently, the model reviews the root cause generated by the predictor. An +optimization step then combines these evaluations to determine the final +confidence assignment. Experimental results illustrate that our method enables +the model to articulate its confidence effectively, providing a more calibrated +score. We address research questions evaluating the ability of our method to +produce calibrated confidence scores using LLMs, the impact of domain-specific +retrieved examples on confidence estimates, and its potential generalizability +across various root cause analysis models. Through this, we aim to bridge the +confidence estimation gap, aiding on-call engineers in decision-making and +bolstering the efficiency of cloud incident management. + +
+
+
+
+
+ + ☆ Instance-Agnostic Geometry and Contact Dynamics Learning + + +
+ This work presents an instance-agnostic learning framework that fuses vision +with dynamics to simultaneously learn shape, pose trajectories and physical +properties via the use of geometry as a shared representation. Unlike many +contact learning approaches that assume motion capture input and a known shape +prior for the collision model, our proposed framework learns an object's +geometric and dynamic properties from RGBD video, without requiring either +category-level or instance-level shape priors. We integrate a vision system, +BundleSDF, with a dynamics system, ContactNets and propose a cyclic training +pipeline to use the output from the dynamics module to refine the poses and the +geometry from the vision module, using perspective reprojection. Experiments +demonstrate our framework's ability to learn the geometry and dynamics of rigid +and convex objects and improve upon the current tracking framework. + +
+
+
+
+
+ + ☆ Studying Accuracy of Machine Learning Models Trained on Lab Lifting Data + in Solving Real-World Problems Using Wearable Sensors for Workplace Safety + + +
+ Porting ML models trained on lab data to real-world situations has long been
+a challenge. This paper discusses porting a lab-trained lifting identification
+model to the real world. Because real-world performance was much lower than on
+the training data, we explored the causes of the failure and propose four
+potential solutions to increase model performance.
+
+ comment: 7 pages, 7 figures +
+
+
+
+
+ + ☆ Exploring Geometric Deep Learning For Precipitation Nowcasting + + +
+ Precipitation nowcasting (up to a few hours) remains a challenge due to the
+highly complex local interactions that need to be captured accurately.
+Convolutional Neural Networks rely on convolutional kernels applied to grid
+data, and the extracted features are constrained by a limited receptive field,
+which typically manifests as excessively smooth output compared to the ground
+truth. Thus, they lack the capacity to model complex spatial relationships
+among the grids. Geometric deep learning aims to generalize neural network
+models to non-Euclidean domains. Such models are more flexible in defining
+nodes and edges and can effectively capture dynamic spatial relationships among
+geographical grids. Motivated by this, we explore a geometric deep
+learning-based temporal Graph Convolutional Network (GCN) for precipitation
+nowcasting. The adjacency matrix that simulates the interactions among grid
+cells is learned automatically by minimizing the L1 loss between the predicted
+and ground-truth pixel values during the training procedure. Then, the spatial
+relationship is refined by GCN layers while the temporal information is
+extracted by 1D convolution with various kernel lengths. The neighboring
+information is fed as auxiliary input layers to improve the final result. We
+test the model on sequences of radar reflectivity maps over the Trento/Italy
+area. The results show that the GCN improves the modeling of local details of
+the cloud profile as well as the prediction accuracy, achieving decreased error
+measures.
+
+ comment: submitted and accepted in IGARSS2023 +
+
+
+
+
+ + ☆ KD-FixMatch: Knowledge Distillation Siamese Neural Networks ICIP 2023 + + +
+ Semi-supervised learning (SSL) has become a crucial approach in deep learning +as a way to address the challenge of limited labeled data. The success of deep +neural networks heavily relies on the availability of large-scale high-quality +labeled data. However, the process of data labeling is time-consuming and +unscalable, leading to shortages in labeled data. SSL aims to tackle this +problem by leveraging additional unlabeled data in the training process. One of +the popular SSL algorithms, FixMatch, trains identical weight-sharing teacher +and student networks simultaneously using a siamese neural network (SNN). +However, it is prone to performance degradation when the pseudo labels are +heavily noisy in the early training stage. We present KD-FixMatch, a novel SSL +algorithm that addresses the limitations of FixMatch by incorporating knowledge +distillation. The algorithm utilizes a combination of sequential and +simultaneous training of SNNs to enhance performance and reduce performance +degradation. Firstly, an outer SNN is trained using labeled and unlabeled data. +After that, the network of the well-trained outer SNN generates pseudo labels +for the unlabeled data, from which a subset of unlabeled data with trusted +pseudo labels is then carefully created through high-confidence sampling and +deep embedding clustering. Finally, an inner SNN is trained with the labeled +data, the unlabeled data, and the subset of unlabeled data with trusted pseudo +labels. Experiments on four public data sets demonstrate that KD-FixMatch +outperforms FixMatch in all cases. Our results indicate that KD-FixMatch has a +better training starting point that leads to improved model performance +compared to FixMatch. + +
+
+ comment: 5 pages, 1 figure, 5 tables. To be published in ICIP 2023 +
+
+
+
+
+ + ☆ Ensemble-based modeling abstractions for modern self-optimizing systems + + +
+ In this paper, we extend our ensemble-based component model DEECo with the
+capability to use machine-learning and optimization heuristics in the
+establishment and reconfiguration of autonomic component ensembles. We show how
+to capture these concepts at the model level and give an example of how such a
+model can be beneficially used for modeling an access-control-related problem
+in an Industry 4.0 setting. We argue that incorporating machine-learning and
+optimization heuristics is a key feature for modern smart systems, which are
+expected to learn over time and optimize their behavior at runtime to deal with
+uncertainty in their environment.
+
+ comment: This is the authors' version of the paper - M. T\"opfer, M. Abdullah, + T. Bure\v{s}, P. Hn\v{e}tynka, M. Kruli\v{s}: Ensemble-Based Modeling + Abstractions for Modern Self-optimizing Systems, in Proceedings of ISOLA + 2022, Rhodes, Greece, pp. 318-334, 2022. The final authenticated publication + is available online at https://doi.org/10.1007/978-3-031-19759-8_20 +
+
+
+
+
+ + ☆ Interpretable learning of effective dynamics for multiscale systems + + +
+ The modeling and simulation of high-dimensional multiscale systems is a +critical challenge across all areas of science and engineering. It is broadly +believed that even with today's computer advances resolving all spatiotemporal +scales described by the governing equations remains a remote target. This +realization has prompted intense efforts to develop model order reduction +techniques. In recent years, techniques based on deep recurrent neural networks +have produced promising results for the modeling and simulation of complex +spatiotemporal systems and offer large flexibility in model development as they +can incorporate experimental and computational data. However, neural networks +lack interpretability, which limits their utility and generalizability across +complex systems. Here we propose a novel framework of Interpretable Learning +Effective Dynamics (iLED) that offers comparable accuracy to state-of-the-art +recurrent neural network-based approaches while providing the added benefit of +interpretability. The iLED framework is motivated by Mori-Zwanzig and Koopman +operator theory, which justifies the choice of the specific architecture. We +demonstrate the effectiveness of the proposed framework in simulations of three +benchmark multiscale systems. Our results show that the iLED framework can +generate accurate predictions and obtain interpretable dynamics, making it a +promising approach for solving high-dimensional multiscale systems. + +
+
+
+
+
+ + ☆ Predicting the Radiation Field of Molecular Clouds using Denoising + Diffusion Probabilistic Models + + +
+ Accurately quantifying the impact of radiation feedback in star formation is
+challenging. To address this complex problem, we employ deep learning
+techniques, denoising diffusion probabilistic models (DDPMs), to predict the
+interstellar radiation field (ISRF) strength based on three-band dust emission
+at 4.5 μm, 24 μm, and 250 μm. We adopt magnetohydrodynamic simulations from
+the STARFORGE (STAR FORmation in Gaseous Environments) project that model star
+formation and giant molecular cloud (GMC) evolution. We generate synthetic dust
+emission maps matching observed spectral energy distributions in the Monoceros
+R2 (MonR2) GMC. We train DDPMs to estimate the ISRF using synthetic three-band
+dust emission. The dispersion between the predictions and true values is within
+a factor of 0.1 for the test set. We extended our assessment of the diffusion
+model to include new simulations with varying physical parameters. While there
+is a consistent offset observed in these out-of-distribution simulations, the
+model effectively constrains the relative intensity to within a factor of 2.
+Meanwhile, our analysis reveals weak correlation between the ISRF solely
+derived from dust temperature and the actual ISRF. We apply our trained model
+to predict the ISRF in MonR2, revealing a correspondence between intense ISRF,
+bright sources, and high dust emission, confirming the model's ability to
+capture ISRF variations. Our model robustly predicts radiation feedback
+distribution, even in complex, poorly constrained ISRF environments like those
+influenced by nearby star clusters. However, precise ISRF predictions require
+an accurate training dataset mirroring the target molecular cloud's unique
+physical conditions.
+
+ comment: Revised submission to ApJ following referee's comments +
+
+
+
+
+ + ☆ SHIFT3D: Synthesizing Hard Inputs For Tricking 3D Detectors ICCV 2023 + + +
+ We present SHIFT3D, a differentiable pipeline for generating 3D shapes that
+are structurally plausible yet challenging to 3D object detectors. In
+safety-critical applications like autonomous driving, discovering such novel
+challenging objects can offer insight into unknown vulnerabilities of 3D
+detectors. By representing objects with a signed distance function (SDF), we
+show that gradient error signals allow us to smoothly deform the shape or pose
+of a 3D object in order to confuse a downstream 3D detector. Importantly, the
+objects generated by SHIFT3D physically differ from the baseline object yet
+retain a semantically recognizable shape. Our approach provides interpretable
+failure modes for modern 3D object detectors, and can aid in preemptive
+discovery of potential safety risks within 3D perception systems before these
+risks become critical failures.
+
+ comment: Accepted by ICCV 2023 +
+
+
+
+
+ + ☆ Divergences in Color Perception between Deep Neural Networks and Humans + + +
+ Deep neural networks (DNNs) are increasingly proposed as models of human +vision, bolstered by their impressive performance on image classification and +object recognition tasks. Yet, the extent to which DNNs capture fundamental +aspects of human vision such as color perception remains unclear. Here, we +develop novel experiments for evaluating the perceptual coherence of color +embeddings in DNNs, and we assess how well these algorithms predict human color +similarity judgments collected via an online survey. We find that +state-of-the-art DNN architectures $-$ including convolutional neural networks +and vision transformers $-$ provide color similarity judgments that strikingly +diverge from human color judgments of (i) images with controlled color +properties, (ii) images generated from online searches, and (iii) real-world +images from the canonical CIFAR-10 dataset. We compare DNN performance against +an interpretable and cognitively plausible model of color perception based on +wavelet decomposition, inspired by foundational theories in computational +neuroscience. While one deep learning model $-$ a convolutional DNN trained on +a style transfer task $-$ captures some aspects of human color perception, our +wavelet algorithm provides more coherent color embeddings that better predict +human color judgments compared to all DNNs we examine. These results hold when +altering the high-level visual task used to train similar DNN architectures +(e.g., image classification versus image segmentation), as well as when +examining the color embeddings of different layers in a given DNN architecture. +These findings break new ground in the effort to analyze the perceptual +representations of machine learning algorithms and to improve their ability to +serve as cognitively plausible models of human vision. Implications for machine +learning, human perception, and embodied cognition are discussed. + +
+
+ comment: 22 pages, 8 figures + SI Appendix; to appear in Cognition +
+
+
+
+
+ + ☆ Online ML Self-adaptation in Face of Traps + + +
+ Online machine learning (ML) is often used in self-adaptive systems to +strengthen the adaptation mechanism and improve the system utility. Despite +such benefits, applying online ML for self-adaptation can be challenging, and +not many papers report its limitations. Recently, we experimented with applying +online ML for self-adaptation of a smart farming scenario and we had faced +several unexpected difficulties -- traps -- that, to our knowledge, are not +discussed enough in the community. In this paper, we report our experience with +these traps. Specifically, we discuss several traps that relate to the +specification and online training of the ML-based estimators, their impact on +self-adaptation, and the approach used to evaluate the estimators. Our overview +of these traps provides a list of lessons learned, which can serve as guidance +for other researchers and practitioners when applying online ML for +self-adaptation. + +
+
+ comment: This is the authors' version of the paper M. T\"opfer, F. + Pl\'a\v{s}il, T. Bure\v{s}, P. Hn\v{e}tynka, M. Kruli\v{s}, D. Weyns: Online + ML Self-adaptation in Face of Traps, accepted for publication in Proceedings + of ACSOS 2023, Toronto, Canada +
+
+
+
+
+ + ☆ Revisiting Energy Based Models as Policies: Ranking Noise Contrastive + Estimation and Interpolating Energy Models + + +
+ A crucial design decision for any robot learning pipeline is the choice of +policy representation: what type of model should be used to generate the next +set of robot actions? Owing to the inherent multi-modal nature of many robotic +tasks, combined with the recent successes in generative modeling, researchers +have turned to state-of-the-art probabilistic models such as diffusion models +for policy representation. In this work, we revisit the choice of energy-based +models (EBM) as a policy class. We show that the prevailing folklore -- that +energy models in high dimensional continuous spaces are impractical to train -- +is false. We develop a practical training objective and algorithm for energy +models which combines several key ingredients: (i) ranking noise contrastive +estimation (R-NCE), (ii) learnable negative samplers, and (iii) non-adversarial +joint training. We prove that our proposed objective function is asymptotically +consistent and quantify its limiting variance. On the other hand, we show that +the Implicit Behavior Cloning (IBC) objective is actually biased even at the +population level, providing a mathematical explanation for the poor performance +of IBC trained energy policies in several independent follow-up works. We +further extend our algorithm to learn a continuous stochastic process that +bridges noise and data, modeling this process with a family of EBMs indexed by +scale variable. In doing so, we demonstrate that the core idea behind recent +progress in generative modeling is actually compatible with EBMs. Altogether, +our proposed training algorithms enable us to train energy-based models as +policies which compete with -- and even outperform -- diffusion models and +other state-of-the-art approaches in several challenging multi-modal +benchmarks: obstacle avoidance path planning and contact-rich block pushing. + +
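To give a flavor of the ranking-NCE ingredient mentioned above, the sketch below treats one data sample and K negatives as a (K+1)-way classification problem over negative energies. This simplified version ignores the learnable sampler's importance weights and the scale-indexed EBM family; the toy energy network and shapes are assumptions only.

```python
import torch
import torch.nn as nn

energy = nn.Sequential(nn.Linear(2, 64), nn.ReLU(), nn.Linear(64, 1))  # toy energy E(x)

def rnce_loss(x_pos, x_neg):
    """Ranking-NCE-style objective (simplified): the positive should receive lower
    energy than each of the K negatives drawn for it."""
    e_pos = energy(x_pos)                                               # (B, 1)
    e_neg = energy(x_neg.reshape(-1, x_neg.size(-1))).reshape(x_neg.shape[:2])  # (B, K)
    logits = torch.cat([-e_pos, -e_neg], dim=1)        # treat as (K+1)-way classification
    targets = torch.zeros(x_pos.size(0), dtype=torch.long)              # index 0 = positive
    return nn.functional.cross_entropy(logits, targets)

x_pos = torch.randn(32, 2)                # data samples (e.g. action given state)
x_neg = torch.randn(32, 8, 2)             # 8 negatives per positive from some sampler
loss = rnce_loss(x_pos, x_neg)
loss.backward()
```

In the paper this loss is combined with a learnable negative sampler and joint training, which is what makes high-dimensional continuous EBM policies trainable in practice.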
+
+
+
+
+ + ☆ Enhancing Hyperedge Prediction with Context-Aware Self-Supervised + Learning + + +
+ Hypergraphs can naturally model group-wise relations (e.g., a group of users +who co-purchase an item) as hyperedges. Hyperedge prediction is to predict +future or unobserved hyperedges, which is a fundamental task in many real-world +applications (e.g., group recommendation). Despite the recent breakthrough of +hyperedge prediction methods, the following challenges have been rarely +studied: (C1) How to aggregate the nodes in each hyperedge candidate for +accurate hyperedge prediction? and (C2) How to mitigate the inherent data +sparsity problem in hyperedge prediction? To tackle both challenges together, +in this paper, we propose a novel hyperedge prediction framework (CASH) that +employs (1) context-aware node aggregation to precisely capture complex +relations among nodes in each hyperedge for (C1) and (2) self-supervised +contrastive learning in the context of hyperedge prediction to enhance +hypergraph representations for (C2). Furthermore, as for (C2), we propose a +hyperedge-aware augmentation method to fully exploit the latent semantics +behind the original hypergraph and consider both node-level and group-level +contrasts (i.e., dual contrasts) for better node and hyperedge representations. +Extensive experiments on six real-world hypergraphs reveal that CASH +consistently outperforms all competing methods in terms of the accuracy in +hyperedge prediction and each of the proposed strategies is effective in +improving the model accuracy of CASH. For the detailed information of CASH, we +provide the code and datasets at: https://github.com/yy-ko/cash. + +
+
+ comment: 12 pages, 11 figures +
+
+
+
+
+ + ☆ On the Fine-Grained Hardness of Inverting Generative Models + + +
+ The objective of generative model inversion is to identify a size-$n$ latent +vector that produces a generative model output that closely matches a given +target. This operation is a core computational primitive in numerous modern +applications involving computer vision and NLP. However, the problem is known +to be computationally challenging and NP-hard in the worst case. This paper +aims to provide a fine-grained view of the landscape of computational hardness +for this problem. We establish several new hardness lower bounds for both exact +and approximate model inversion. In exact inversion, the goal is to determine +whether a target is contained within the range of a given generative model. +Under the strong exponential time hypothesis (SETH), we demonstrate that the +computational complexity of exact inversion is lower bounded by $\Omega(2^n)$ +via a reduction from $k$-SAT; this is a strengthening of known results. For the +more practically relevant problem of approximate inversion, the goal is to +determine whether a point in the model range is close to a given target with +respect to the $\ell_p$-norm. When $p$ is a positive odd integer, under SETH, +we provide an $\Omega(2^n)$ complexity lower bound via a reduction from the +closest vectors problem (CVP). Finally, when $p$ is even, under the exponential +time hypothesis (ETH), we provide a lower bound of $2^{\Omega (n)}$ via a +reduction from Half-Clique and Vertex-Cover. + +
+
+ comment: 19 pages +
+
+
+
+
+ + ☆ Adaptive User-centered Neuro-symbolic Learning for Multimodal + Interaction with Autonomous Systems ICML2023 + + +
+ Recent advances in machine learning, particularly deep learning, have enabled +autonomous systems to perceive and comprehend objects and their environments in +a perceptual subsymbolic manner. These systems can now perform object +detection, sensor data fusion, and language understanding tasks. However, there +is a growing need to enhance these systems to understand objects and their +environments more conceptually and symbolically. It is essential to consider +both the explicit teaching provided by humans (e.g., describing a situation or +explaining how to act) and the implicit teaching obtained by observing human +behavior (e.g., through the system's sensors) to achieve this level of powerful +artificial intelligence. Thus, the system must be designed with multimodal +input and output capabilities to support implicit and explicit interaction +models. In this position paper, we argue for considering both types of inputs, +as well as human-in-the-loop and incremental learning techniques, for advancing +the field of artificial intelligence and enabling autonomous systems to learn +like humans. We propose several hypotheses and design guidelines and highlight +a use case from related work to achieve this goal. + +
+
+ comment: AI&HCI Workshop accepted paper at ICML2023 and accepted at ICMI2023 + Blue Sky Papers. arXiv admin note: text overlap with arXiv:2211.03539 +
+
+
+
+
+ + ♻ ☆ The EarlyBIRD Catches the Bug: On Exploiting Early Layers of Encoder + Models for More Efficient Code Classification + + +
+ The use of modern Natural Language Processing (NLP) techniques has been shown
+to be beneficial for software engineering tasks, such as vulnerability detection
+and type inference. However, training deep NLP models requires significant
+computational resources. This paper explores techniques that aim at achieving
+the best usage of resources and available information in these models.
+ We propose a generic approach, EarlyBIRD, to build composite representations
+of code from the early layers of a pre-trained transformer model. We
+empirically investigate the viability of this approach on the CodeBERT model by
+comparing the performance of 12 strategies for creating composite
+representations with the standard practice of only using the last encoder
+layer.
+ Our evaluation on four datasets shows that several early layer combinations
+yield better performance on defect detection, and some combinations improve
+multi-class classification. More specifically, we obtain an average
+improvement of +2 in detection accuracy on Devign with only 3 out of 12 layers
+of CodeBERT and a 3.3x speed-up of fine-tuning. These findings show that early
+layers can be used to obtain better results using the same resources, as well
+as to reduce resource usage during fine-tuning and inference.
+
+
+ comment: The content in this pre-print is the same as in the CRC accepted for + publication in the ACM Joint European Software Engineering Conference and + Symposium on the Foundations of Software Engineering (ESEC/FSE 2023) +
+
+
+
+
+ + ♻ ☆ An Overview of Catastrophic AI Risks + + +
+ Rapid advancements in artificial intelligence (AI) have sparked growing +concerns among experts, policymakers, and world leaders regarding the potential +for increasingly advanced AI systems to pose catastrophic risks. Although +numerous risks have been detailed separately, there is a pressing need for a +systematic discussion and illustration of the potential dangers to better +inform efforts to mitigate them. This paper provides an overview of the main +sources of catastrophic AI risks, which we organize into four categories: +malicious use, in which individuals or groups intentionally use AIs to cause +harm; AI race, in which competitive environments compel actors to deploy unsafe +AIs or cede control to AIs; organizational risks, highlighting how human +factors and complex systems can increase the chances of catastrophic accidents; +and rogue AIs, describing the inherent difficulty in controlling agents far +more intelligent than humans. For each category of risk, we describe specific +hazards, present illustrative stories, envision ideal scenarios, and propose +practical suggestions for mitigating these dangers. Our goal is to foster a +comprehensive understanding of these risks and inspire collective and proactive +efforts to ensure that AIs are developed and deployed in a safe manner. +Ultimately, we hope this will allow us to realize the benefits of this powerful +technology while minimizing the potential for catastrophic outcomes. + +
+
+
+
+
+ + ♻ ☆ Open Problems and Fundamental Limitations of Reinforcement Learning from + Human Feedback + + +
+ Reinforcement learning from human feedback (RLHF) is a technique for training +AI systems to align with human goals. RLHF has emerged as the central method +used to finetune state-of-the-art large language models (LLMs). Despite this +popularity, there has been relatively little public work systematizing its +flaws. In this paper, we (1) survey open problems and fundamental limitations +of RLHF and related methods; (2) overview techniques to understand, improve, +and complement RLHF in practice; and (3) propose auditing and disclosure +standards to improve societal oversight of RLHF systems. Our work emphasizes +the limitations of RLHF and highlights the importance of a multi-faceted +approach to the development of safer AI systems. + +
+
+
+
+
+ + ♻ ☆ A soft nearest-neighbor framework for continual semi-supervised learning ICCV 2023 + + +
+ Despite significant advances, the performance of state-of-the-art continual +learning approaches hinges on the unrealistic scenario of fully labeled data. +In this paper, we tackle this challenge and propose an approach for continual +semi-supervised learning--a setting where not all the data samples are labeled. +A primary issue in this scenario is the model forgetting representations of +unlabeled data and overfitting the labeled samples. We leverage the power of +nearest-neighbor classifiers to nonlinearly partition the feature space and +flexibly model the underlying data distribution thanks to its non-parametric +nature. This enables the model to learn a strong representation for the current +task, and distill relevant information from previous tasks. We perform a +thorough experimental evaluation and show that our method outperforms all the +existing approaches by large margins, setting a solid state of the art on the +continual semi-supervised learning paradigm. For example, on CIFAR-100 we +surpass several others even when using at least 30 times less supervision (0.8% +vs. 25% of annotations). Finally, our method works well on both low and high +resolution images and scales seamlessly to more complex datasets such as +ImageNet-100. The code is publicly available on +https://github.com/kangzhiq/NNCSL + +
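+ For intuition, a soft nearest-neighbor classifier of the general kind used
+here can be sketched in a few lines: every stored (feature, label) pair votes
+with a weight that decays softly with its distance to the query. This is only
+a generic illustration with an assumed Euclidean distance and temperature, not
+the paper's exact formulation.
+
+import numpy as np
+
+def soft_nn_predict(query, support_feats, support_labels, n_classes, tau=0.1):
+    """Soft nearest-neighbor prediction: distance-weighted label voting."""
+    d = np.linalg.norm(support_feats - query, axis=1)   # (N,) distances
+    w = np.exp(-d / tau)
+    w /= w.sum()
+    probs = np.zeros(n_classes)
+    for weight, label in zip(w, support_labels):
+        probs[label] += weight
+    return probs
+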
+
+ comment: Accepted at ICCV 2023 +
+
+
+
+
+ + ♻ ☆ InVAErt networks: a data-driven framework for model synthesis and + identifiability analysis + + +
+ Use of generative models and deep learning for physics-based systems is
+currently dominated by the task of emulation. However, the remarkable
+flexibility offered by data-driven architectures suggests extending this
+representation to other aspects of system synthesis, including model inversion
+and identifiability. We introduce inVAErt (pronounced "invert") networks, a
+comprehensive framework for data-driven analysis and synthesis of parametric
+physical systems which uses a deterministic encoder and decoder to represent
+the forward and inverse solution maps, a normalizing flow to capture the
+probabilistic distribution of system outputs, and a variational encoder
+designed to learn a compact latent representation that accounts for the lack of
+bijectivity between inputs and outputs. We formally investigate the selection
+of penalty coefficients in the loss function and strategies for latent space
+sampling, since we find that these significantly affect both training and
+testing performance. We validate our framework through extensive numerical
+examples, including simple linear, nonlinear, and periodic maps, dynamical
+systems, and spatio-temporal PDEs.
+
+
+
+
+
+ + ♻ ☆ Understanding Sinusoidal Neural Networks + + +
+ In this work, we investigate the structure and representation capacity of
+sinusoidal MLPs - multilayer perceptron networks that use sine as the
+activation function. These neural networks (known as neural fields) have become
+fundamental in representing common signals in computer graphics, such as
+images, signed distance functions, and radiance fields. This success can be
+primarily attributed to two key properties of sinusoidal MLPs: smoothness and
+compactness. These functions are smooth because they arise from the composition
+of affine maps with the sine function. This work provides theoretical results
+to justify the compactness property of sinusoidal MLPs and provides control
+mechanisms in the definition and training of these networks.
+ We propose to study a sinusoidal MLP by expanding it as a harmonic sum.
+First, we observe that its first layer can be seen as a harmonic dictionary,
+which we call the input sinusoidal neurons. Then, a hidden layer combines this
+dictionary using an affine map and modulates the outputs using the sine; this
+results in a special dictionary of sinusoidal neurons. We prove that each of
+these sinusoidal neurons expands as a harmonic sum producing a large number of
+new frequencies expressed as integer linear combinations of the input
+frequencies. Thus, each hidden neuron produces the same frequencies, and the
+corresponding amplitudes are completely determined by the hidden affine map. We
+also provide an upper bound and a way of sorting these amplitudes that can
+control the resulting approximation, allowing us to truncate the corresponding
+series. Finally, we present applications for training and initialization of
+sinusoidal MLPs. Additionally, we show that if the input neurons are periodic,
+then the entire network will be periodic with the same period. We relate these
+periodic networks to the Fourier series representation.
+
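+ As a concrete reference point, a sinusoidal MLP of the kind analysed above
+can be written as a composition of affine maps and the sine. The w0 frequency
+scaling below follows the common SIREN convention and is an assumption, not
+necessarily the parameterization used in the paper.
+
+import numpy as np
+
+def sinusoidal_mlp(x, weights, biases, w0=30.0):
+    """Forward pass: each hidden layer is sin(w0 * (W h + b)); the first layer
+    acts as the harmonic dictionary of input sinusoidal neurons."""
+    h = x
+    for W, b in zip(weights[:-1], biases[:-1]):
+        h = np.sin(w0 * (W @ h + b))
+    W, b = weights[-1], biases[-1]
+    return W @ h + b                      # linear output layer
+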
+
+
+
+
+ + ♻ ☆ Preventing Verbatim Memorization in Language Models Gives a False Sense + of Privacy + + +
+ Studying data memorization in neural language models helps us understand the +risks (e.g., to privacy or copyright) associated with models regurgitating +training data and aids in the development of countermeasures. Many prior works +-- and some recently deployed defenses -- focus on "verbatim memorization", +defined as a model generation that exactly matches a substring from the +training set. We argue that verbatim memorization definitions are too +restrictive and fail to capture more subtle forms of memorization. +Specifically, we design and implement an efficient defense that perfectly +prevents all verbatim memorization. And yet, we demonstrate that this "perfect" +filter does not prevent the leakage of training data. Indeed, it is easily +circumvented by plausible and minimally modified "style-transfer" prompts -- +and in some cases even the non-modified original prompts -- to extract +memorized information. We conclude by discussing potential alternative +definitions and why defining memorization is a difficult yet crucial open +question for neural language models. + +
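+ To make the notion of a verbatim-memorization filter concrete, here is a toy
+word-level version: block any generation whose last n tokens reproduce an
+n-gram from the training set. The point of the abstract is precisely that even
+a perfect filter of this kind is easily circumvented; the window size and
+whitespace tokenization below are illustrative assumptions.
+
+def build_ngram_index(training_texts, n=10):
+    """Collect every length-n token window that occurs in the training data."""
+    seen = set()
+    for text in training_texts:
+        toks = text.split()
+        for i in range(len(toks) - n + 1):
+            seen.add(tuple(toks[i:i + n]))
+    return seen
+
+def ends_with_verbatim(generated_tokens, ngram_index, n=10):
+    """True if the generation currently ends with a memorized n-gram."""
+    return len(generated_tokens) >= n and tuple(generated_tokens[-n:]) in ngram_index
+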
+
+
+
+
+ + ♻ ☆ Robust Feature-Level Adversaries are Interpretability Tools NeurIPS 2022 + + +
+ The literature on adversarial attacks in computer vision typically focuses on +pixel-level perturbations. These tend to be very difficult to interpret. Recent +work that manipulates the latent representations of image generators to create +"feature-level" adversarial perturbations gives us an opportunity to explore +perceptible, interpretable adversarial attacks. We make three contributions. +First, we observe that feature-level attacks provide useful classes of inputs +for studying representations in models. Second, we show that these adversaries +are uniquely versatile and highly robust. We demonstrate that they can be used +to produce targeted, universal, disguised, physically-realizable, and black-box +attacks at the ImageNet scale. Third, we show how these adversarial images can +be used as a practical interpretability tool for identifying bugs in networks. +We use these adversaries to make predictions about spurious associations +between features and classes which we then test by designing "copy/paste" +attacks in which one natural image is pasted into another to cause a targeted +misclassification. Our results suggest that feature-level attacks are a +promising approach for rigorous interpretability research. They support the +design of tools to better understand what a model has learned and diagnose +brittle feature associations. Code is available at +https://github.com/thestephencasper/feature_level_adv + +
+
+ comment: NeurIPS 2022, code available at + https://github.com/thestephencasper/feature_level_adv +
+
+
+
+
+ + ♻ ☆ Weisfeiler and Lehman Go Measurement Modeling: Probing the Validity of + the WL Test + + +
+ The expressive power of graph neural networks is usually measured by +comparing how many pairs of graphs or nodes an architecture can possibly +distinguish as non-isomorphic to those distinguishable by the $k$-dimensional +Weisfeiler-Lehman ($k$-WL) test. In this paper, we uncover misalignments +between graph machine learning practitioners' conceptualizations of expressive +power and $k$-WL through a systematic analysis of the reliability and validity +of $k$-WL. We conduct a survey ($n = 18$) of practitioners to surface their +conceptualizations of expressive power and their assumptions about $k$-WL. In +contrast to practitioners' opinions, our analysis (which draws from graph +theory and benchmark auditing) reveals that $k$-WL does not guarantee isometry, +can be irrelevant to real-world graph tasks, and may not promote generalization +or trustworthiness. We argue for extensional definitions and measurement of +expressive power based on benchmarks. We further contribute guiding questions +for constructing such benchmarks, which is critical for graph machine learning +practitioners to develop and transparently communicate our understandings of +expressive power. + +
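+ For readers less familiar with the test being audited, the 1-WL (colour
+refinement) procedure that anchors this hierarchy can be sketched directly;
+the adjacency-dict representation and iteration count here are illustrative.
+
+from collections import Counter
+
+def wl_colors(adj, num_iters=3):
+    """1-WL colour refinement: repeatedly relabel each node by its own colour
+    together with the multiset of its neighbours' colours. Differing colour
+    multisets certify non-isomorphism; equal multisets are inconclusive."""
+    colors = {v: 0 for v in adj}
+    for _ in range(num_iters):
+        signatures = {
+            v: (colors[v], tuple(sorted(colors[u] for u in adj[v])))
+            for v in adj
+        }
+        palette = {sig: i for i, sig in enumerate(sorted(set(signatures.values())))}
+        colors = {v: palette[signatures[v]] for v in adj}
+    return Counter(colors.values())
+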
+
+
+
+
+ + ♻ ☆ Explainable AI by BAPC -- Before and After correction Parameter + Comparison + + +
+ A local surrogate for an AI model that corrects a simpler 'base' model is
+introduced, providing an analytical method for explaining AI predictions. The
+approach is studied here in the context of the base model being linear
+regression. The AI model approximates the residual error of the linear model,
+and the explanations are formulated in terms of changes in the interpretable
+base model's parameters. Criteria are formulated for the precise relation
+between the lost accuracy of the surrogate, the accuracy of the AI model, and
+the surrogate fidelity. It is shown that, assuming a certain maximal amount
+of noise in the observed data, these criteria induce neighborhoods of the
+instances to be explained which have an ideal size in terms of maximal accuracy
+and fidelity.
+
+
+
+
+
+ + ♻ ☆ SciRE-Solver: Accelerating Diffusion Models Sampling by Score-integrand + Solver with Recursive Difference + + +
+ Diffusion models (DMs) have made significant progress in the fields of image,
+audio, and video generation. One downside of DMs is their slow iterative
+process. Recent algorithms for fast sampling are designed from the perspective
+of differential equations. However, in higher-order algorithms based on Taylor
+expansion, estimating the derivative of the score function becomes intractable
+due to the complexity of large-scale, well-trained neural networks. Driven by
+this motivation, in this work, we introduce the recursive difference (RD)
+method to calculate the derivative of the score function in the realm of DMs.
+Based on the RD method and the truncated Taylor expansion of the
+score-integrand, we propose SciRE-Solver with a convergence order guarantee for
+accelerating sampling of DMs. To further investigate the effectiveness of the
+RD method, we also propose a variant named SciREI-Solver based on the RD method
+and an exponential integrator. Our proposed sampling algorithms with the RD
+method attain state-of-the-art (SOTA) FIDs in comparison to existing
+training-free sampling algorithms, across both discrete-time and
+continuous-time pre-trained DMs, under various numbers of score function
+evaluations (NFE). Remarkably, SciRE-Solver using a small number of NFEs
+demonstrates promising potential to surpass the FID achieved by some
+pre-trained models in their original papers using no fewer than $1000$ NFEs.
+For example, we reach an SOTA value of $2.40$ FID with $100$ NFE for a
+continuous-time DM and of $3.15$ FID with $84$ NFE for a discrete-time DM on
+CIFAR-10, as well as of $2.17$ (2.02) FID with $18$ (50) NFE for a
+discrete-time DM on CelebA 64$\times$64.
+
+
+
+
+
+ + ♻ ☆ Model Based Residual Policy Learning with Applications to Antenna + Control + + +
+ Non-differentiable controllers and rule-based policies are widely used for
+controlling real systems such as telecommunication networks and robots.
+Specifically, parameters of mobile network base station antennas can be
+dynamically configured by these policies to improve user coverage and quality
+of service. Motivated by the antenna tilt control problem, we introduce
+Model-Based Residual Policy Learning (MBRPL), a practical reinforcement
+learning (RL) method. MBRPL enhances existing policies through a model-based
+approach, leading to improved sample efficiency and a decreased number of
+interactions with the actual environment when compared to off-the-shelf RL
+methods. To the best of our knowledge, this is the first paper that examines a
+model-based approach for antenna control. Experimental results reveal that our
+method delivers strong initial performance while improving sample efficiency
+over previous RL methods, which is one step towards deploying these algorithms
+in real networks.
+
+
+
+
+
+ + ♻ ☆ Physics-Informed Neural Networks for Prognostics and Health Management + of Lithium-Ion Batteries + + +
+ For Prognostics and Health Management (PHM) of Lithium-ion (Li-ion)
+batteries, many models have been established to characterize their degradation
+process. The existing empirical or physical models can reveal important
+information regarding the degradation dynamics. However, there are no general
+and flexible methods to fuse the information represented by those models.
+Physics-Informed Neural Network (PINN) is an efficient tool to fuse empirical
+or physical dynamic models with data-driven models. To take full advantage of
+various information sources, we propose a model fusion scheme based on PINN. It
+is implemented by developing a semi-empirical semi-physical Partial
+Differential Equation (PDE) to model the degradation dynamics of Li-ion
+batteries. When there is little prior knowledge about the dynamics, we leverage
+the data-driven Deep Hidden Physics Model (DeepHPM) to discover the underlying
+governing dynamic models. The uncovered dynamics information is then fused with
+that mined by the surrogate neural network in the PINN framework. Moreover, an
+uncertainty-based adaptive weighting method is employed to balance the multiple
+learning tasks when training the PINN. The proposed methods are verified on a
+public dataset of Lithium Iron Phosphate (LFP)/graphite batteries.
+
+
+ comment: 14 pages, 10 figures +
+
+
+
+
+ + ♻ ☆ Several fitness functions and entanglement gates in quantum kernel + generation + + +
+ Quantum machine learning (QML) represents a promising frontier in the realm
+of quantum technologies. In this pursuit of quantum advantage, the quantum
+kernel method for support vector machines has emerged as a powerful approach.
+Entanglement, a fundamental concept in quantum mechanics, assumes a central
+role in quantum computing. In this paper, we study the necessity of
+entanglement gates in quantum kernel methods. We present several fitness
+functions for a multi-objective genetic algorithm that simultaneously maximizes
+classification accuracy while minimizing both the local and non-local gate
+costs of the quantum feature map's circuit. We conduct comparisons with
+classical classifiers to gain insights into the benefits of employing
+entanglement gates. Surprisingly, our experiments reveal that the optimal
+configuration of quantum circuits for the quantum kernel method incorporates a
+proportional number of non-local gates for entanglement, contrary to previous
+literature where non-local gates were largely suppressed.
+ Furthermore, we demonstrate that the separability indexes of data can be
+effectively leveraged to determine the number of non-local gates required for
+the quantum support vector machine's feature maps. This insight can
+significantly aid in selecting appropriate parameters, such as the entanglement
+parameter, in various quantum programming packages like https://qiskit.org/
+based on data analysis. Our findings offer valuable guidance for enhancing the
+efficiency and accuracy of quantum machine learning algorithms.
+
+
+
+
+
+ + ♻ ☆ Adaptive Top-K in SGD for Communication-Efficient Distributed Learning + + +
+ Distributed stochastic gradient descent (SGD) with gradient compression has +become a popular communication-efficient solution for accelerating distributed +learning. One commonly used method for gradient compression is Top-K +sparsification, which sparsifies the gradients by a fixed degree during model +training. However, there has been a lack of an adaptive approach to adjust the +sparsification degree to maximize the potential of the model's performance or +training speed. This paper proposes a novel adaptive Top-K in SGD framework +that enables an adaptive degree of sparsification for each gradient descent +step to optimize the convergence performance by balancing the trade-off between +communication cost and convergence error. Firstly, an upper bound of +convergence error is derived for the adaptive sparsification scheme and the +loss function. Secondly, an algorithm is designed to minimize the convergence +error under the communication cost constraints. Finally, numerical results on +the MNIST and CIFAR-10 datasets demonstrate that the proposed adaptive Top-K +algorithm in SGD achieves a significantly better convergence rate compared to +state-of-the-art methods, even after considering error compensation. + +
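+ The fixed-degree baseline that the paper adapts is simple to state in code:
+keep only the k largest-magnitude gradient coordinates before communication.
+The schedule shown for choosing k is an illustrative heuristic only; the paper
+instead derives k from a convergence-error bound under a communication budget.
+
+import numpy as np
+
+def topk_sparsify(grad, k):
+    """Keep the k largest-magnitude entries of the gradient, zero the rest."""
+    flat = grad.ravel()
+    idx = np.argpartition(np.abs(flat), -k)[-k:]
+    sparse = np.zeros_like(flat)
+    sparse[idx] = flat[idx]
+    return sparse.reshape(grad.shape)
+
+def adaptive_k(step, total_steps, k_min, k_max):
+    """Toy adaptive schedule: communicate more coordinates early in training,
+    fewer later, trading convergence error against communication cost."""
+    frac = step / max(1, total_steps)
+    return int(k_max - frac * (k_max - k_min))
+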
+
+ comment: 6 pages, 10 figures, has been accepted by GlobeCom 2023 +
+
+
+
+
+ + ♻ ☆ Leveraging Reviews: Learning to Price with Buyer and Seller Uncertainty + + +
+ In online marketplaces, customers have access to hundreds of reviews for a +single product. Buyers often use reviews from other customers that share their +type -- such as height for clothing, skin type for skincare products, and +location for outdoor furniture -- to estimate their values, which they may not +know a priori. Customers with few relevant reviews may hesitate to make a +purchase except at a low price, so for the seller, there is a tension between +setting high prices and ensuring that there are enough reviews so that buyers +can confidently estimate their values. Simultaneously, sellers may use reviews +to gauge the demand for items they wish to sell. + In this work, we study this pricing problem in an online setting where the +seller interacts with a set of buyers of finitely many types, one by one, over +a series of $T$ rounds. At each round, the seller first sets a price. Then a +buyer arrives and examines the reviews of the previous buyers with the same +type, which reveal those buyers' ex-post values. Based on the reviews, the +buyer decides to purchase if they have good reason to believe that their +ex-ante utility is positive. Crucially, the seller does not know the buyer's +type when setting the price, nor even the distribution over types. We provide a +no-regret algorithm that the seller can use to obtain high revenue. When there +are $d$ types, after $T$ rounds, our algorithm achieves a problem-independent +$\tilde O(T^{2/3}d^{1/3})$ regret bound. However, when the smallest probability +$q_{\text{min}}$ that any given type appears is large, specifically when +$q_{\text{min}} \in \Omega(d^{-2/3}T^{-1/3})$, then the same algorithm achieves +a $\tilde O(T^{1/2}q_{\text{min}}^{-1/2})$ regret bound. We complement these +upper bounds with matching lower bounds in both regimes, showing that our +algorithm is minimax optimal up to lower-order terms. + +
+
+
+
+
+ + ♻ ☆ An Algorithm with Optimal Dimension-Dependence for Zero-Order Nonsmooth + Nonconvex Stochastic Optimization + + +
+ We study the complexity of producing $(\delta,\epsilon)$-stationary points of +Lipschitz objectives which are possibly neither smooth nor convex, using only +noisy function evaluations. Recent works proposed several stochastic zero-order +algorithms that solve this task, all of which suffer from a +dimension-dependence of $\Omega(d^{3/2})$ where $d$ is the dimension of the +problem, which was conjectured to be optimal. We refute this conjecture by +providing a faster algorithm that has complexity +$O(d\delta^{-1}\epsilon^{-3})$, which is optimal (up to numerical constants) +with respect to $d$ and also optimal with respect to the accuracy parameters +$\delta,\epsilon$, thus solving an open question due to Lin et al. +(NeurIPS'22). Moreover, the convergence rate achieved by our algorithm is also +optimal for smooth objectives, proving that in the nonconvex stochastic +zero-order setting, nonsmooth optimization is as easy as smooth optimization. +We provide algorithms that achieve the aforementioned convergence rate in +expectation as well as with high probability. Our analysis is based on a simple +yet powerful geometric lemma regarding the Goldstein-subdifferential set, which +allows utilizing recent advancements in first-order nonsmooth nonconvex +optimization. + +
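+ The basic building block in this setting is a gradient estimate built purely
+from noisy function values. A standard two-point randomized estimator is
+sketched below; the full algorithm additionally averages over a delta-ball
+(randomized smoothing) to target Goldstein stationarity, which is omitted here.
+
+import numpy as np
+
+def zo_gradient(f, x, num_samples=20, mu=1e-3, rng=None):
+    """Two-point zero-order gradient estimate: average over random unit
+    directions u of d * (f(x + mu*u) - f(x - mu*u)) / (2*mu) * u."""
+    rng = rng or np.random.default_rng()
+    d = x.size
+    g = np.zeros(d)
+    for _ in range(num_samples):
+        u = rng.standard_normal(d)
+        u /= np.linalg.norm(u)
+        g += d * (f(x + mu * u) - f(x - mu * u)) / (2.0 * mu) * u
+    return g / num_samples
+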
+
+ comment: Fixed hyperparameter assignments in main theorems (results + unaffected); some minor edits +
+
+
+
+
+ + ♻ ☆ Efficient Defense Against Model Stealing Attacks on Convolutional Neural + Networks ICML + + +
+ Model stealing attacks have become a serious concern for deep learning
+models, where an attacker can steal a trained model by querying its black-box
+API. This can lead to intellectual property theft and other security and
+privacy risks. The current state-of-the-art defenses against model stealing
+attacks suggest adding perturbations to the prediction probabilities. However,
+they suffer from heavy computation and make impractical assumptions about
+the adversary. They often require the training of auxiliary models. This can be
+time-consuming and resource-intensive, which hinders the deployment of these
+defenses in real-world applications. In this paper, we propose a simple yet
+effective and efficient defense alternative. We introduce a heuristic approach
+to perturb the output probabilities. The proposed defense can be easily
+integrated into models without additional training. We show that our defense is
+effective in defending against three state-of-the-art stealing attacks. We
+evaluate our approach on large and quantized (i.e., compressed) Convolutional
+Neural Networks (CNNs) trained on several vision datasets. Our technique
+outperforms the state-of-the-art defenses, offering 37$\times$ faster
+inference, without requiring any additional model and with a low impact on the
+model's performance. We validate that our defense is also effective for
+quantized CNNs targeting edge devices.
+
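+ The abstract does not spell out the heuristic, so the snippet below only
+illustrates the general family of defenses it belongs to: perturb the returned
+probability vector without retraining, while keeping the top-1 prediction
+intact for benign users. The noise model and magnitude are assumptions.
+
+import numpy as np
+
+def perturb_probs(probs, epsilon=0.2, rng=None):
+    """Add noise to the output probabilities, preserve the argmax, renormalize."""
+    rng = rng or np.random.default_rng()
+    noisy = probs + epsilon * rng.random(probs.shape)
+    noisy[np.argmax(probs)] = noisy.max() + 1e-6   # keep the predicted class on top
+    return noisy / noisy.sum()
+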
+
+ comment: Accepted for publication at 2023 International Conference on Machine + Learning and Applications (ICMLA). Proceedings of ICMLA, Florida, USA + \c{opyright}2023 IEEE +
+
+
+
+
+ + ♻ ☆ On Reducing Undesirable Behavior in Deep Reinforcement Learning Models + + +
+ Deep reinforcement learning (DRL) has proven extremely useful in a large +variety of application domains. However, even successful DRL-based software can +exhibit highly undesirable behavior. This is due to DRL training being based on +maximizing a reward function, which typically captures general trends but +cannot precisely capture, or rule out, certain behaviors of the system. In this +paper, we propose a novel framework aimed at drastically reducing the +undesirable behavior of DRL-based software, while maintaining its excellent +performance. In addition, our framework can assist in providing engineers with +a comprehensible characterization of such undesirable behavior. Under the hood, +our approach is based on extracting decision tree classifiers from erroneous +state-action pairs, and then integrating these trees into the DRL training +loop, penalizing the system whenever it performs an error. We provide a +proof-of-concept implementation of our approach, and use it to evaluate the +technique on three significant case studies. We find that our approach can +extend existing frameworks in a straightforward manner, and incurs only a +slight overhead in training time. Further, it incurs only a very slight hit to +performance, or even in some cases - improves it, while significantly reducing +the frequency of undesirable behavior. + +
+
+
+
+
+ + ♻ ☆ Are Deep Neural Networks SMARTer than Second Graders? CVPR 2023 + + +
+ Recent times have witnessed an increasing number of applications of deep +neural networks towards solving tasks that require superior cognitive +abilities, e.g., playing Go, generating art, ChatGPT, etc. Such a dramatic +progress raises the question: how generalizable are neural networks in solving +problems that demand broad skills? To answer this question, we propose SMART: a +Simple Multimodal Algorithmic Reasoning Task and the associated SMART-101 +dataset, for evaluating the abstraction, deduction, and generalization +abilities of neural networks in solving visuo-linguistic puzzles designed +specifically for children in the 6--8 age group. Our dataset consists of 101 +unique puzzles; each puzzle comprises a picture and a question, and their +solution needs a mix of several elementary skills, including arithmetic, +algebra, and spatial reasoning, among others. To scale our dataset towards +training deep neural networks, we programmatically generate entirely new +instances for each puzzle, while retaining their solution algorithm. To +benchmark performances on SMART-101, we propose a vision and language +meta-learning model using varied state-of-the-art backbones. Our experiments +reveal that while powerful deep models offer reasonable performances on puzzles +in a supervised setting, they are not better than random accuracy when analyzed +for generalization. We also evaluate the recent ChatGPT and other large +language models on a subset of SMART-101 and find that while these models show +convincing reasoning abilities, the answers are often incorrect. + +
+
+ comment: Extended version of CVPR 2023 paper. For the SMART-101 dataset, see + http://smartdataset.github.io/smart101 +
+
+
+
+
+ + ♻ ☆ Quantum Ridgelet Transform: Winning Lottery Ticket of Neural Networks + with Quantum Computation + + +
+ A significant challenge in the field of quantum machine learning (QML) is to +establish applications of quantum computation to accelerate common tasks in +machine learning such as those for neural networks. Ridgelet transform has been +a fundamental mathematical tool in the theoretical studies of neural networks, +but the practical applicability of ridgelet transform to conducting learning +tasks was limited since its numerical implementation by conventional classical +computation requires an exponential runtime $\exp(O(D))$ as data dimension $D$ +increases. To address this problem, we develop a quantum ridgelet transform +(QRT), which implements the ridgelet transform of a quantum state within a +linear runtime $O(D)$ of quantum computation. As an application, we also show +that one can use QRT as a fundamental subroutine for QML to efficiently find a +sparse trainable subnetwork of large shallow wide neural networks without +conducting large-scale optimization of the original network. This application +discovers an efficient way in this regime to demonstrate the lottery ticket +hypothesis on finding such a sparse trainable neural network. These results +open an avenue of QML for accelerating learning tasks with commonly used +classical neural networks. + +
+
+ comment: 27 pages, 4 figures +
+
+
+
+
+ + ♻ ☆ No Train Still Gain. Unleash Mathematical Reasoning of Large Language + Models with Monte Carlo Tree Search Guided by Energy Function + + +
+ Large language models (LLMs) demonstrate impressive language understanding +and contextual learning abilities, making them suitable for natural language +processing (NLP) tasks and complex mathematical reasoning. However, when +applied to mathematical reasoning tasks, LLMs often struggle to generate +correct reasoning steps and answers despite having high probabilities for the +solutions. To overcome this limitation and enhance the mathematical reasoning +capabilities of fine-tuned LLMs without additional fine-tuning steps, we +propose a method that incorporates Monte Carlo Tree Search (MCTS) and a +lightweight energy function to rank decision steps and enable immediate +reaction and precise reasoning. Specifically, we re-formulate the fine-tuned +LLMs into a Residual-based Energy Model (Residual-EBM) and employ noise +contrastive estimation to estimate the energy function's parameters. We then +utilize MCTS with the energy function as a path verifier to search the output +space and evaluate the reasoning path. Through extensive experiments on two +mathematical reasoning benchmarks, GSM8k and AQUA-RAT, we demonstrate the +exceptional capabilities of our method, which significantly improves the pass@1 +metric of the fine-tuned model without requiring additional fine-tuning or +reinforcement learning with human feedback alignment. + +
+
+ comment: still in progress +
+
+
+
+
+ + ♻ ☆ Graph Neural Network Interatomic Potential Ensembles with Calibrated + Aleatoric and Epistemic Uncertainty on Energy and Forces + + +
+ Inexpensive machine learning potentials are increasingly being used to speed +up structural optimization and molecular dynamics simulations of materials by +iteratively predicting and applying interatomic forces. In these settings, it +is crucial to detect when predictions are unreliable to avoid wrong or +misleading results. Here, we present a complete framework for training and +recalibrating graph neural network ensemble models to produce accurate +predictions of energy and forces with calibrated uncertainty estimates. The +proposed method considers both epistemic and aleatoric uncertainty and the +total uncertainties are recalibrated post hoc using a nonlinear scaling +function to achieve good calibration on previously unseen data, without loss of +predictive accuracy. The method is demonstrated and evaluated on two +challenging, publicly available datasets, ANI-1x (Smith et al.) and +Transition1x (Schreiner et al.), both containing diverse conformations far from +equilibrium. A detailed analysis of the predictive performance and uncertainty +calibration is provided. In all experiments, the proposed method achieved low +prediction error and good uncertainty calibration, with predicted uncertainty +correlating with expected error, on energy and forces. To the best of our +knowledge, the method presented in this paper is the first to consider a +complete framework for obtaining calibrated epistemic and aleatoric uncertainty +predictions on both energy and forces in ML potentials. + +
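+ The decomposition at the heart of such ensembles is short enough to state
+directly; the power-law recalibration below is one plausible form of the
+nonlinear scaling function and its parameters would be fitted on held-out
+data, so treat both as illustrative rather than the paper's exact choice.
+
+import numpy as np
+
+def ensemble_uncertainty(member_means, member_vars):
+    """member_means / member_vars: (M, ...) per-member predicted mean and
+    (aleatoric) variance for energies or forces."""
+    aleatoric = member_vars.mean(axis=0)     # average predicted data noise
+    epistemic = member_means.var(axis=0)     # disagreement between members
+    return aleatoric + epistemic, aleatoric, epistemic
+
+def recalibrate(total_var, a, b):
+    """Post-hoc nonlinear scaling, e.g. sigma_cal^2 = a * (sigma^2)**b."""
+    return a * total_var ** b
+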
+
+
+
+
+ + ♻ ☆ Efficient ECG-based Atrial Fibrillation Detection via Parameterised + Hypercomplex Neural Networks + + +
+ Atrial fibrillation (AF) is the most common cardiac arrhythmia and associated +with a high risk for serious conditions like stroke. The use of wearable +devices embedded with automatic and timely AF assessment from +electrocardiograms (ECGs) has shown to be promising in preventing +life-threatening situations. Although deep neural networks have demonstrated +superiority in model performance, their use on wearable devices is limited by +the trade-off between model performance and complexity. In this work, we +propose to use lightweight convolutional neural networks (CNNs) with +parameterised hypercomplex (PH) layers for AF detection based on ECGs. The +proposed approach trains small-scale CNNs, thus overcoming the limited +computing resources on wearable devices. We show comparable performance to +corresponding real-valued CNNs on two publicly available ECG datasets using +significantly fewer model parameters. PH models are more flexible than other +hypercomplex neural networks and can operate on any number of input ECG leads. + +
+
+ comment: Published at EUSIPCO 2023 +
+
+
+
+
+ + ♻ ☆ TSMixer: An All-MLP Architecture for Time Series Forecasting + + +
+ Real-world time-series datasets are often multivariate with complex dynamics. +To capture this complexity, high capacity architectures like recurrent- or +attention-based sequential deep learning models have become popular. However, +recent work demonstrates that simple univariate linear models can outperform +such deep learning models on several commonly used academic benchmarks. +Extending them, in this paper, we investigate the capabilities of linear models +for time-series forecasting and present Time-Series Mixer (TSMixer), a novel +architecture designed by stacking multi-layer perceptrons (MLPs). TSMixer is +based on mixing operations along both the time and feature dimensions to +extract information efficiently. On popular academic benchmarks, the +simple-to-implement TSMixer is comparable to specialized state-of-the-art +models that leverage the inductive biases of specific benchmarks. On the +challenging and large scale M5 benchmark, a real-world retail dataset, TSMixer +demonstrates superior performance compared to the state-of-the-art +alternatives. Our results underline the importance of efficiently utilizing +cross-variate and auxiliary information for improving the performance of time +series forecasting. We present various analyses to shed light into the +capabilities of TSMixer. The design paradigms utilized in TSMixer are expected +to open new horizons for deep learning-based time series forecasting. The +implementation is available at +https://github.com/google-research/google-research/tree/master/tsmixer + +
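+ A stripped-down mixing block conveys the core idea: alternate an MLP applied
+across the time axis with an MLP applied across the feature axis, each wrapped
+in a residual connection. Normalization, dropout, and the two-layer MLPs of
+the actual architecture are omitted here for brevity.
+
+import numpy as np
+
+def mixer_block(x, w_time, w_feat):
+    """x: (time, features); w_time: (time, time); w_feat: (features, features)."""
+    relu = lambda z: np.maximum(z, 0.0)
+    x = x + relu(w_time @ x)   # time mixing, shared across features
+    x = x + relu(x @ w_feat)   # feature mixing, shared across time steps
+    return x
+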
+
+
+
+
+ + ♻ ☆ Document Understanding Dataset and Evaluation (DUDE) ICCV 2023 + + +
+ We call on the Document AI (DocAI) community to reevaluate current +methodologies and embrace the challenge of creating more practically-oriented +benchmarks. Document Understanding Dataset and Evaluation (DUDE) seeks to +remediate the halted research progress in understanding visually-rich documents +(VRDs). We present a new dataset with novelties related to types of questions, +answers, and document layouts based on multi-industry, multi-domain, and +multi-page VRDs of various origins, and dates. Moreover, we are pushing the +boundaries of current methods by creating multi-task and multi-domain +evaluation setups that more accurately simulate real-world situations where +powerful generalization and adaptation under low-resource settings are desired. +DUDE aims to set a new standard as a more practical, long-standing benchmark +for the community, and we hope that it will lead to future extensions and +contributions that address real-world challenges. Finally, our work illustrates +the importance of finding more efficient ways to model language, images, and +layout in DocAI. + +
+
+ comment: Accepted at ICCV 2023 +
+
+
+
+
+ + ♻ ☆ Adaptive Tracking of a Single-Rigid-Body Character in Various + Environments + + +
+ Since the introduction of DeepMimic [Peng et al. 2018], subsequent research +has focused on expanding the repertoire of simulated motions across various +scenarios. In this study, we propose an alternative approach for this goal, a +deep reinforcement learning method based on the simulation of a +single-rigid-body character. Using the centroidal dynamics model (CDM) to +express the full-body character as a single rigid body (SRB) and training a +policy to track a reference motion, we can obtain a policy that is capable of +adapting to various unobserved environmental changes and controller transitions +without requiring any additional learning. Due to the reduced dimension of +state and action space, the learning process is sample-efficient. The final +full-body motion is kinematically generated in a physically plausible way, +based on the state of the simulated SRB character. The SRB simulation is +formulated as a quadratic programming (QP) problem, and the policy outputs an +action that allows the SRB character to follow the reference motion. We +demonstrate that our policy, efficiently trained within 30 minutes on an +ultraportable laptop, has the ability to cope with environments that have not +been experienced during learning, such as running on uneven terrain or pushing +a box, and transitions between learned policies, without any additional +learning. + +
+
+
+
+
+ + ♻ ☆ MMD-Regularized Unbalanced Optimal Transport + + +
+ We study the unbalanced optimal transport (UOT) problem, where the marginal +constraints are enforced using Maximum Mean Discrepancy (MMD) regularization. +Our work is motivated by the observation that the literature on UOT is focused +on regularization based on $\phi$-divergence (e.g., KL divergence). Despite the +popularity of MMD, its role as a regularizer in the context of UOT seems less +understood. We begin by deriving a specific dual of MMD-regularized UOT +(MMD-UOT), which helps us prove several useful properties. One interesting +outcome of this duality result is that MMD-UOT induces novel metrics, which not +only lift the ground metric like the Wasserstein but are also sample-wise +efficient to estimate like the MMD. Further, for real-world applications +involving non-discrete measures, we present an estimator for the transport plan +that is supported only on the given ($m$) samples. Under mild conditions, we +prove that the estimation error with this finitely-supported transport plan is +also $\mathcal{O}(1/\sqrt{m})$. As far as we know, such error bounds that are +free from the curse of dimensionality are not known for $\phi$-divergence +regularized UOT. Finally, we discuss how the proposed estimator can be computed +efficiently using accelerated gradient descent. Our experiments show that +MMD-UOT consistently outperforms popular baselines, including KL-regularized +UOT and MMD, in diverse machine learning applications. + +
+
+
+
+
+ + ♻ ☆ Analysing high resolution digital Mars images using machine learning + + +
+ The search for ephemeral liquid water on Mars is an ongoing activity. After
+the recession of the seasonal polar ice cap on Mars, small water ice patches
+may be left behind in shady places due to the low thermal conductivity of the
+Martian surface and atmosphere. During late spring and early summer, these
+patches may be exposed to direct sunlight and warm up rapidly enough for the
+liquid phase to emerge. To see the spatial and temporal occurrence of such ice
+patches, optical images should be searched for and checked. Previously, a
+manual image analysis was conducted on 110 images from the southern hemisphere,
+captured by the High Resolution Imaging Science Experiment (HiRISE) camera
+onboard the Mars Reconnaissance Orbiter space mission. Out of these, 37 images
+were identified with smaller ice patches, which were distinguishable by their
+brightness, colour and strong connection to local topographic shading. In this
+study, a convolutional neural network (CNN) is applied to find further images
+with potential water ice patches in the latitude band between -40{\deg} and
+-60{\deg}, where the seasonal retreat of the polar ice cap happens. Previously
+analysed HiRISE images were used to train the model, where each image was split
+into hundreds of pieces (chunks), expanding the training dataset to 6240
+images. A test run conducted on 38 new HiRISE images indicates that the program
+can generally recognise small bright patches; however, further training might
+be needed for more precise identification. This further training has now been
+conducted, incorporating the results of the previous test run. To retrain
+the model, 18646 chunks were analysed and 48 additional epochs were run. In the
+end, the model achieved 94% accuracy in recognising ice; 58% of these images
+showed sufficiently small ice patches. The rest of the images were covered by
+too much ice or showed CO2 ice sublimation in some places.
+
+
+
+
+
+ + ♻ ☆ MLLM-DataEngine: An Iterative Refinement Approach for MLLM + + +
+ Despite the great advances of Multimodal Large Language Models (MLLMs) in both
+instruction dataset building and benchmarking, the independence of training and
+evaluation makes it hard for current MLLMs to further improve their capability
+under the guidance of evaluation results at a relatively low human cost. In
+this paper, we propose MLLM-DataEngine, a novel closed-loop system that bridges
+data generation, model training, and evaluation. Within each loop iteration,
+the MLLM-DataEngine first analyzes the weaknesses of the model based on the
+evaluation results, then generates a proper incremental dataset for the next
+training iteration, enhancing the model's capability iteratively. Compared with
+previous data collection methods, which are separate from the benchmarking, the
+data generated by MLLM-DataEngine shows better targeting, quality, and
+correctness. For targeting, we propose an Adaptive Bad-case Sampling module,
+which adjusts the ratio of different types of data within each incremental
+dataset based on the benchmarking results. For quality, we resort to GPT-4 to
+generate high-quality data for each given data type. For correctness, prompt
+design is critical for the data generation results. Rather than relying on
+hand-crafted prompts as in previous work, we propose an Interactive Prompt
+Optimization strategy, which optimizes the prompt through multi-round
+interaction between a human and GPT, greatly improving the correctness of the
+generated data. Through extensive experiments, we find that MLLM-DataEngine can
+boost MLLM capability in a targeted and automatic manner, with only minimal
+human participation. We hope it can serve as a general solution for building
+future MLLMs. The MLLM-DataEngine has been open-sourced and is now available at
+https://github.com/opendatalab/MLLM-DataEngine.
+
+
+ comment: Code and models are available at + https://github.com/opendatalab/MLLM-DataEngine +
+
+
+
+
+ + ♻ ☆ Conditional expectation with regularization for missing data imputation + + +
+ Missing data frequently occurs in datasets across various domains, such as
+medicine, sports, and finance. In many cases, to enable proper and reliable
+analyses of such data, the missing values are often imputed, and it is
+necessary that the method used has a low root mean square error (RMSE) between
+the imputed and the true values. In addition, for some critical applications,
+it is also often a requirement that the imputation method is scalable and the
+logic behind the imputation is explainable, which is especially difficult for
+complex methods that are, for example, based on deep learning. Based on these
+considerations, we propose a new algorithm named "conditional
+Distribution-based Imputation of Missing Values with Regularization" (DIMV).
+DIMV operates by determining the conditional distribution of a feature that has
+missing entries, using the information from the fully observed features as a
+basis. As will be illustrated via experiments in the paper, DIMV (i) gives a
+low RMSE for the imputed values compared to state-of-the-art methods; (ii) is
+fast and scalable; (iii) is explainable, since its coefficients act like those
+of a regression model, allowing reliable and trustworthy analysis and making it
+a suitable choice for critical domains where understanding is important, such
+as medicine and finance; (iv) can provide an approximated confidence region for
+the missing values in a given sample; (v) is suitable for both small and large
+scale data; (vi) in many scenarios, does not require a huge number of
+parameters, unlike deep learning approaches; (vii) handles multicollinearity in
+imputation effectively; and (viii) is robust to the normality assumption that
+its theoretical grounds rely on.
+
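+ The backbone of such a conditional-expectation imputer is the Gaussian
+conditioning formula with a regularized solve; the ridge-style term below
+stands in for the paper's regularization and the joint-Gaussian view is the
+usual simplification, so this is a sketch rather than DIMV itself.
+
+import numpy as np
+
+def impute_conditional_gaussian(x, mu, sigma, missing_idx, alpha=0.1):
+    """Impute x[missing_idx] by its regularized conditional expectation given
+    the observed entries, using mean mu and covariance sigma of the data."""
+    obs_idx = [i for i in range(len(x)) if i not in missing_idx]
+    s_oo = sigma[np.ix_(obs_idx, obs_idx)]
+    s_mo = sigma[np.ix_(missing_idx, obs_idx)]
+    # These coefficients play the role of the explainable regression weights.
+    coef = s_mo @ np.linalg.inv(s_oo + alpha * np.eye(len(obs_idx)))
+    x_imp = x.copy()
+    x_imp[missing_idx] = mu[missing_idx] + coef @ (x[obs_idx] - mu[obs_idx])
+    return x_imp
+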
+
+
+
+
+ + ♻ ☆ Learning to Optimize Quasi-Newton Methods + + +
+ Fast gradient-based optimization algorithms have become increasingly +essential for the computationally efficient training of machine learning +models. One technique is to multiply the gradient by a preconditioner matrix to +produce a step, but it is unclear what the best preconditioner matrix is. This +paper introduces a novel machine learning optimizer called LODO, which tries to +online meta-learn the best preconditioner during optimization. Specifically, +our optimizer merges Learning to Optimize (L2O) techniques with quasi-Newton +methods to learn preconditioners parameterized as neural networks; they are +more flexible than preconditioners in other quasi-Newton methods. Unlike other +L2O methods, LODO does not require any meta-training on a training task +distribution, and instead learns to optimize on the fly while optimizing on the +test task, adapting to the local characteristics of the loss landscape while +traversing it. Theoretically, we show that our optimizer approximates the +inverse Hessian in noisy loss landscapes and is capable of representing a wide +range of inverse Hessians. We experimentally verify that our algorithm can +optimize in noisy settings, and show that simpler alternatives for representing +the inverse Hessians worsen performance. Lastly, we use our optimizer to train +a semi-realistic deep neural network with 95k parameters at speeds comparable +to those of standard neural network optimizers. + +
+
+
+
+
+ + ♻ ☆ Interpreting and Correcting Medical Image Classification with PIP-Net ECAI 2023 + + +
+ Part-prototype models are explainable-by-design image classifiers, and a +promising alternative to black box AI. This paper explores the applicability +and potential of interpretable machine learning, in particular PIP-Net, for +automated diagnosis support on real-world medical imaging data. PIP-Net learns +human-understandable prototypical image parts and we evaluate its accuracy and +interpretability for fracture detection and skin cancer diagnosis. We find that +PIP-Net's decision making process is in line with medical classification +standards, while only provided with image-level class labels. Because of +PIP-Net's unsupervised pretraining of prototypes, data quality problems such as +undesired text in an X-ray or labelling errors can be easily identified. +Additionally, we are the first to show that humans can manually correct the +reasoning of PIP-Net by directly disabling undesired prototypes. We conclude +that part-prototype models are promising for medical applications due to their +interpretability and potential for advanced model debugging. + +
+
+ comment: Accepted to the International Workshop on Explainable and + Interpretable Machine Learning (XI-ML), co-located with ECAI 2023 +
+
+
+
+
+ + ♻ ☆ Multi-scale Wasserstein Shortest-path Filtration Kernels on Graphs + + +
+ The traditional shortest-path graph kernel (SP) is one of the most popular +graph kernels. It decomposes graphs into shortest paths and computes their +frequencies in each graph. However, SP has two main challenges: Firstly, the +triplet representation of the shortest path loses information. Secondly, SP +compares graphs without considering the multiple different scales of the graph +structure which is common in real-world graphs, e.g., the chain-, ring-, and +star-structures in social networks. To overcome these two challenges, we +develop a novel shortest-path graph kernel called the Multi-scale Wasserstein +Shortest-Path Filtration graph kernel (MWSPF). It uses a BFS tree of a certain +depth rooted at each vertex to restrict the maximum length of the shortest path +considering the small world property. It considers the labels of all the +vertices in the shortest path. To facilitate the comparison of graphs at +multiple different scales, it augments graphs from both the aspects of the +vertex and the graph structure. The distribution (frequency) of the shortest +path changes across augmented graphs and the Wasserstein distance is employed +to track the changes. We conduct experiments on various benchmark graph +datasets to evaluate MWSPF's performance. MWSPF is superior to the +state-of-the-art on most datasets. + +
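+ The depth-limited, label-aware shortest-path statistics underlying the kernel
+can be sketched with networkx; comparing such histograms across augmented
+graphs with a Wasserstein distance is the part left out here, and the
+node-attribute name "label" is an assumption.
+
+import networkx as nx
+from collections import Counter
+
+def sp_histogram(g, max_depth=3):
+    """Count shortest paths of length <= max_depth, keyed by the labels of
+    every vertex on the path (not just the endpoints)."""
+    hist = Counter()
+    for root in g.nodes:
+        paths = nx.single_source_shortest_path(g, root, cutoff=max_depth)
+        for target, path in paths.items():
+            if target == root:
+                continue
+            key = tuple(g.nodes[v].get("label", 0) for v in path)
+            hist[key] += 1
+    return hist
+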
+
+ comment: 10 pages +
+
+
+
+
+ + ♻ ☆ Multi-Task Pseudo-Label Learning for Non-Intrusive Speech Quality + Assessment Model + + +
+ This study proposes a multi-task pseudo-label learning (MPL)-based +non-intrusive speech quality assessment model called MTQ-Net. MPL consists of +two stages: obtaining pseudo-label scores from a pretrained model and +performing multi-task learning. The 3QUEST metrics, namely Speech-MOS (S-MOS), +Noise-MOS (N-MOS), and General-MOS (G-MOS), are the assessment targets. The +pretrained MOSA-Net model is utilized to estimate three pseudo labels: +perceptual evaluation of speech quality (PESQ), short-time objective +intelligibility (STOI), and speech distortion index (SDI). Multi-task learning +is then employed to train MTQ-Net by combining a supervised loss (derived from +the difference between the estimated score and the ground-truth label) and a +semi-supervised loss (derived from the difference between the estimated score +and the pseudo label), where the Huber loss is employed as the loss function. +Experimental results first demonstrate the advantages of MPL compared to +training a model from scratch and using a direct knowledge transfer mechanism. +Second, the benefit of the Huber loss for improving the predictive ability of +MTQ-Net is verified. Finally, the MTQ-Net with the MPL approach exhibits higher +overall predictive power compared to other SSL-based speech assessment models. + +
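+ The combined objective described above reduces to a weighted sum of Huber
+terms; the weighting coefficients below are placeholders, since the abstract
+does not state how the supervised and semi-supervised losses are balanced.
+
+import numpy as np
+
+def huber(err, delta=1.0):
+    a = np.abs(err)
+    return np.where(a <= delta, 0.5 * err ** 2, delta * (a - 0.5 * delta))
+
+def mpl_loss(pred_quest, true_quest, pred_aux, pseudo_aux, alpha=1.0, beta=0.5):
+    """Supervised Huber loss on the 3QUEST targets plus a semi-supervised
+    Huber loss against the PESQ/STOI/SDI pseudo labels."""
+    supervised = huber(pred_quest - true_quest).mean()
+    semi_supervised = huber(pred_aux - pseudo_aux).mean()
+    return alpha * supervised + beta * semi_supervised
+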
+
+
+
+
+ + ♻ ☆ Financial News Analytics Using Fine-Tuned Llama 2 GPT Model + + +
+ The paper considers the possibility of fine-tuning the Llama 2 GPT large +language model (LLM) for multitask analysis of financial news. For fine-tuning, +a PEFT/LoRA-based approach was used. In the study, the model was fine-tuned for +the following tasks: analysing a text from financial market perspectives, +highlighting the main points of a text, summarizing a text, and extracting named +entities with appropriate sentiments. The obtained results show that the +fine-tuned Llama 2 model can perform multitask financial news analysis with a +specified response structure: part of the response can be structured text and +another part can be in JSON format for further processing. Extracted +sentiments for named entities can be used as predictive features in +supervised machine learning models with quantitative target variables. + 
+
+
+
+
+ + ♻ ☆ SPEED: Streaming Partition and Parallel Acceleration for Temporal + Interaction Graph Embedding + + +
+ Temporal Interaction Graphs (TIGs) are widely employed to model intricate +real-world systems such as financial systems and social networks. To capture +the dynamism and interdependencies of nodes, existing TIG embedding models need +to process edges sequentially and chronologically. However, this requirement +prevents them from being processed in parallel and makes it hard to accommodate +burgeoning data volumes on a GPU. Consequently, many large-scale temporal +interaction graphs are confined to CPU processing. Furthermore, a generalized +GPU scaling and acceleration approach remains unavailable. To facilitate +large-scale TIGs' implementation on GPUs for acceleration, we introduce a novel +training approach, namely Streaming Edge Partitioning and Parallel Acceleration +for Temporal Interaction Graph Embedding (SPEED). SPEED comprises a +Streaming Edge Partitioning Component (SEP), which addresses the space overhead +issue by assigning fewer nodes to each GPU, and a Parallel Acceleration +Component (PAC), which enables simultaneous training of different sub-graphs, +addressing the time overhead issue. Our method can achieve a good balance among +computing resources, computing time, and downstream task performance. Empirical +validation across 7 real-world datasets demonstrates the potential to expedite +training speeds by a factor of up to 19.29x. Simultaneously, resource +consumption of a single GPU can be reduced by up to 69%, thus enabling +multi-GPU training and acceleration encompassing millions of nodes and +billions of edges. Furthermore, our approach also maintains its competitiveness +in downstream tasks. + 
+
+ comment: 13 pages, 8 figures +
+
+
+
+
+ + ♻ ☆ Deep neural networks with dependent weights: Gaussian Process mixture + limit, heavy tails, sparsity and compressibility + + +
+ This article studies the infinite-width limit of deep feedforward neural +networks whose weights are dependent, and modelled via a mixture of Gaussian +distributions. Each hidden node of the network is assigned a nonnegative random +variable that controls the variance of the outgoing weights of that node. We +make minimal assumptions on these per-node random variables: they are iid and +their sum, in each layer, converges to some finite random variable in the +infinite-width limit. Under this model, we show that each layer of the +infinite-width neural network can be characterised by two simple quantities: a +non-negative scalar parameter and a L\'evy measure on the positive reals. If +the scalar parameters are strictly positive and the L\'evy measures are trivial +at all hidden layers, then one recovers the classical Gaussian process (GP) +limit, obtained with iid Gaussian weights. More interestingly, if the L\'evy +measure of at least one layer is non-trivial, we obtain a mixture of Gaussian +processes (MoGP) in the large-width limit. The behaviour of the neural network +in this regime is very different from the GP regime. One obtains correlated +outputs, with non-Gaussian distributions, possibly with heavy tails. +Additionally, we show that, in this regime, the weights are compressible, and +some nodes have asymptotically non-negligible contributions, therefore +representing important hidden features. Many sparsity-promoting neural network +models can be recast as special cases of our approach, and we discuss their +infinite-width limits; we also present an asymptotic analysis of the pruning +error. We illustrate some of the benefits of the MoGP regime over the GP regime +in terms of representation learning and compressibility on simulated, MNIST and +Fashion MNIST datasets. + +
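The following numpy sketch illustrates the weight model in the abstract: each hidden node draws a nonnegative scale that multiplies the variance of all of its outgoing weights, so weights leaving the same node are dependent rather than iid Gaussian. The half-Cauchy choice for the per-node scales is an assumption made only for illustration; the paper allows general iid nonnegative variables.

```python
import numpy as np

rng = np.random.default_rng(0)

def sample_dependent_layer(n_in, n_out, rng):
    """Outgoing weights of node j are N(0, lam[j] / n_in), where lam[j] is a
    nonnegative random per-node scale shared by all weights leaving node j."""
    lam = np.abs(rng.standard_cauchy(n_in))      # illustrative per-node scales
    return rng.normal(0.0, np.sqrt(lam / n_in)[:, None], size=(n_in, n_out))

x = rng.normal(size=(5, 100))                    # 5 inputs, width 100
W = sample_dependent_layer(100, 100, rng)
h = np.maximum(x @ W, 0.0)                       # one hidden layer with ReLU
```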
+
+ comment: 96 pages, 15 figures, 9 tables +
+
+
+
+
+ + ♻ ☆ SYNAuG: Exploiting Synthetic Data for Data Imbalance Problems + + +
+ We live in an era of data floods, and deep neural networks play a pivotal +role in this moment. Natural data inherently exhibits several challenges such +as long-tailed distribution and model fairness, where data imbalance is at the +center of fundamental issues. This imbalance poses a risk of deep neural +networks producing biased predictions, leading to potentially severe ethical +and social problems. To address these problems, we leverage the recent +generative models advanced in generating high-quality images. In this work, we +propose SYNAuG, which utilizes synthetic data to uniformize the given imbalance +distribution followed by a simple post-calibration step considering the domain +gap between real and synthetic data. This straightforward approach yields +impressive performance on datasets for distinctive data imbalance problems such +as CIFAR100-LT, ImageNet100-LT, UTKFace, and Waterbirds, surpassing the +performance of existing task-specific methods. While we do not claim that our +approach serves as a complete solution to the problem of data imbalance, we +argue that supplementing the existing data with synthetic data proves to be an +effective and crucial step in addressing data imbalance concerns. + +
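As a minimal illustration of the uniformization step, the sketch below computes how many synthetic samples each class would need so that real plus synthetic data reaches the size of the largest class; the generative model itself and the post-calibration step are not shown, and the labels are toy values.

```python
from collections import Counter

def synthetic_budget(labels):
    """Synthetic samples to generate per class so the combined real + synthetic
    data has a uniform class distribution."""
    counts = Counter(labels)
    target = max(counts.values())
    return {cls: target - n for cls, n in counts.items()}

# toy long-tailed label list
labels = ["cat"] * 500 + ["dog"] * 120 + ["fox"] * 15
print(synthetic_budget(labels))   # {'cat': 0, 'dog': 380, 'fox': 485}
```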
+
+
+
+
+ + ♻ ☆ Finite time analysis of temporal difference learning with linear + function approximation: Tail averaging and regularisation + + +
+ We study the finite-time behaviour of the popular temporal difference (TD) +learning algorithm when combined with tail-averaging. We derive finite time +bounds on the parameter error of the tail-averaged TD iterate under a step-size +choice that does not require information about the eigenvalues of the matrix +underlying the projected TD fixed point. Our analysis shows that tail-averaged +TD converges at the optimal $O\left(1/t\right)$ rate, both in expectation and +with high probability. In addition, our bounds exhibit a sharper rate of decay +for the initial error (bias), which is an improvement over averaging all +iterates. We also propose and analyse a variant of TD that incorporates +regularisation. From analysis, we conclude that the regularised version of TD +is useful for problems with ill-conditioned features. + +
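A minimal sketch of tail-averaged TD(0) with linear (here one-hot) features on a toy random-walk chain follows; the chain, reward, step size and averaging window are illustrative assumptions. The point is simply that the constant step size needs no eigenvalue information and only the last portion of the iterates is averaged.

```python
import numpy as np

rng = np.random.default_rng(0)

n_states, gamma, alpha = 5, 0.9, 0.05
phi = np.eye(n_states)          # one-hot features
theta = np.zeros(n_states)
iterates = []

state = 2
for t in range(20_000):
    next_state = (state + rng.choice([-1, 1])) % n_states
    reward = 1.0 if next_state == 0 else 0.0
    # TD(0) update with a constant step size (no eigenvalue information needed)
    td_error = reward + gamma * phi[next_state] @ theta - phi[state] @ theta
    theta = theta + alpha * td_error * phi[state]
    iterates.append(theta.copy())
    state = next_state

# tail averaging: average only the last half of the iterates, which damps the
# initial bias faster than averaging every iterate from the start
tail = len(iterates) // 2
theta_tail_avg = np.mean(iterates[tail:], axis=0)
print(theta_tail_avg)
```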
+
+
+
+
+ + ♻ ☆ Stochastic Configuration Machines for Industrial Artificial Intelligence + + +
+ Real-time predictive modelling with desired accuracy is highly expected in +industrial artificial intelligence (IAI), where neural networks play a key +role. Neural networks in IAI require powerful, high-performance computing +devices to process large amounts of floating-point data. Based on stochastic +configuration networks (SCNs), this paper proposes a new randomized learner +model, termed stochastic configuration machines (SCMs), to emphasize effective +modelling and data-size savings that are useful and valuable for industrial +applications. Compared to SCNs and random vector functional-link (RVFL) nets +with binarized implementation, the model storage of SCMs can be significantly +compressed while retaining favourable prediction performance. Besides the +architecture of the SCM learner model and its learning algorithm, as an +important part of this contribution, we also provide a theoretical basis for the +learning capacity of SCMs by analysing the model's complexity. Experimental +studies are carried out over some benchmark datasets and three industrial +applications. The results demonstrate that SCMs have great potential for dealing +with industrial data analytics. + 
+
+ comment: 23 pages, 7 figures, 12 tables +
+
+
+
+
+ + ♻ ☆ Complexity-Optimized Sparse Bayesian Learning for Scalable + Classification Tasks + + +
+ Sparse Bayesian Learning (SBL) constructs an extremely sparse probabilistic +model with very competitive generalization. However, SBL needs to invert a big +covariance matrix with complexity $O(M^3)$ (M: feature size) for updating the +regularization priors, making it difficult for problems with a high-dimensional +feature space or a large data size, as it may easily suffer from memory +overflow in such problems. This paper addresses this issue with a newly +proposed diagonal Quasi-Newton (DQN) method for SBL, called DQN-SBL, in which the +inversion of the big covariance matrix is avoided so that the complexity is reduced +to $O(M)$. DQN-SBL is thoroughly evaluated on non-linear and linear +classification with various benchmarks of different sizes. Experimental +results verify that DQN-SBL achieves competitive generalization with a very +sparse model and scales well to large-scale problems. + 
+
+ comment: 12 pages,5 figures +
+
+
+
+
+ + ♻ ☆ In Search of netUnicorn: A Data-Collection Platform to Develop + Generalizable ML Models for Network Security Problems + + +
+ The remarkable success of the use of machine learning-based solutions for +network security problems has been impeded by the developed ML models' +inability to maintain efficacy when used in different network environments +exhibiting different network behaviors. This issue is commonly referred to as +the generalizability problem of ML models. The community has recognized the +critical role that training datasets play in this context and has developed +various techniques to improve dataset curation to overcome this problem. +Unfortunately, these methods are generally ill-suited or even counterproductive +in the network security domain, where they often result in unrealistic or +poor-quality datasets. + To address this issue, we propose an augmented ML pipeline that leverages +explainable ML tools to guide the network data collection in an iterative +fashion. To ensure the data's realism and quality, we require that the new +datasets should be endogenously collected in this iterative process, thus +advocating for a gradual removal of data-related problems to improve model +generalizability. To realize this capability, we develop a data-collection +platform, netUnicorn, that takes inspiration from the classic "hourglass" model +and is implemented as its "thin waist" to simplify data collection for +different learning problems from diverse network environments. The proposed +system decouples data-collection intents from the deployment mechanisms and +disaggregates these high-level intents into smaller reusable, self-contained +tasks. + We demonstrate how netUnicorn simplifies collecting data for different +learning problems from multiple network environments and how the proposed +iterative data collection improves a model's generalizability. + +
+
+
+
+
+ + ♻ ☆ Gotta match 'em all: Solution diversification in graph matching matched + filters + + +
+ We present a novel approach for finding multiple noisily embedded template +graphs in a very large background graph. Our method builds upon the +graph-matching-matched-filter technique proposed in Sussman et al., with the +discovery of multiple diverse matchings being achieved by iteratively +penalizing a suitable node-pair similarity matrix in the matched filter +algorithm. In addition, we propose algorithmic speed-ups that greatly enhance +the scalability of our matched-filter approach. We present theoretical +justification of our methodology in the setting of correlated Erdos-Renyi +graphs, showing its ability to sequentially discover multiple templates under +mild model conditions. We additionally demonstrate our method's utility via +extensive experiments using both simulated models and real-world datasets, +including human brain connectomes and a large transactional knowledge base. + 
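The diversification idea can be illustrated with a much simpler stand-in: solve an assignment on the node-pair similarity matrix, penalize the pairs used by that solution, and repeat. The sketch below uses a plain linear assignment instead of the full graph-matching matched filter, and the penalty value is an arbitrary assumption.

```python
import numpy as np
from scipy.optimize import linear_sum_assignment

def diverse_matchings(similarity, n_matchings=3, penalty=10.0):
    """Repeatedly solve an assignment on the node-pair similarity matrix and
    penalize the entries used by the previous solution to obtain diverse matchings."""
    sim = similarity.astype(float).copy()
    matchings = []
    for _ in range(n_matchings):
        rows, cols = linear_sum_assignment(-sim)   # maximize total similarity
        matchings.append(list(zip(rows.tolist(), cols.tolist())))
        sim[rows, cols] -= penalty                 # discourage reusing these pairs
    return matchings

S = np.random.default_rng(1).random((6, 6))
for m in diverse_matchings(S):
    print(m)
```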
+
+ comment: 36 pages, 12 figures, 1 table +
+
+
+
+
+ + ♻ ☆ Learning Personalized Models with Clustered System Identification + + +
+ We address the problem of learning linear system models from observing +multiple trajectories from different system dynamics. This framework +encompasses a collaborative scenario where several systems seeking to estimate +their dynamics are partitioned into clusters according to their system +similarity. Thus, the systems within the same cluster can benefit from the +observations made by the others. Considering this framework, we present an +algorithm where each system alternately estimates its cluster identity and +performs an estimation of its dynamics. This is then aggregated to update the +model of each cluster. We show that under mild assumptions, our algorithm +correctly estimates the cluster identities and achieves an approximate sample +complexity that scales inversely with the number of systems in the cluster, +thus facilitating a more efficient and personalized system identification +process. + +
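A minimal numpy sketch of the alternating scheme is given below: each system is assigned to the cluster whose current model best explains its trajectory, and each cluster model is then re-estimated by pooled least squares over its members. The toy dynamics, cluster count, and iteration count are assumptions for illustration only.

```python
import numpy as np

rng = np.random.default_rng(0)

def fit_dynamics(X, Xnext):
    """Least-squares estimate of A in x_{t+1} = A x_t from stacked data."""
    return np.linalg.lstsq(X, Xnext, rcond=None)[0].T

def cluster_sysid(trajs, n_clusters=2, n_iters=10):
    """Alternate cluster assignment (by one-step prediction error) and pooled
    re-estimation of each cluster's dynamics."""
    n = trajs[0].shape[1]
    A = [rng.normal(scale=0.1, size=(n, n)) for _ in range(n_clusters)]
    assign = rng.integers(n_clusters, size=len(trajs))
    for _ in range(n_iters):
        for i, traj in enumerate(trajs):                      # (i) assignment
            X, Xn = traj[:-1], traj[1:]
            errs = [np.linalg.norm(Xn - X @ Ak.T) for Ak in A]
            assign[i] = int(np.argmin(errs))
        for k in range(n_clusters):                           # (ii) re-estimation
            members = [t for t, a in zip(trajs, assign) if a == k]
            if members:
                X = np.vstack([t[:-1] for t in members])
                Xn = np.vstack([t[1:] for t in members])
                A[k] = fit_dynamics(X, Xn)
    return A, assign

# two true systems, eight observed trajectories
A_true = [np.array([[0.9, 0.1], [0.0, 0.8]]), np.array([[0.5, -0.2], [0.3, 0.7]])]
trajs = []
for i in range(8):
    x, traj = rng.normal(size=2), []
    for _ in range(50):
        traj.append(x)
        x = A_true[i % 2] @ x + 0.01 * rng.normal(size=2)
    trajs.append(np.array(traj))
print(cluster_sysid(trajs)[1])
```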
+
+
+
+
+ + ♻ ☆ From latent dynamics to meaningful representations + + +
+ While representation learning has been central to the rise of machine +learning and artificial intelligence, a key problem remains in making the +learnt representations meaningful. For this the typical approach is to +regularize the learned representation through prior probability distributions. +However such priors are usually unavailable or ad hoc. To deal with this, we +propose a dynamics-constrained representation learning framework. Instead of +using predefined probabilities, we restrict the latent representation to follow +specific dynamics, which is a more natural constraint for representation +learning in dynamical systems. Our belief stems from a fundamental observation +in physics that though different systems can have different marginalized +probability distributions, they typically obey the same dynamics, such as +Newton's and Schrodinger's equations. We validate our framework for different +systems including a real-world fluorescent DNA movie dataset. We show that our +algorithm can uniquely identify an uncorrelated, isometric and meaningful +latent representation. + +
+
+
+
+
+ + ♻ ☆ Efficient Learning of Quantum States Prepared With Few Non-Clifford + Gates + + +
+ We give an algorithm that efficiently learns a quantum state prepared by +Clifford gates and $O(\log(n))$ non-Clifford gates. Specifically, for an +$n$-qubit state $\lvert \psi \rangle$ prepared with at most $t$ non-Clifford +gates, we show that $\mathsf{poly}(n,2^t,1/\epsilon)$ time and copies of +$\lvert \psi \rangle$ suffice to learn $\lvert \psi \rangle$ to trace distance +at most $\epsilon$. This result follows as a special case of an algorithm for +learning states with large stabilizer dimension, where a quantum state has +stabilizer dimension $k$ if it is stabilized by an abelian group of $2^k$ Pauli +operators. We also develop an efficient property testing algorithm for +stabilizer dimension, which may be of independent interest. + +
+
+ comment: 25 pages. V3: Fixed typos +
+
+
+
+
+ + ♻ ☆ From Hope to Safety: Unlearning Biases of Deep Models by Enforcing the + Right Reasons in Latent Space + + +
+ Deep Neural Networks are prone to learning spurious correlations embedded in +the training data, leading to potentially biased predictions. This poses risks +when deploying these models for high-stake decision-making, such as in medical +applications. Current methods for post-hoc model correction either require +input-level annotations, which are only possible for spatially localized +biases, or augment the latent feature space, thereby hoping to enforce the +right reasons. We present a novel method ensuring the right reasons on the +concept level by reducing the model's sensitivity towards biases through the +gradient. When modeling biases via Concept Activation Vectors, we highlight the +importance of choosing robust directions, as traditional regression-based +approaches such as Support Vector Machines tend to result in diverging +directions. We effectively mitigate biases in controlled and real-world +settings on the ISIC, Bone Age, ImageNet and CelebA datasets using VGG, ResNet +and EfficientNet architectures. + +
+
+
+
+
+ + ♻ ☆ eXplainable Artificial Intelligence (XAI) in aging clock models + + +
+ eXplainable Artificial Intelligence (XAI) is a rapidly progressing field of +machine learning, aiming to unravel the predictions of complex models. XAI is +especially required in sensitive applications, e.g. in health care, when +diagnosis, recommendations and treatment choices might rely on the decisions +made by artificial intelligence systems. AI approaches have become widely used +in aging research as well, in particular, in developing biological clock models +and identifying biomarkers of aging and age-related diseases. However, the +potential of XAI here awaits to be fully appreciated. We discuss the +application of XAI for developing the "aging clocks" and present a +comprehensive analysis of the literature categorized by the focus on particular +physiological systems. + +
+
+
+
+
+ + ♻ ☆ Distributionally Robust Batch Contextual Bandits ICML 2020 + + +
+ Policy learning using historical observational data is an important problem +that has found widespread applications. Examples include selecting offers, +prices, advertisements to send to customers, as well as selecting which +medication to prescribe to a patient. However, existing literature rests on the +crucial assumption that the future environment where the learned policy will be +deployed is the same as the past environment that has generated the data -- an +assumption that is often false or too coarse an approximation. In this paper, +we lift this assumption and aim to learn a distributionally robust policy with +incomplete observational data. We first present a policy evaluation procedure +that allows us to assess how well the policy does under the worst-case +environment shift. We then establish a central limit theorem type guarantee for +this proposed policy evaluation scheme. Leveraging this evaluation scheme, we +further propose a novel learning algorithm that is able to learn a policy that +is robust to adversarial perturbations and unknown covariate shifts with a +performance guarantee based on the theory of uniform convergence. Finally, we +empirically test the effectiveness of our proposed algorithm in synthetic +datasets and demonstrate that it provides the robustness that is missing using +standard policy learning algorithms. We conclude the paper by providing a +comprehensive application of our methods in the context of a real-world voting +dataset. + +
+
+ comment: The short version has been accepted in ICML 2020 +
+
+
+
+
+ + ♻ ☆ ImageBind-LLM: Multi-modality Instruction Tuning + + +
+ We present ImageBind-LLM, a multi-modality instruction tuning method of large +language models (LLMs) via ImageBind. Existing works mainly focus on language +and image instruction tuning, different from which, our ImageBind-LLM can +respond to multi-modality conditions, including audio, 3D point clouds, video, +and their embedding-space arithmetic by only image-text alignment training. +During training, we adopt a learnable bind network to align the embedding space +between LLaMA and ImageBind's image encoder. Then, the image features +transformed by the bind network are added to word tokens of all layers in +LLaMA, which progressively injects visual instructions via an attention-free +and zero-initialized gating mechanism. Aided by the joint embedding of +ImageBind, the simple image-text training enables our model to exhibit superior +multi-modality instruction-following capabilities. During inference, the +multi-modality inputs are fed into the corresponding ImageBind encoders, and +processed by a proposed visual cache model for further cross-modal embedding +enhancement. The training-free cache model retrieves from three million image +features extracted by ImageBind, which effectively mitigates the +training-inference modality discrepancy. Notably, with our approach, +ImageBind-LLM can respond to instructions of diverse modalities and demonstrate +significant language generation quality. Code is released at +https://github.com/OpenGVLab/LLaMA-Adapter. + +
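The zero-initialized, attention-free gating can be illustrated with a small numpy sketch: the visual feature is added to the word tokens scaled by a learnable gate that starts at zero, so the language model is untouched at the start of training and visual information is injected gradually. This is a simplified stand-in, not the actual bind-network implementation.

```python
import numpy as np

class ZeroInitGate:
    """Attention-free gating: add the (bind-network-aligned) visual feature to
    every word token, scaled by a learnable gate initialized to zero."""
    def __init__(self):
        self.gate = 0.0   # learnable scalar, starts at zero

    def __call__(self, word_tokens, visual_feature):
        # word_tokens: (seq_len, d), visual_feature: (d,)
        return word_tokens + np.tanh(self.gate) * visual_feature

tokens = np.random.default_rng(0).normal(size=(16, 512))
visual = np.random.default_rng(1).normal(size=512)
out = ZeroInitGate()(tokens, visual)   # equals tokens until the gate is trained
```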
+
+ comment: Code is available at https://github.com/OpenGVLab/LLaMA-Adapter +
+
+
+
+
+ + ♻ ☆ Out-of-distribution detection for regression tasks: parameter versus + predictor entropy + + +
+ It is crucial to detect when an instance lies downright too far from the +training samples for the machine learning model to be trusted, a challenge +known as out-of-distribution (OOD) detection. For neural networks, one approach +to this task consists of learning a diversity of predictors that all can +explain the training data. This information can be used to estimate the +epistemic uncertainty at a given newly observed instance in terms of a measure +of the disagreement of the predictions. Evaluation and certification of the +ability of a method to detect OOD require specifying instances which are likely +to occur in deployment yet on which no prediction is available. Focusing on +regression tasks, we choose a simple yet insightful model for this OOD +distribution and conduct an empirical evaluation of the ability of various +methods to discriminate OOD samples from the data. Moreover, we exhibit +evidence that a diversity of parameters may fail to translate to a diversity of +predictors. Based on the choice of an OOD distribution, we propose a new way of +estimating the entropy of a distribution on predictors based on nearest +neighbors in function space. This leads to a variational objective which, +combined with the family of distributions given by a generative neural network, +systematically produces a diversity of predictors that provides a robust way to +detect OOD samples. + +
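The core idea of scoring epistemic uncertainty by predictor disagreement can be sketched in a few lines: evaluate an ensemble of regressors that all fit the training data and use the spread of their predictions as the OOD score. The toy ensemble and threshold below are purely illustrative.

```python
import numpy as np

def disagreement_score(models, x):
    """Epistemic uncertainty at x: variance of the ensemble's predictions."""
    preds = np.array([m(x) for m in models])
    return preds.var(axis=0)

def flag_ood(models, x, threshold=1.0):
    """Flag inputs whose ensemble disagreement exceeds a chosen threshold."""
    return disagreement_score(models, x) > threshold

# toy ensemble: three regressors that agree near the data and diverge far away
models = [lambda x, a=a: a * x + (1 - a) * x ** 3 for a in (0.8, 1.0, 1.2)]
x_in, x_out = np.array([0.1]), np.array([5.0])
print(disagreement_score(models, x_in), disagreement_score(models, x_out))
print(flag_ood(models, x_out))
```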
+
+
+
+
+ + ♻ ☆ Deep Learning Models for Flood Predictions in South Florida + + +
+ Simulating and predicting water levels in river systems is essential for +flood warnings, hydraulic operations, and flood mitigations. In the engineering +field, tools such as HEC-RAS, MIKE, and SWMM are used to build detailed +physics-based hydrological and hydraulic computational models to simulate the +entire watershed, thereby predicting the water stage at any point in the +system. However, these physics-based models are computationally intensive, +especially for large watersheds and for longer simulations. To overcome this +problem, we train several deep learning (DL) models for use as surrogate models +to rapidly predict the water stage. The downstream stage of the Miami River in +South Florida is chosen as a case study for this paper. The dataset is from +January 1, 2010, to December 31, 2020, downloaded from the DBHYDRO database of +the South Florida Water Management District (SFWMD). Extensive experiments show +that the performance of the DL models is comparable to that of the +physics-based models, even during extreme precipitation conditions (i.e., +tropical storms). Furthermore, we study the decline in prediction accuracy of +the DL models with an increase in prediction lengths. In order to predict the +water stage in the future, our DL models use measured variables of the river +system from the recent past as well as covariates that can be reliably +predicted in the near future. In summary, the deep learning models achieve +comparable or better error rates with at least 1000x speedup in comparison to +the physics-based models. + +
+
+
+
+
+ + ♻ ☆ Multimodal Transformer for Material Segmentation + + +
+ Leveraging information across diverse modalities is known to enhance +performance on multimodal segmentation tasks. However, effectively fusing +information from different modalities remains challenging due to the unique +characteristics of each modality. In this paper, we propose a novel fusion +strategy that can effectively fuse information from different combinations of +four different modalities: RGB, Angle of Linear Polarization (AoLP), Degree of +Linear Polarization (DoLP) and Near-Infrared (NIR). We also propose a new model +named Multi-Modal Segmentation Transformer (MMSFormer) that incorporates the +proposed fusion strategy to perform multimodal material segmentation. MMSFormer +achieves 52.05% mIoU outperforming the current state-of-the-art on Multimodal +Material Segmentation (MCubeS) dataset. For instance, our method provides +significant improvement in detecting gravel (+10.4%) and human (+9.1%) classes. +Ablation studies show that different modules in the fusion block are crucial +for overall model performance. Furthermore, our ablation studies also highlight +the capacity of different input modalities to improve performance in the +identification of different types of materials. The code and pretrained models +will be made available at https://github.com/csiplab/MMSFormer. + +
+
+ comment: 9 pages, 3 figures +
+
+
+
+
+ + ♻ ☆ Axiomatic Aggregations of Abductive Explanations + + +
+ The recent criticisms of the robustness of post hoc model approximation +explanation methods (like LIME and SHAP) have led to the rise of model-precise +abductive explanations. For each data point, abductive explanations provide a +minimal subset of features that are sufficient to generate the outcome. While +theoretically sound and rigorous, abductive explanations suffer from a major +issue -- there can be several valid abductive explanations for the same data +point. In such cases, providing a single abductive explanation can be +insufficient; on the other hand, providing all valid abductive explanations can +be incomprehensible due to their size. In this work, we solve this issue by +aggregating the many possible abductive explanations into feature importance +scores. We propose three aggregation methods: two based on power indices from +cooperative game theory and a third based on a well-known measure of causal +strength. We characterize these three methods axiomatically, showing that each +of them uniquely satisfies a set of desirable properties. We also evaluate them +on multiple datasets and show that these explanations are robust to the attacks +that fool SHAP and LIME. + +
+
+
+
+
+
+
+
+ + Multimedia 7 + +
+
+
+ + ☆ From Capture to Display: A Survey on Volumetric Video + + +
+ Volumetric video, which offers immersive viewing experiences, is gaining +increasing prominence. With its six degrees of freedom, it provides viewers +with greater immersion and interactivity compared to traditional videos. +Despite their potential, volumetric video services pose significant +challenges. This survey conducts a comprehensive review of the existing +literature on volumetric video. We first provide a general framework of +volumetric video services, followed by a discussion of prerequisites for +volumetric video, encompassing representations, open datasets, and quality +assessment metrics. Then we delve into the current methodologies for each stage +of the volumetric video service pipeline, detailing capturing, compression, +transmission, rendering, and display techniques. Lastly, we explore various +applications enabled by this pioneering technology and present an array of +research challenges and opportunities in the domain of volumetric video +services. This survey aspires to provide a holistic understanding of this +burgeoning field and shed light on potential future research trajectories, +aiming to bring the vision of volumetric video to fruition. + 
+
+ comment: Submitted +
+
+
+
+
+ + ☆ Temporal Action Localization with Enhanced Instant Discriminability CVPR + + +
+ Temporal action detection (TAD) aims to detect all action boundaries and +their corresponding categories in an untrimmed video. The unclear boundaries of +actions in videos often result in imprecise predictions of action boundaries by +existing methods. To resolve this issue, we propose a one-stage framework named +TriDet. First, we propose a Trident-head to model the action boundary via an +estimated relative probability distribution around the boundary. Then, we +analyze the rank-loss problem (i.e. instant discriminability deterioration) in +transformer-based methods and propose an efficient scalable-granularity +perception (SGP) layer to mitigate this issue. To further push the limit of +instant discriminability in the video backbone, we leverage the strong +representation capability of pretrained large models and investigate their +performance on TAD. Last, considering the adequate spatial-temporal context for +classification, we design a decoupled feature pyramid network with separate +feature pyramids to incorporate rich spatial context from the large model for +localization. Experimental results demonstrate the robustness of TriDet and its +state-of-the-art performance on multiple TAD datasets, including hierarchical +(multilabel) TAD datasets. + +
+
+ comment: An extended version of the CVPR paper arXiv:2303.07347, submitted to + IJCV +
+
+
+
+
+ + ☆ Dual-view Curricular Optimal Transport for Cross-lingual Cross-modal + Retrieval + + +
+ Current research on cross-modal retrieval is mostly English-oriented, owing to the +availability of a large number of English-oriented human-labeled +vision-language corpora. To overcome the scarcity of non-English labeled +data, cross-lingual cross-modal retrieval (CCR) has attracted increasing +attention. Most CCR methods construct pseudo-parallel vision-language corpora +via Machine Translation (MT) to achieve cross-lingual transfer. However, the +translated sentences from MT are generally imperfect in describing the +corresponding visual contents. Improperly assuming the pseudo-parallel data are +correctly correlated will make the networks overfit to the noisy +correspondence. Therefore, we propose Dual-view Curricular Optimal Transport +(DCOT) to learn with noisy correspondence in CCR. In particular, we quantify +the confidence of the sample pair correlation with optimal transport theory +from both the cross-lingual and cross-modal views, and design dual-view +curriculum learning to dynamically model the transportation costs according to +the learning stage of the two views. Extensive experiments are conducted on two +multilingual image-text datasets and one video-text dataset, and the results +demonstrate the effectiveness and robustness of the proposed method. Besides, +our proposed method also extends well to cross-lingual image-text +baselines and generalizes decently to out-of-domain data. + 
+
+
+
+
+ + ☆ CANF-VC++: Enhancing Conditional Augmented Normalizing Flows for Video + Compression with Advanced Techniques + + +
+ Video has become the predominant medium for information dissemination, +driving the need for efficient video codecs. Recent advancements in learned +video compression have shown promising results, surpassing traditional codecs +in terms of coding efficiency. However, challenges remain in integrating +fragmented techniques and incorporating new tools into existing codecs. In this +paper, we comprehensively review the state-of-the-art CANF-VC codec and propose +CANF-VC++, an enhanced version that addresses these challenges. We +systematically explore architecture design, reference frame type, training +procedure, and entropy coding efficiency, leading to substantial coding +improvements. CANF-VC++ achieves significant Bj{\o}ntegaard-Delta rate savings +on conventional datasets UVG, HEVC Class B and MCL-JCV, outperforming the +baseline CANF-VC and even the H.266 reference software VTM. Our work +demonstrates the potential of integrating advancements in video compression and +serves as inspiration for future research in the field. + +
+
+
+
+
+ + ☆ Class-Incremental Grouping Network for Continual Audio-Visual Learning ICCV 2023 + + +
+ Continual learning is a challenging problem in which models need to be +trained on non-stationary data across sequential tasks for class-incremental +learning. While previous methods have focused on using either regularization or +rehearsal-based frameworks to alleviate catastrophic forgetting in image +classification, they are limited to a single modality and cannot learn compact +class-aware cross-modal representations for continual audio-visual learning. To +address this gap, we propose a novel class-incremental grouping network (CIGN) +that can learn category-wise semantic features to achieve continual +audio-visual learning. Our CIGN leverages learnable audio-visual class tokens +and audio-visual grouping to continually aggregate class-aware features. +Additionally, it utilizes class tokens distillation and continual grouping to +prevent forgetting parameters learned from previous tasks, thereby improving +the model's ability to capture discriminative audio-visual categories. We +conduct extensive experiments on VGGSound-Instruments, VGGSound-100, and +VGG-Sound Sources benchmarks. Our experimental results demonstrate that the +CIGN achieves state-of-the-art audio-visual class-incremental learning +performance. Code is available at https://github.com/stoneMo/CIGN. + +
+
+ comment: ICCV 2023. arXiv admin note: text overlap with arXiv:2303.17056 +
+
+
+
+
+ + ♻ ☆ Noise-Tolerant Learning for Audio-Visual Action Recognition + + +
+ Recently, video recognition has been emerging with the help of multi-modal +learning, which focuses on integrating distinct modalities to improve the +performance or robustness of the model. Although various multi-modal learning +methods have been proposed and offer remarkable recognition results, almost all +of these methods rely on high-quality manual annotations and assume that +modalities among multi-modal data provide semantically relevant information. +Unfortunately, the widely used video datasets are usually coarse-annotated or +collected from the Internet. Thus, they inevitably contain a portion of noisy +labels and noisy correspondence. To address this challenge, we use the +audio-visual action recognition task as a proxy and propose a noise-tolerant +learning framework to find anti-interference model parameters against both +noisy labels and noisy correspondence. Specifically, our method consists of two +phases that aim to rectify noise by the inherent correlation between +modalities. First, a noise-tolerant contrastive training phase is performed to +make the model immune to the possible noisy-labeled data. To alleviate the +influence of noisy correspondence, we propose a cross-modal noise estimation +component to adjust the consistency between different modalities. As the noisy +correspondence exists at the instance level, we further propose a +category-level contrastive loss to reduce its interference. Second, in the +hybrid-supervised training phase, we calculate the distance metric among +features to obtain corrected labels, which are used as complementary +supervision to guide the training. Extensive experiments over a wide range of +noise levels demonstrate that our method significantly improves the robustness +of the action recognition model and surpasses the baselines by a clear margin. + 
+
+ comment: This work has been submitted to the IEEE for possible publication. + Copyright may be transferred without notice, after which this version may no + longer be accessible +
+
+
+
+
+ + ♻ ☆ ImageBind-LLM: Multi-modality Instruction Tuning + + +
+ We present ImageBind-LLM, a multi-modality instruction tuning method of large +language models (LLMs) via ImageBind. Existing works mainly focus on language +and image instruction tuning, different from which, our ImageBind-LLM can +respond to multi-modality conditions, including audio, 3D point clouds, video, +and their embedding-space arithmetic by only image-text alignment training. +During training, we adopt a learnable bind network to align the embedding space +between LLaMA and ImageBind's image encoder. Then, the image features +transformed by the bind network are added to word tokens of all layers in +LLaMA, which progressively injects visual instructions via an attention-free +and zero-initialized gating mechanism. Aided by the joint embedding of +ImageBind, the simple image-text training enables our model to exhibit superior +multi-modality instruction-following capabilities. During inference, the +multi-modality inputs are fed into the corresponding ImageBind encoders, and +processed by a proposed visual cache model for further cross-modal embedding +enhancement. The training-free cache model retrieves from three million image +features extracted by ImageBind, which effectively mitigates the +training-inference modality discrepancy. Notably, with our approach, +ImageBind-LLM can respond to instructions of diverse modalities and demonstrate +significant language generation quality. Code is released at +https://github.com/OpenGVLab/LLaMA-Adapter. + +
+
+ comment: Code is available at https://github.com/OpenGVLab/LLaMA-Adapter +
+
+
+
+
+
+
+
+
+ +
+
+
+ + Computation and Language 25 + +
+
+
+ + ☆ Collecting Visually-Grounded Dialogue with A Game Of Sorts LREC 2022 + + +
+ An idealized, though simplistic, view of the referring expression production +and grounding process in (situated) dialogue assumes that a speaker must merely +appropriately specify their expression so that the target referent may be +successfully identified by the addressee. However, referring in conversation is +a collaborative process that cannot be aptly characterized as an exchange of +minimally-specified referring expressions. Concerns have been raised regarding +assumptions made by prior work on visually-grounded dialogue that reveal an +oversimplified view of conversation and the referential process. We address +these concerns by introducing a collaborative image ranking task, a grounded +agreement game we call "A Game Of Sorts". In our game, players are tasked with +reaching agreement on how to rank a set of images given some sorting criterion +through a largely unrestricted, role-symmetric dialogue. By putting emphasis on +the argumentation in this mixed-initiative interaction, we collect discussions +that involve the collaborative referential process. We describe results of a +small-scale data collection experiment with the proposed task. All discussed +materials, which includes the collected data, the codebase, and a containerized +version of the application, are publicly available. + +
+
+ comment: Published at LREC 2022 +
+
+
+
+
+ + ☆ Large Language Models for Difficulty Estimation of Foreign Language + Content with Application to Language Learning + + +
+ We use large language models to aid learners enhance proficiency in a foreign +language. This is accomplished by identifying content on topics that the user +is interested in, and that closely align with the learner's proficiency level +in that foreign language. Our work centers on French content, but our approach +is readily transferable to other languages. Our solution offers several +distinctive characteristics that differentiate it from existing +language-learning solutions, such as, a) the discovery of content across topics +that the learner cares about, thus increasing motivation, b) a more precise +estimation of the linguistic difficulty of the content than traditional +readability measures, and c) the availability of both textual and video-based +content. The linguistic complexity of video content is derived from the video +captions. It is our aspiration that such technology will enable learners to +remain engaged in the language-learning process by continuously adapting the +topics and the difficulty of the content to align with the learners' evolving +interests and learning objectives. + +
+
+
+
+
+ + ☆ AGent: A Novel Pipeline for Automatically Creating Unanswerable + Questions + + +
+ The development of large high-quality datasets and high-performing models +have led to significant advancements in the domain of Extractive Question +Answering (EQA). This progress has sparked considerable interest in exploring +unanswerable questions within the EQA domain. Training EQA models with +unanswerable questions helps them avoid extracting misleading or incorrect +answers for queries that lack valid responses. However, manually annotating +unanswerable questions is labor-intensive. To address this, we propose AGent, a +novel pipeline that automatically creates new unanswerable questions by +re-matching a question with a context that lacks the necessary information for +a correct answer. In this paper, we demonstrate the usefulness of this AGent +pipeline by creating two sets of unanswerable questions from answerable +questions in SQuAD and HotpotQA. These created question sets exhibit low error +rates. Additionally, models fine-tuned on these questions show comparable +performance with those fine-tuned on the SQuAD 2.0 dataset on multiple EQA +benchmarks. + +
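A drastically simplified sketch of the re-matching idea is shown below: pair a question with a context that does not contain its gold answer, yielding an unanswerable example. The real AGent pipeline involves retrieval and filtering steps that are not reflected here, and the string-containment check is only a stand-in.

```python
def make_unanswerable(question, gold_answer, candidate_contexts):
    """Pair the question with a context that lacks the gold answer, producing
    an unanswerable (question, context) example."""
    for ctx in candidate_contexts:
        if gold_answer.lower() not in ctx.lower():
            return {"question": question, "context": ctx, "answerable": False}
    return None

question = "Who wrote The Selfish Gene?"
answer = "Richard Dawkins"
contexts = [
    "The Selfish Gene is a 1976 book on evolution by Richard Dawkins.",
    "On the Origin of Species was published by Charles Darwin in 1859.",
]
print(make_unanswerable(question, answer, contexts))
```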
+
+ comment: 16 pages, 10 tables, 3 figures +
+
+
+
+
+ + ☆ Neural-Hidden-CRF: A Robust Weakly-Supervised Sequence Labeler KDD-2023 + + +
+ We propose a neuralized undirected graphical model called Neural-Hidden-CRF +to solve the weakly-supervised sequence labeling problem. Under the umbrella of +probabilistic undirected graph theory, the proposed Neural-Hidden-CRF embedded +with a hidden CRF layer models the variables of word sequence, latent ground +truth sequence, and weak label sequence with the global perspective that +undirected graphical models particularly enjoy. In Neural-Hidden-CRF, we can +capitalize on the powerful language model BERT or other deep models to provide +rich contextual semantic knowledge to the latent ground truth sequence, and use +the hidden CRF layer to capture the internal label dependencies. +Neural-Hidden-CRF is conceptually simple and empirically powerful. It obtains +new state-of-the-art results on one crowdsourcing benchmark and three +weak-supervision benchmarks, including outperforming the recent advanced model +CHMM by 2.80 F1 points and 2.23 F1 points in average generalization and +inference performance, respectively. + +
+
+ comment: 13 pages, 4 figures, accepted by SIGKDD-2023 +
+
+
+
+
+ + ☆ An Appraisal-Based Chain-Of-Emotion Architecture for Affective Language + Model Game Agents + + +
+ The development of believable, natural, and interactive digital artificial +agents is a field of growing interest. Theoretical uncertainties and technical +barriers present considerable challenges to the field, particularly with +regards to developing agents that effectively simulate human emotions. Large +language models (LLMs) might address these issues by tapping common patterns in +situational appraisal. In three empirical experiments, this study tests the +capabilities of LLMs to solve emotional intelligence tasks and to simulate +emotions. It presents and evaluates a new chain-of-emotion architecture for +emotion simulation within video games, based on psychological appraisal +research. Results show that it outperforms standard LLM architectures on a +range of user experience and content analysis metrics. This study therefore +provides early evidence of how to construct and test affective agents based on +cognitive processes represented in language models. + +
+
+
+
+
+ + ☆ The Effect of Alignment Objectives on Code-Switching Translation + + +
+ One of the things that need to change when it comes to machine translation is +the models' ability to translate code-switching content, especially with the +rise of social media and user-generated content. In this paper, we are +proposing a way of training a single machine translation model that is able to +translate monolingual sentences from one language to another, along with +translating code-switched sentences to either language. This model can be +considered a bilingual model in the human sense. For better use of parallel +data, we generated synthetic code-switched (CSW) data along with an alignment +loss on the encoder to align representations across languages. Using the WMT14 +English-French (En-Fr) dataset, the trained model strongly outperforms +bidirectional baselines on code-switched translation while maintaining quality +for non-code-switched (monolingual) data. + +
+
+ comment: This paper was originally submitted on 30/06/2022 +
+
+
+
+
+ + ☆ Chat2Brain: A Method for Mapping Open-Ended Semantic Queries to Brain + Activation Maps + + +
+ Over the decades, neuroscience has accumulated a wealth of research results in +the text modality that can be used to explore cognitive processes. +Meta-analysis is a typical method that successfully establishes a link from +text queries to brain activation maps using these research results, but it +still relies on an ideal query environment. In practical applications, text +queries used for meta-analyses may encounter issues such as semantic redundancy +and ambiguity, resulting in an inaccurate mapping to brain images. On the other +hand, large language models (LLMs) like ChatGPT have shown great potential in +tasks such as context understanding and reasoning, displaying a high degree of +consistency with human natural language. Hence, LLMs could improve the +connection between the text modality and neuroscience, resolving existing +challenges of meta-analyses. In this study, we propose a method called +Chat2Brain that combines LLMs with a basic text-to-image model, known as Text2Brain, +to map open-ended semantic queries to brain activation maps in data-scarce and +complex query environments. By utilizing the understanding and reasoning +capabilities of LLMs, the performance of the mapping model is optimized by +translating text queries into semantic queries. We demonstrate that Chat2Brain +can synthesize anatomically plausible neural activation patterns for more +complex text-query tasks. + 
+
+ comment: 8 pages, 4 figures +
+
+
+
+
+ + ☆ FOLLOWUPQG: Towards Information-Seeking Follow-up Question Generation + + +
+ Humans ask follow-up questions driven by curiosity, which reflects a creative +human cognitive process. We introduce the task of real-world +information-seeking follow-up question generation (FQG), which aims to generate +follow-up questions seeking a more in-depth understanding of an initial +question and answer. We construct FOLLOWUPQG, a dataset of over 3K real-world +(initial question, answer, follow-up question) tuples collected from a Reddit +forum providing layman-friendly explanations for open-ended questions. In +contrast to existing datasets, questions in FOLLOWUPQG use more diverse +pragmatic strategies to seek information, and they also show higher-order +cognitive skills (such as applying and relating). We evaluate current question +generation models on their efficacy for generating follow-up questions, +exploring how to generate specific types of follow-up questions based on +step-by-step demonstrations. Our results validate FOLLOWUPQG as a challenging +benchmark, as model-generated questions are adequate but far from human-raised +questions in terms of informativeness and complexity. + +
+
+
+
+
+ + ☆ Mitigating Word Bias in Zero-shot Prompt-based Classifiers + + +
+ Prompt-based classifiers are an attractive approach for zero-shot +classification. However, the precise choice of the prompt template and label +words can largely influence performance, with semantically equivalent settings +often showing notable performance difference. This discrepancy can be partly +attributed to word biases, where the classifier may be biased towards classes. +To address this problem, it is possible to optimise classification thresholds +on a labelled data set, however, this mitigates some of the advantages of +prompt-based classifiers. This paper instead approaches this problem by +examining the expected marginal probabilities of the classes. Here, +probabilities are reweighted to have a uniform prior over classes, in an +unsupervised fashion. Further, we draw a theoretical connection between the +class priors and the language models' word prior, and offer the ability to set +a threshold in a zero-resource fashion. We show that matching class priors +correlates strongly with the oracle upper bound performance and demonstrate +large consistent performance gains for prompt settings over a range of NLP +tasks. + +
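One simple way to implement the prior matching described above is to estimate each class's prior as its average zero-shot probability over an unlabelled pool, divide it out, and renormalize. The sketch below shows that unsupervised reweighting on toy probabilities; it is not the exact procedure of the paper.

```python
import numpy as np

def reweight_to_uniform_prior(probs):
    """Divide each class probability by an unsupervised estimate of the class
    prior (the column mean over the pool), then renormalize each row, so the
    classifier's implicit word prior no longer biases it towards some classes."""
    prior = probs.mean(axis=0)
    reweighted = probs / prior
    return reweighted / reweighted.sum(axis=1, keepdims=True)

# toy prompt-classifier output biased towards the first class
probs = np.array([[0.70, 0.30],
                  [0.60, 0.40],
                  [0.55, 0.45]])
print(reweight_to_uniform_prior(probs))
```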
+
+
+
+
+ + ☆ Retrieval-Augmented Meta Learning for Low-Resource Text Classification + + +
+ Meta learning has achieved promising performance in low-resource text +classification, which aims to identify target classes with knowledge transferred +from source classes via sets of small tasks named episodes. However, due to +the limited training data in the meta-learning scenario and the inherent +properties of parameterized neural networks, poor generalization performance +has become a pressing problem that needs to be addressed. To deal with this +issue, we propose a meta-learning based method called Retrieval-Augmented Meta +Learning (RAML). It not only uses parameterization for inference but also +retrieves non-parametric knowledge from an external corpus to make inferences, +which greatly alleviates the problem of poor generalization performance caused +by the lack of diverse training data in meta-learning. This method differs from +previous models that solely rely on parameters, as it explicitly emphasizes the +importance of non-parametric knowledge, aiming to strike a balance between +parameterized neural networks and non-parametric knowledge. The model is +required to determine which knowledge to access and utilize during inference. +Additionally, our multi-view passage fusion network module can effectively and +efficiently integrate the retrieved information into the low-resource +classification task. Extensive experiments demonstrate that RAML +significantly outperforms current SOTA low-resource text classification models. + 
+
+ comment: Under Review +
+
+
+
+
+ + ☆ RGAT: A Deeper Look into Syntactic Dependency Information for + Coreference Resolution + + +
+ Although syntactic information is beneficial for many NLP tasks, combining it +with contextual information between words to solve the coreference resolution +problem needs to be further explored. In this paper, we propose an end-to-end +parser that combines pre-trained BERT with a Syntactic Relation Graph Attention +Network (RGAT) to take a deeper look into the role of syntactic dependency +information for the coreference resolution task. In particular, the RGAT model +is first proposed, then used to understand the syntactic dependency graph and +learn better task-specific syntactic embeddings. An integrated architecture +incorporating BERT embeddings and syntactic embeddings is constructed to +generate blending representations for the downstream task. Our experiments on a +public Gendered Ambiguous Pronouns (GAP) dataset show that with the supervision +learning of the syntactic dependency graph and without fine-tuning the entire +BERT, we increased the F1-score of the previous best model (RGCN-with-BERT) +from 80.3% to 82.5%, compared to the F1-score by single BERT embeddings from +78.5% to 82.5%. Experimental results on another public dataset - OntoNotes 5.0 +demonstrate that the performance of the model is also improved by incorporating +syntactic dependency information learned from RGAT. + +
+
+ comment: 8 pages, 5 figures +
+
+
+
+
+ + ☆ Prompt Learning With Knowledge Memorizing Prototypes For Generalized + Few-Shot Intent Detection + + +
+ Generalized Few-Shot Intent Detection (GFSID) is challenging and realistic +because it needs to categorize both seen and novel intents simultaneously. +Previous GFSID methods rely on the episodic learning paradigm, which makes it +hard to extend to a generalized setup as they do not explicitly learn the +classification of seen categories and the knowledge of seen intents. To address +this dilemma, we propose to convert the GFSID task into the class incremental +learning paradigm. Specifically, we propose a two-stage learning framework, +which sequentially learns the knowledge of different intents in various periods +via prompt learning. We then exploit prototypes for categorizing both seen +and novel intents. Furthermore, to achieve knowledge transfer across intents in +different stages, we design, for different scenarios, two knowledge preservation +methods that are close to realistic applications. Extensive experiments and +detailed analyses on two widely used datasets show that our framework based on +the class incremental learning paradigm achieves promising performance. + 
+
+ comment: Under Review +
+
+
+
+
+ + ☆ Prefix-diffusion: A Lightweight Diffusion Model for Diverse Image + Captioning + + +
+ While impressive performance has been achieved in image captioning, the +limited diversity of the generated captions and the large parameter scale +remain major barriers to the real-world application of these systems. In this +work, we propose a lightweight image captioning network in combination with +continuous diffusion, called Prefix-diffusion. To achieve diversity, we design +an efficient method that injects prefix image embeddings into the denoising +process of the diffusion model. In order to reduce trainable parameters, we +employ a pre-trained model to extract image features and further design an +extra mapping network. Prefix-diffusion is able to generate diverse captions +with relatively few parameters, while maintaining the fluency and relevance of +the captions, benefiting from the generative capabilities of the diffusion +model. Our work paves the way for scaling up diffusion models for image +captioning, and achieves promising performance compared with recent approaches. + 
+
+ comment: 11 pages,4 figures, 6 tables +
+
+
+
+
+ + ☆ Multi-document Summarization: A Comparative Evaluation + + +
+ This paper is aimed at evaluating state-of-the-art models for Multi-document +Summarization (MDS) on different types of datasets in various domains and +investigating the limitations of existing models to determine future research +directions. To address this gap, we conducted an extensive literature review to +identify state-of-the-art models and datasets. We analyzed the performance of +PRIMERA and PEGASUS models on BigSurvey-MDS and MS$^2$ datasets, which posed +unique challenges due to their varied domains. Our findings show that the +General-Purpose Pre-trained Model LED outperforms PRIMERA and PEGASUS on the +MS$^2$ dataset. We used the ROUGE score as a performance metric to evaluate the +identified models on different datasets. Our study provides valuable insights +into the models' strengths and weaknesses, as well as their applicability in +different domains. This work serves as a reference for future MDS research and +contributes to the development of accurate and robust models which can be +utilized on demanding datasets with academically and/or scientifically complex +data as well as generalized, relatively simple datasets. + +
+
+
+
+
+ + ☆ What's Hard in English RST Parsing? Predictive Models for Error Analysis SIGDIAL 2023 + + +
+ Despite recent advances in Natural Language Processing (NLP), hierarchical +discourse parsing in the framework of Rhetorical Structure Theory remains +challenging, and our understanding of the reasons for this is as yet limited. +In this paper, we examine and model some of the factors associated with parsing +difficulties in previous work: the existence of implicit discourse relations, +challenges in identifying long-distance relations, out-of-vocabulary items, and +more. In order to assess the relative importance of these variables, we also +release two annotated English test-sets with explicit correct and distracting +discourse markers associated with gold standard RST relations. Our results show +that, as in shallow discourse parsing, the explicit/implicit distinction plays a +role, but that long-distance dependencies are the main challenge, while lack of +lexical overlap is less of a problem, at least for in-domain parsing. Our final +model is able to predict where errors will occur with an accuracy of 76.3% for +the bottom-up parser and 76.6% for the top-down parser. + 
+
+ comment: SIGDIAL 2023 camera-ready; 12 pages +
+
+
+
+
+ + ☆ Unsupervised Chunking with Hierarchical RNN + + +
+ In Natural Language Processing (NLP), predicting linguistic structures, such +as parsing and chunking, has mostly relied on manual annotations of syntactic +structures. This paper introduces an unsupervised approach to chunking, a +syntactic task that involves grouping words in a non-hierarchical manner. We +present a two-layer Hierarchical Recurrent Neural Network (HRNN) designed to +model word-to-chunk and chunk-to-sentence compositions. Our approach involves a +two-stage training process: pretraining with an unsupervised parser and +finetuning on downstream NLP tasks. Experiments on the CoNLL-2000 dataset +reveal a notable improvement over existing unsupervised methods, enhancing +phrase F1 score by up to 6 percentage points. Further, finetuning with +downstream tasks results in an additional performance improvement. +Interestingly, we observe that the emergence of the chunking structure is +transient during the neural model's downstream-task training. This study +contributes to the advancement of unsupervised syntactic structure discovery +and opens avenues for further research in linguistic theory. + +
+
+
+
+
+ + ♻ ☆ Discover, Explanation, Improvement: An Automatic Slice Detection + Framework for Natural Language Processing + + +
+ Pretrained natural language processing (NLP) models have achieved high
+overall performance, but they still make systematic errors. Instead of manual
+error analysis, research on slice detection models (SDM), which automatically
+identify underperforming groups of datapoints, has attracted increasing attention
+in Computer Vision, both for understanding model behaviors and for providing
+insights for future model training and design. However, little research on
+SDM and quantitative evaluation of their effectiveness has been conducted on
+NLP tasks. Our paper fills this gap by proposing a benchmark named "Discover,
+Explain, Improve (DEIM)" for classification NLP tasks along with a new SDM,
+Edisa. Edisa discovers coherent and underperforming groups of datapoints; DEIM
+then unites them under human-understandable concepts and provides comprehensive
+evaluation tasks and corresponding quantitative metrics. The evaluation in DEIM
+shows that Edisa can accurately select error-prone datapoints with informative
+semantic features that summarize error patterns. Detecting difficult datapoints
+directly boosts model performance without tuning any original model parameters,
+showing that discovered slices are actionable for users.
+ +
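+ As a toy illustration of the slice-detection idea (grouping evaluation datapoints by a candidate feature and flagging groups whose accuracy falls clearly below the overall average), here is a generic sketch; it is not the Edisa algorithm, and the records are made up.
+```python
+# Generic slice-detection sketch: flag feature values with below-average accuracy.
+from collections import defaultdict
+
+# (feature_value, correct) pairs for a hypothetical evaluation set.
+records = [("question", 1), ("question", 0), ("negation", 0),
+           ("negation", 0), ("negation", 1), ("plain", 1), ("plain", 1)]
+
+overall = sum(c for _, c in records) / len(records)
+by_slice = defaultdict(list)
+for feature, correct in records:
+    by_slice[feature].append(correct)
+
+for feature, outcomes in by_slice.items():
+    acc = sum(outcomes) / len(outcomes)
+    if acc < overall - 0.1:  # flag clearly underperforming slices
+        print(f"underperforming slice: {feature} (acc={acc:.2f}, overall={overall:.2f})")
+```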
+
+ comment: 15 pages, 5 figures, accepted by Transactions of the Association for + Computational Linguistics +
+
+
+
+
+ + ♻ ☆ Exploring Large Language Models for Knowledge Graph Completion + + +
+ Knowledge graphs play a vital role in numerous artificial intelligence tasks, +yet they frequently face the issue of incompleteness. In this study, we explore +utilizing Large Language Models (LLM) for knowledge graph completion. We +consider triples in knowledge graphs as text sequences and introduce an +innovative framework called Knowledge Graph LLM (KG-LLM) to model these +triples. Our technique employs entity and relation descriptions of a triple as +prompts and utilizes the response for predictions. Experiments on various +benchmark knowledge graphs demonstrate that our method attains state-of-the-art +performance in tasks such as triple classification and relation prediction. We +also find that fine-tuning relatively smaller models (e.g., LLaMA-7B, +ChatGLM-6B) outperforms recent ChatGPT and GPT-4. + +
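+ The abstract describes prompting an LLM with entity and relation descriptions of a triple; the sketch below shows one plausible way such a prompt could be assembled. The template and example descriptions are illustrative assumptions, not the KG-LLM paper's actual format.
+```python
+# Hypothetical prompt construction for LLM-based triple classification.
+def build_triple_prompt(head, relation, tail, descriptions):
+    return (
+        f"Head entity: {head}. Description: {descriptions.get(head, '')}\n"
+        f"Relation: {relation}\n"
+        f"Tail entity: {tail}. Description: {descriptions.get(tail, '')}\n"
+        "Question: Is this triple factually correct? Answer yes or no."
+    )
+
+descriptions = {
+    "Paris": "Capital and largest city of France.",
+    "France": "Country in Western Europe.",
+}
+prompt = build_triple_prompt("Paris", "capital_of", "France", descriptions)
+print(prompt)  # The LLM's yes/no response would then be mapped to a label.
+```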
+
+ comment: Work in progress +
+
+
+
+
+ + ♻ ☆ HopPG: Self-Iterative Program Generation for Multi-Hop Question + Answering over Heterogeneous Knowledge + + +
+ Semantic parsing-based methods are an important research branch for
+knowledge-based question answering. They usually generate executable programs
+based on the question and then execute them to reason out answers over a knowledge
+base. Benefiting from this inherent mechanism, they have advantages in
+performance and interpretability. However, traditional semantic parsing
+methods usually generate a complete program before executing it, which
+struggles with multi-hop question answering over heterogeneous knowledge. On
+the one hand, generating a complete multi-hop program relies on multiple
+heterogeneous supporting facts, and it is difficult for generators to
+understand these facts simultaneously. On the other hand, this approach ignores the
+semantic information of the intermediate answers at each hop, which is
+beneficial for subsequent generation. To alleviate these challenges, we propose
+a self-iterative framework for multi-hop program generation (HopPG) over
+heterogeneous knowledge, which leverages the previous execution results to
+retrieve supporting facts and generate subsequent programs hop by hop. We
+evaluate our model on MMQA-T^2, and the experimental results show that HopPG
+outperforms existing semantic-parsing-based baselines, especially on
+multi-hop questions.
+ +
+
+
+
+
+ + ♻ ☆ What can Large Language Models do in chemistry? A comprehensive + benchmark on eight tasks + + +
+ Large Language Models (LLMs) with strong abilities in natural language
+processing tasks have emerged and have been applied in various areas
+such as science, finance and software engineering. However, the capability of
+LLMs to advance the field of chemistry remains unclear. In this paper, rather
+than pursuing state-of-the-art performance, we aim to evaluate the capabilities of
+LLMs in a wide range of tasks across the chemistry domain. We identify three
+key chemistry-related capabilities to explore in LLMs, namely understanding, reasoning and
+explaining, and establish a benchmark containing eight
+chemistry tasks. Our analysis draws on widely recognized datasets, facilitating
+a broad exploration of the capacities of LLMs within the context of practical
+chemistry. Five LLMs (GPT-4, GPT-3.5, Davinci-003, Llama and Galactica) are
+evaluated for each chemistry task in zero-shot and few-shot in-context learning
+settings with carefully selected demonstration examples and specially crafted
+prompts. Our investigation finds that GPT-4 outperforms the other models and that LLMs
+exhibit different levels of competence across the eight chemistry tasks. In addition to
+the key findings from the comprehensive benchmark analysis, our work provides
+insights into the limitations of current LLMs and the impact of in-context
+learning settings on LLMs' performance across various chemistry tasks. The code
+and datasets used in this study are available at
+https://github.com/ChemFoundationModels/ChemLLMBench.
+ +
+
+ comment: Add extra LLMs experiments; more baselines and more investigations on + SELFIES, label interpretation, etc +
+
+
+
+
+ + ♻ ☆ CodeApex: A Bilingual Programming Evaluation Benchmark for Large + Language Models + + +
+ With the emergence of Large Language Models (LLMs), there has been a +significant improvement in the programming capabilities of models, attracting +growing attention from researchers. We propose CodeApex, a bilingual benchmark +dataset focusing on the programming comprehension and code generation abilities +of LLMs. CodeApex comprises three types of multiple-choice questions: +conceptual understanding, commonsense reasoning, and multi-hop reasoning, +designed to evaluate LLMs on programming comprehension tasks. Additionally, +CodeApex utilizes algorithmic questions and corresponding test cases to assess +the code quality generated by LLMs. We evaluate 14 state-of-the-art LLMs, +including both general-purpose and specialized models. GPT exhibits the best +programming capabilities, achieving approximate accuracies of 50% and 56% on +the two tasks, respectively. There is still significant room for improvement in +programming tasks. We hope that CodeApex can serve as a reference for +evaluating the coding capabilities of LLMs, further promoting their development +and growth. Datasets are released at https://github.com/APEXLAB/CodeApex.git. +CodeApex submission website is https://apex.sjtu.edu.cn/codeapex/. + +
+
+ comment: 21 pages +
+
+
+
+
+ + ♻ ☆ Evaluating Human-Language Model Interaction + + +
+ Many real-world applications of language models (LMs), such as writing +assistance and code autocomplete, involve human-LM interaction. However, most +benchmarks are non-interactive in that a model produces output without human +involvement. To evaluate human-LM interaction, we develop a new framework, +Human-AI Language-based Interaction Evaluation (HALIE), that defines the +components of interactive systems and dimensions to consider when designing +evaluation metrics. Compared to standard, non-interactive evaluation, HALIE +captures (i) the interactive process, not only the final output; (ii) the +first-person subjective experience, not just a third-party assessment; and +(iii) notions of preference beyond quality (e.g., enjoyment and ownership). We +then design five tasks to cover different forms of interaction: social +dialogue, question answering, crossword puzzles, summarization, and metaphor +generation. With four state-of-the-art LMs (three variants of OpenAI's GPT-3 +and AI21 Labs' Jurassic-1), we find that better non-interactive performance +does not always translate to better human-LM interaction. In particular, we +highlight three cases where the results from non-interactive and interactive +metrics diverge and underscore the importance of human-LM interaction for LM +evaluation. + +
+
+ comment: Authored by the Center for Research on Foundation Models (CRFM) at + the Stanford Institute for Human-Centered Artificial Intelligence (HAI) +
+
+
+
+
+ + ♻ ☆ CamChoice: A Corpus of Multiple Choice Questions and Candidate Response + Distributions + + +
+ Multiple choice exams are widely used to assess candidates across a diverse
+range of domains and tasks. To moderate question quality, newly proposed
+questions often pass through pre-test evaluation stages before being deployed
+into real-world exams. Currently, this evaluation process is manually
+intensive, which can lead to time lags in the question development cycle.
+Streamlining this process via automation can significantly enhance efficiency;
+however, there is currently a lack of datasets with adequate pre-test analysis
+information. In this paper we introduce CamChoice, a multiple-choice
+comprehension dataset of questions at different target levels, with
+corresponding candidate selection distributions. We introduce the task of
+candidate distribution matching, propose several evaluation metrics for the
+task, and demonstrate that automatic systems trained on RACE++ can be leveraged
+as baselines for our task. We further demonstrate that these automatic systems
+can be used for practical pre-test evaluation tasks such as detecting
+underperforming distractors, where our detection systems can automatically
+identify poor distractors that few candidates select. We release the data
+publicly for future research.
+ +
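+ One simple way to frame candidate distribution matching is to compare a system's predicted option-selection distribution with the observed one using a standard divergence. The sketch below uses total variation and KL divergence on made-up numbers; the paper's own metrics may differ.
+```python
+# Comparing a predicted option-selection distribution with the observed one.
+import numpy as np
+
+observed = np.array([0.55, 0.25, 0.15, 0.05])   # fraction of candidates choosing options A-D
+predicted = np.array([0.50, 0.30, 0.10, 0.10])  # a system's predicted selection distribution
+
+total_variation = 0.5 * np.abs(observed - predicted).sum()
+kl = float(np.sum(observed * np.log(observed / predicted)))  # KL(observed || predicted)
+print(f"total variation: {total_variation:.3f}, KL divergence: {kl:.3f}")
+
+# Assuming option A is the key, distractors that almost nobody selects are easy to flag:
+weak_distractors = [i for i, p in enumerate(observed[1:], start=1) if p < 0.10]
+print("possible weak distractors (option index):", weak_distractors)
+```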
+
+
+
+
+ + ♻ ☆ Challenging the Machinery of Generative AI with Fact-Checking: + Ontology-Driven Biological Graphs for Verifying Human Disease-Gene Links + + +
+ Methods: We adopted a biological networks approach that enables the
+systematic interrogation of ChatGPT's linked entities. In particular, we
+designed an ontology-driven fact-checking algorithm that compares biological
+graphs constructed from approximately 200,000 PubMed abstracts with
+counterparts constructed from a dataset generated using the ChatGPT-3.5 Turbo
+model. The nodes refer to biological entities (genes and diseases) that occur
+in the text. The edges represent the co-occurrence relationships of two
+entities mentioned in the same document, weighted by the proximity distance
+between these two entities. This research assumes a ``closed-world
+assumption'', meaning that fact-checking is performed only using the literature
+dataset as our ground truth. Results: In ten samples of 250 randomly selected
+records from the ChatGPT dataset of 1000 ``simulated'' articles, the
+fact-checking link accuracy ranged from 70% to 86%, while the remainder of the
+links remained unverified. Given the closed-world assumption, the fact-checking
+precision is significant. When measuring and comparing the proximity distances
+of the edges of literature graphs against ChatGPT graphs, we found that the
+ChatGPT distances were significantly shorter (ranging from 90 to 153 characters).
+In contrast, the proximity distance of biological entities identified
+in the literature ranged from 236 to 765 characters. This pattern held
+true for all the relationships among biological entities in the ten samples.
+Conclusion: This study demonstrated a reasonably high percentage accuracy of
+aggregate fact-checking of disease-gene relationships found in
+ChatGPT-generated texts. The strikingly consistent pattern of short proximity
+distances across all samples offers illuminating feedback on the biological
+knowledge we possess in the literature today.
+ +
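+ A minimal sketch of the proximity-weighted co-occurrence graph described above is shown here: entities found in the same document are connected by an edge weighted with their character-level distance. The entity list and toy sentence are illustrative, not data from the study.
+```python
+# Proximity-weighted gene/disease co-occurrence graph over a toy document.
+import itertools
+import networkx as nx
+
+entities = ["BRCA1", "TP53", "breast cancer"]  # hypothetical entity names
+doc = "Mutations in BRCA1 are strongly associated with breast cancer; TP53 is also implicated."
+
+positions = {e: doc.find(e) for e in entities if e in doc}
+
+G = nx.Graph()
+for (e1, p1), (e2, p2) in itertools.combinations(positions.items(), 2):
+    G.add_edge(e1, e2, weight=abs(p1 - p2))   # character-level proximity distance
+
+for u, v, data in G.edges(data=True):
+    print(f"{u} -- {v}: character distance {data['weight']}")
+```
+ Fact-checking then amounts to asking whether an edge from the ChatGPT-derived graph also appears in the literature-derived graph, and comparing the edge weights.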
+
+ comment: 9 Pages, 3 algorithms, 5 tables, and 8 figures +
+
+
+
+
+ + ♻ ☆ A Survey of Knowledge Enhanced Pre-trained Models + + +
+ Pre-trained language models learn informative word representations on a +large-scale text corpus through self-supervised learning, which has achieved +promising performance in fields of natural language processing (NLP) after +fine-tuning. These models, however, suffer from poor robustness and lack of +interpretability. We refer to pre-trained language models with knowledge +injection as knowledge-enhanced pre-trained language models (KEPLMs). These +models demonstrate deep understanding and logical reasoning and introduce +interpretability. In this survey, we provide a comprehensive overview of KEPLMs +in NLP. We first discuss the advancements in pre-trained language models and +knowledge representation learning. Then we systematically categorize existing +KEPLMs from three different perspectives. Finally, we outline some potential +directions of KEPLMs for future research. + +
+
+ comment: 32 pages, 15 figures +
+
+
+
+
+
+
+
+ + Computer Vision and Pattern Recognition 44 + +
+
+
+ + ☆ Collecting Visually-Grounded Dialogue with A Game Of Sorts LREC 2022 + + +
+ An idealized, though simplistic, view of the referring expression production
+and grounding process in (situated) dialogue assumes that a speaker must merely
+appropriately specify their expression so that the target referent may be
+successfully identified by the addressee. However, referring in conversation is
+a collaborative process that cannot be aptly characterized as an exchange of
+minimally-specified referring expressions. Concerns have been raised regarding
+assumptions made by prior work on visually-grounded dialogue that reveal an
+oversimplified view of conversation and the referential process. We address
+these concerns by introducing a collaborative image ranking task, a grounded
+agreement game we call "A Game Of Sorts". In our game, players are tasked with
+reaching agreement on how to rank a set of images given some sorting criterion
+through a largely unrestricted, role-symmetric dialogue. By putting emphasis on
+the argumentation in this mixed-initiative interaction, we collect discussions
+that involve the collaborative referential process. We describe results of a
+small-scale data collection experiment with the proposed task. All discussed
+materials, which include the collected data, the codebase, and a containerized
+version of the application, are publicly available.
+ +
+
+ comment: Published at LREC 2022 +
+
+
+
+
+ + ☆ Faster, Lighter, More Accurate: A Deep Learning Ensemble for Content + Moderation ICML + + +
+ To address the increasing need for efficient and accurate content moderation, +we propose an efficient and lightweight deep classification ensemble structure. +Our approach is based on a combination of simple visual features, designed for +high-accuracy classification of violent content with low false positives. Our +ensemble architecture utilizes a set of lightweight models with narrowed-down +color features, and we apply it to both images and videos. + We evaluated our approach using a large dataset of explosion and blast +contents and compared its performance to popular deep learning models such as +ResNet-50. Our evaluation results demonstrate significant improvements in +prediction accuracy, while benefiting from 7.64x faster inference and lower +computation cost. + While our approach is tailored to explosion detection, it can be applied to +other similar content moderation and violence detection use cases as well. +Based on our experiments, we propose a "think small, think many" philosophy in +classification scenarios. We argue that transforming a single, large, +monolithic deep model into a verification-based step model ensemble of multiple +small, simple, and lightweight models with narrowed-down visual features can +possibly lead to predictions with higher accuracy. + +
+
+ comment: 6 pages, 22nd IEEE International Conference on Machine Learning and + Applications (IEEE ICMLA'23), December 15-17, 2023, Jacksonville Riverfront, + Florida, USA. arXiv admin note: substantial text overlap with + arXiv:2103.10350 +
+
+
+
+
+ + ☆ Beyond Skin Tone: A Multidimensional Measure of Apparent Skin Color ICCV + + +
+ This paper strives to measure apparent skin color in computer vision, beyond +a unidimensional scale on skin tone. In their seminal paper Gender Shades, +Buolamwini and Gebru have shown how gender classification systems can be biased +against women with darker skin tones. Subsequently, fairness researchers and +practitioners have adopted the Fitzpatrick skin type classification as a common +measure to assess skin color bias in computer vision systems. While effective, +the Fitzpatrick scale only focuses on the skin tone ranging from light to dark. +Towards a more comprehensive measure of skin color, we introduce the hue angle +ranging from red to yellow. When applied to images, the hue dimension reveals +additional biases related to skin color in both computer vision datasets and +models. We then recommend multidimensional skin color scales, relying on both +skin tone and hue, for fairness assessments. + +
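+ The hue dimension proposed above can be made concrete in the CIELAB colour space, where the hue angle is atan2(b*, a*); the sketch below computes it for a synthetic skin patch with scikit-image. The exact measurement pipeline used in the paper may differ.
+```python
+# Skin tone (L*) and hue angle (atan2(b*, a*)) for a patch of pixels, via CIELAB.
+import numpy as np
+from skimage import color
+
+# Hypothetical 8x8 skin-coloured patch in RGB, normalised to [0, 1].
+patch = np.ones((8, 8, 3)) * np.array([198.0, 134.0, 66.0]) / 255.0
+lab = color.rgb2lab(patch)
+
+lightness = lab[..., 0].mean()                       # perceptual lightness, the tone axis
+a_star, b_star = lab[..., 1].mean(), lab[..., 2].mean()
+hue_angle = np.degrees(np.arctan2(b_star, a_star))   # towards 0 deg is red, towards 90 deg is yellow
+
+print(f"L* = {lightness:.1f}, hue angle = {hue_angle:.1f} degrees")
+```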
+
+ comment: Accepted at the International Conference on Computer Vision (ICCV) + 2023 +
+
+
+
+
+ + ☆ A Skeleton-based Approach For Rock Crack Detection Towards A Climbing + Robot Application + + +
+ Conventional wheeled robots are unable to traverse scientifically +interesting, but dangerous, cave environments. Multi-limbed climbing robot +designs, such as ReachBot, are able to grasp irregular surface features and +execute climbing motions to overcome obstacles, given suitable grasp locations. +To support grasp site identification, we present a method for detecting rock +cracks and edges, the SKeleton Intersection Loss (SKIL). SKIL is a loss +designed for thin object segmentation that leverages the skeleton of the label. +A dataset of rock face images was collected, manually annotated, and augmented +with generated data. A new group of metrics, LineAcc, has been proposed for +thin object segmentation such that the impact of the object width on the score +is minimized. In addition, the metric is less sensitive to translation which +can often lead to a score of zero when computing classical metrics such as Dice +on thin objects. Our fine-tuned models outperform previous methods on similar +thin object segmentation tasks such as blood vessel segmentation and show +promise for integration onto a robotic system. + +
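+ To make the skeleton-based idea concrete, here is a small, generic skeleton-overlap score for thin structures in the spirit of SKIL and LineAcc (but not the paper's exact definitions), using scikit-image's skeletonize.
+```python
+# Skeleton-based overlap score for thin structures (illustrative, not the exact SKIL loss).
+import numpy as np
+from skimage.morphology import skeletonize
+
+label = np.zeros((32, 32), dtype=bool)
+pred = np.zeros((32, 32), dtype=bool)
+label[10:13, 4:28] = True   # a thin horizontal "crack", 3 px wide
+pred[11:14, 6:30] = True    # prediction: slightly shifted and offset
+
+label_skel = skeletonize(label)
+pred_skel = skeletonize(pred)
+recall_like = (pred & label_skel).sum() / max(label_skel.sum(), 1)     # label skeleton covered by prediction
+precision_like = (label & pred_skel).sum() / max(pred_skel.sum(), 1)   # prediction skeleton inside label
+print(f"skeleton recall ~ {recall_like:.2f}, skeleton precision ~ {precision_like:.2f}")
+```
+ Because the score is computed on one-pixel-wide skeletons, it is far less sensitive to the object's width and to small translations than Dice on the full masks.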
+
+
+
+
+ + ☆ DAD++: Improved Data-free Test Time Adversarial Defense + + +
+ With the increasing deployment of deep neural networks in safety-critical +applications such as self-driving cars, medical imaging, anomaly detection, +etc., adversarial robustness has become a crucial concern in the reliability of +these networks in real-world scenarios. A plethora of works based on +adversarial training and regularization-based techniques have been proposed to +make these deep networks robust against adversarial attacks. However, these +methods require either retraining models or training them from scratch, making +them infeasible to defend pre-trained models when access to training data is +restricted. To address this problem, we propose a test time Data-free +Adversarial Defense (DAD) containing detection and correction frameworks. +Moreover, to further improve the efficacy of the correction framework in cases +when the detector is under-confident, we propose a soft-detection scheme +(dubbed as "DAD++"). We conduct a wide range of experiments and ablations on +several datasets and network architectures to show the efficacy of our proposed +approach. Furthermore, we demonstrate the applicability of our approach in +imparting adversarial defense at test time under data-free (or data-efficient) +applications/setups, such as Data-free Knowledge Distillation and Source-free +Unsupervised Domain Adaptation, as well as Semi-supervised classification +frameworks. We observe that in all the experiments and applications, our DAD++ +gives an impressive performance against various adversarial attacks with a +minimal drop in clean accuracy. The source code is available at: +https://github.com/vcl-iisc/Improved-Data-free-Test-Time-Adversarial-Defense + +
+
+ comment: IJCV Journal (Under Review) +
+
+
+
+
+ + ☆ 3D Implicit Transporter for Temporally Consistent Keypoint Discovery ICCV2023 + + +
+ Keypoint-based representation has proven advantageous in various visual and +robotic tasks. However, the existing 2D and 3D methods for detecting keypoints +mainly rely on geometric consistency to achieve spatial alignment, neglecting +temporal consistency. To address this issue, the Transporter method was +introduced for 2D data, which reconstructs the target frame from the source +frame to incorporate both spatial and temporal information. However, the direct +application of the Transporter to 3D point clouds is infeasible due to their +structural differences from 2D images. Thus, we propose the first 3D version of +the Transporter, which leverages hybrid 3D representation, cross attention, and +implicit reconstruction. We apply this new learning system on 3D articulated +objects and nonrigid animals (humans and rodents) and show that learned +keypoints are spatio-temporally consistent. Additionally, we propose a +closed-loop control strategy that utilizes the learned keypoints for 3D object +manipulation and demonstrate its superior performance. Codes are available at +https://github.com/zhongcl-thu/3D-Implicit-Transporter. + +
+
+ comment: ICCV2023 oral paper +
+
+
+
+
+ + ☆ MaskRenderer: 3D-Infused Multi-Mask Realistic Face Reenactment + + +
+ We present a novel end-to-end identity-agnostic face reenactment system,
+MaskRenderer, that can generate realistic, high-fidelity frames in real time.
+Although recent face reenactment works have shown promising results, there are
+still significant challenges such as identity leakage and imitating mouth
+movements, especially for large pose changes and occluded faces. MaskRenderer
+tackles these problems by using (i) a 3DMM to model 3D face structure to better
+handle pose changes, occlusion, and mouth movements compared to 2D
+representations; (ii) a triplet loss function to embed the cross-reenactment
+during training for better identity preservation; and (iii) multi-scale
+occlusion, improving inpainting and restoring missing areas. Comprehensive
+quantitative and qualitative experiments conducted on the VoxCeleb1 test set
+demonstrate that MaskRenderer outperforms state-of-the-art models on unseen
+faces, especially when the Source and Driving identities are very different.
+ +
+
+
+
+
+ + ☆ Sculpting Efficiency: Pruning Medical Imaging Models for On-Device + Inference + + +
+ Applying ML advancements to healthcare can improve patient outcomes. However, +the sheer operational complexity of ML models, combined with legacy hardware +and multi-modal gigapixel images, poses a severe deployment limitation for +real-time, on-device inference. We consider filter pruning as a solution, +exploring segmentation models in cardiology and ophthalmology. Our preliminary +results show a compression rate of up to 1148x with minimal loss in quality, +stressing the need to consider task complexity and architectural details when +using off-the-shelf models. At high compression rates, filter-pruned models +exhibit faster inference on a CPU than the GPU baseline. We also demonstrate +that such models' robustness and generalisability characteristics exceed that +of the baseline and weight-pruned counterparts. We uncover intriguing questions +and take a step towards realising cost-effective disease diagnosis, monitoring, +and preventive solutions. + +
+
+
+
+
+ + ☆ FreeMan: Towards Benchmarking 3D Human Pose Estimation in the Wild + + +
+ Estimating the 3D structure of the human body from natural scenes is a +fundamental aspect of visual perception. This task carries great importance for +fields like AIGC and human-robot interaction. In practice, 3D human pose +estimation in real-world settings is a critical initial step in solving this +problem. However, the current datasets, often collected under controlled +laboratory conditions using complex motion capture equipment and unvarying +backgrounds, are insufficient. The absence of real-world datasets is stalling +the progress of this crucial task. To facilitate the development of 3D pose +estimation, we present FreeMan, the first large-scale, real-world multi-view +dataset. FreeMan was captured by synchronizing 8 smartphones across diverse +scenarios. It comprises 11M frames from 8000 sequences, viewed from different +perspectives. These sequences cover 40 subjects across 10 different scenarios, +each with varying lighting conditions. We have also established an automated, +precise labeling pipeline that allows for large-scale processing efficiently. +We provide comprehensive evaluation baselines for a range of tasks, underlining +the significant challenges posed by FreeMan. Further evaluations of standard +indoor/outdoor human sensing datasets reveal that FreeMan offers robust +representation transferability in real and complex scenes. FreeMan is now +publicly available at https://wangjiongw.github.io/freeman. + +
+
+ comment: 18 pages, 9 figures. Project page: + https://wangjiongw.github.io/freeman/; + https://github.com/wangjiongw/FreeMan_API +
+
+
+
+
+ + ☆ Super-Resolution Surface Reconstruction from Few Low-Resolution Slices + + +
+ In many imaging applications where segmented features (e.g. blood vessels)
+are further used for other numerical simulations (e.g. finite element
+analysis), the obtained surfaces do not have fine resolutions suitable for the
+task. Increasing the resolution of such surfaces becomes crucial. This paper
+proposes a new variational model for solving this problem, based on an
+Euler-Elastica-based regulariser. Further, we propose and implement two
+numerical algorithms for solving the model, a projected gradient descent method
+and the alternating direction method of multipliers. Numerical experiments
+using real-life examples (including two from the outputs of another variational
+model) illustrate the effectiveness of the approach. The advantages of the new model
+are shown through quantitative comparisons of the standard deviation of
+Gaussian curvatures and mean curvatures from the viewpoint of discrete
+geometry.
+ +
+
+ comment: 33 pages, 25 figures +
+
+
+
+
+ + ☆ Exploiting CLIP for Zero-shot HOI Detection Requires Knowledge + Distillation at Multiple Levels + + +
+ In this paper, we investigate the task of zero-shot human-object interaction +(HOI) detection, a novel paradigm for identifying HOIs without the need for +task-specific annotations. To address this challenging task, we employ CLIP, a +large-scale pre-trained vision-language model (VLM), for knowledge distillation +on multiple levels. Specifically, we design a multi-branch neural network that +leverages CLIP for learning HOI representations at various levels, including +global images, local union regions encompassing human-object pairs, and +individual instances of humans or objects. To train our model, CLIP is utilized +to generate HOI scores for both global images and local union regions that +serve as supervision signals. The extensive experiments demonstrate the +effectiveness of our novel multi-level CLIP knowledge integration strategy. +Notably, the model achieves strong performance, which is even comparable with +some fully-supervised and weakly-supervised methods on the public HICO-DET +benchmark. + +
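+ The supervision signal described above boils down to zero-shot scoring of an image (or a cropped human-object union region) against HOI text prompts with CLIP. A sketch using OpenAI's CLIP package is given below; the prompts and the crop filename are placeholders, and the paper's multi-branch network is not reproduced here.
+```python
+# Zero-shot HOI scoring of an image region with CLIP (supervision-signal idea only).
+import torch
+import clip  # https://github.com/openai/CLIP
+from PIL import Image
+
+device = "cuda" if torch.cuda.is_available() else "cpu"
+model, preprocess = clip.load("ViT-B/32", device=device)
+
+hoi_prompts = ["a photo of a person riding a bicycle",
+               "a photo of a person holding a cup",
+               "a photo of a person feeding a horse"]   # illustrative HOI classes
+
+image = preprocess(Image.open("union_region.jpg")).unsqueeze(0).to(device)  # hypothetical crop
+text = clip.tokenize(hoi_prompts).to(device)
+
+with torch.no_grad():
+    image_features = model.encode_image(image)
+    text_features = model.encode_text(text)
+    image_features = image_features / image_features.norm(dim=-1, keepdim=True)
+    text_features = text_features / text_features.norm(dim=-1, keepdim=True)
+    hoi_scores = (100.0 * image_features @ text_features.T).softmax(dim=-1)
+
+print(dict(zip(hoi_prompts, hoi_scores.squeeze(0).tolist())))
+```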
+
+
+
+
+ + ☆ Multi-view Self-supervised Disentanglement for General Image Denoising ICCV 2023 + + +
+ With its significant performance improvements, the deep learning paradigm has +become a standard tool for modern image denoisers. While promising performance +has been shown on seen noise distributions, existing approaches often suffer +from generalisation to unseen noise types or general and real noise. It is +understandable as the model is designed to learn paired mapping (e.g. from a +noisy image to its clean version). In this paper, we instead propose to learn +to disentangle the noisy image, under the intuitive assumption that different +corrupted versions of the same clean image share a common latent space. A +self-supervised learning framework is proposed to achieve the goal, without +looking at the latent clean image. By taking two different corrupted versions +of the same image as input, the proposed Multi-view Self-supervised +Disentanglement (MeD) approach learns to disentangle the latent clean features +from the corruptions and recover the clean image consequently. Extensive +experimental analysis on both synthetic and real noise shows the superiority of +the proposed method over prior self-supervised approaches, especially on unseen +novel noise types. On real noise, the proposed method even outperforms its +supervised counterparts by over 3 dB. + +
+
+ comment: International Conference on Computer Vision 2023 (ICCV 2023) +
+
+
+
+
+ + ☆ What Is Near?: Room Locality Learning for Enhanced Robot + Vision-Language-Navigation in Indoor Living Environments + + +
+ Humans use their knowledge of common house layouts obtained from previous +experiences to predict nearby rooms while navigating in new environments. This +greatly helps them navigate previously unseen environments and locate their +target room. To provide layout prior knowledge to navigational agents based on +common human living spaces, we propose WIN (\textit{W}hat \textit{I}s +\textit{N}ear), a commonsense learning model for Vision Language Navigation +(VLN) tasks. VLN requires an agent to traverse indoor environments based on +descriptive navigational instructions. Unlike existing layout learning works, +WIN predicts the local neighborhood map based on prior knowledge of living +spaces and current observation, operating on an imagined global map of the +entire environment. The model infers neighborhood regions based on visual cues +of current observations, navigational history, and layout common sense. We show +that local-global planning based on locality knowledge and predicting the +indoor layout allows the agent to efficiently select the appropriate action. +Specifically, we devised a cross-modal transformer that utilizes this locality +prior for decision-making in addition to visual inputs and instructions. +Experimental results show that locality learning using WIN provides better +generalizability compared to classical VLN agents in unseen environments. Our +model performs favorably on standard VLN metrics, with Success Rate 68\% and +Success weighted by Path Length 63\% in unseen environments. + +
+
+
+
+
+ + ☆ Unified Contrastive Fusion Transformer for Multimodal Human Action + Recognition + + +
+ Various types of sensors have been considered to develop human action +recognition (HAR) models. Robust HAR performance can be achieved by fusing +multimodal data acquired by different sensors. In this paper, we introduce a +new multimodal fusion architecture, referred to as Unified Contrastive Fusion +Transformer (UCFFormer) designed to integrate data with diverse distributions +to enhance HAR performance. Based on the embedding features extracted from each +modality, UCFFormer employs the Unified Transformer to capture the +inter-dependency among embeddings in both time and modality domains. We present +the Factorized Time-Modality Attention to perform self-attention efficiently +for the Unified Transformer. UCFFormer also incorporates contrastive learning +to reduce the discrepancy in feature distributions across various modalities, +thus generating semantically aligned features for information fusion. +Performance evaluation conducted on two popular datasets, UTD-MHAD and NTU +RGB+D, demonstrates that UCFFormer achieves state-of-the-art performance, +outperforming competing methods by considerable margins. + +
+
+
+
+
+ + ☆ SC-NeRF: Self-Correcting Neural Radiance Field with Sparse Views + + +
+ In recent studies, the generalization of neural radiance fields for novel +view synthesis task has been widely explored. However, existing methods are +limited to objects and indoor scenes. In this work, we extend the +generalization task to outdoor scenes, trained only on object-level datasets. +This approach presents two challenges. Firstly, the significant distributional +shift between training and testing scenes leads to black artifacts in rendering +results. Secondly, viewpoint changes in outdoor scenes cause ghosting or +missing regions in rendered images. To address these challenges, we propose a +geometric correction module and an appearance correction module based on +multi-head attention mechanisms. We normalize rendered depth and combine it +with light direction as query in the attention mechanism. Our network +effectively corrects varying scene structures and geometric features in outdoor +scenes, generalizing well from object-level to unseen outdoor scenes. +Additionally, we use appearance correction module to correct appearance +features, preventing rendering artifacts like blank borders and ghosting due to +viewpoint changes. By combining these modules, our approach successfully +tackles the challenges of outdoor scene generalization, producing high-quality +rendering results. When evaluated on four datasets (Blender, DTU, LLFF, +Spaces), our network outperforms previous methods. Notably, compared to +MVSNeRF, our network improves average PSNR from 19.369 to 25.989, SSIM from +0.838 to 0.889, and reduces LPIPS from 0.265 to 0.224 on Spaces outdoor scenes. + +
+
+
+
+
+ + ☆ DeViT: Decomposing Vision Transformers for Collaborative Inference in + Edge Devices + + +
+ Recent years have witnessed the great success of vision transformer (ViT), +which has achieved state-of-the-art performance on multiple computer vision +benchmarks. However, ViT models suffer from vast amounts of parameters and high +computation cost, leading to difficult deployment on resource-constrained edge +devices. Existing solutions mostly compress ViT models to a compact model but +still cannot achieve real-time inference. To tackle this issue, we propose to +explore the divisibility of transformer structure, and decompose the large ViT +into multiple small models for collaborative inference at edge devices. Our +objective is to achieve fast and energy-efficient collaborative inference while +maintaining comparable accuracy compared with large ViTs. To this end, we first +propose a collaborative inference framework termed DeViT to facilitate edge +deployment by decomposing large ViTs. Subsequently, we design a +decomposition-and-ensemble algorithm based on knowledge distillation, termed +DEKD, to fuse multiple small decomposed models while dramatically reducing +communication overheads, and handle heterogeneous models by developing a +feature matching module to promote the imitations of decomposed models from the +large ViT. Extensive experiments for three representative ViT backbones on four +widely-used datasets demonstrate our method achieves efficient collaborative +inference for ViTs and outperforms existing lightweight ViTs, striking a good +trade-off between efficiency and accuracy. For example, our DeViTs improves +end-to-end latency by 2.89$\times$ with only 1.65% accuracy sacrifice using +CIFAR-100 compared to the large ViT, ViT-L/16, on the GPU server. DeDeiTs +surpasses the recent efficient ViT, MobileViT-S, by 3.54% in accuracy on +ImageNet-1K, while running 1.72$\times$ faster and requiring 55.28% lower +energy consumption on the edge device. + +
+
+ comment: Accepted by IEEE Transactions on Mobile Computing +
+
+
+
+
+ + ☆ Geometrically Consistent Partial Shape Matching + + +
+ Finding correspondences between 3D shapes is a crucial problem in computer +vision and graphics, which is for example relevant for tasks like shape +interpolation, pose transfer, or texture transfer. An often neglected but +essential property of matchings is geometric consistency, which means that +neighboring triangles in one shape are consistently matched to neighboring +triangles in the other shape. Moreover, while in practice one often has only +access to partial observations of a 3D shape (e.g. due to occlusion, or +scanning artifacts), there do not exist any methods that directly address +geometrically consistent partial shape matching. In this work we fill this gap +by proposing to integrate state-of-the-art deep shape features into a novel +integer linear programming partial shape matching formulation. Our optimization +yields a globally optimal solution on low resolution shapes, which we then +refine using a coarse-to-fine scheme. We show that our method can find more +reliable results on partial shapes in comparison to existing geometrically +consistent algorithms (for which one first has to fill missing parts with a +dummy geometry). Moreover, our matchings are substantially smoother than +learning-based state-of-the-art shape matching methods. + +
+
+
+
+
+ + ☆ Towards Fully Decoupled End-to-End Person Search + + +
+ End-to-end person search aims to jointly detect and re-identify a target
+person in raw scene images with a unified model. The detection task unifies all
+persons while the re-id task discriminates between different identities, resulting in
+conflicting optimization objectives. Existing works have proposed to decouple end-to-end
+person search to alleviate this conflict. Yet these methods are still
+sub-optimal on one or two of the sub-tasks due to their partially decoupled
+models, which limits the overall person search performance. In this paper, we
+propose to fully decouple person search towards optimal performance. A
+task-incremental person search network is proposed to incrementally construct
+an end-to-end model for the detection and re-id sub-tasks, which decouples the
+model architecture for the two sub-tasks. The proposed task-incremental network
+allows task-incremental training for the two conflicting tasks. This enables
+independent learning for different objectives, thus fully decoupling the model
+for person search. Comprehensive experimental evaluations demonstrate the
+effectiveness of the proposed fully decoupled models for end-to-end person
+search.
+ +
+
+ comment: DICTA 2023 +
+
+
+
+
+ + ☆ Prefix-diffusion: A Lightweight Diffusion Model for Diverse Image + Captioning + + +
+ While impressive performance has been achieved in image captioning, the
+limited diversity of the generated captions and the large parameter scale
+remain major barriers to the real-world application of these systems. In this
+work, we propose a lightweight image captioning network in combination with
+continuous diffusion, called Prefix-diffusion. To achieve diversity, we design
+an efficient method that injects prefix image embeddings into the denoising
+process of the diffusion model. In order to reduce trainable parameters, we
+employ a pre-trained model to extract image features and further design an
+extra mapping network. Prefix-diffusion is able to generate diverse captions
+with relatively few parameters, while maintaining the fluency and relevance of
+the captions, benefiting from the generative capabilities of the diffusion
+model. Our work paves the way for scaling up diffusion models for image
+captioning, and achieves promising performance compared with recent approaches.
+ +
+
+ comment: 11 pages,4 figures, 6 tables +
+
+
+
+
+ + ☆ Multi-modal Extreme Classification + + +
+ This paper develops the MUFIN technique for extreme classification (XC) tasks
+with millions of labels where datapoints and labels are endowed with visual and
+textual descriptors. Applications of MUFIN to product-to-product recommendation
+and bid query prediction over several millions of products are presented.
+Contemporary multi-modal methods frequently rely on purely embedding-based
+methods. On the other hand, XC methods utilize classifier architectures to
+offer superior accuracy over embedding-only methods but mostly focus on
+text-based categorization tasks. MUFIN bridges this gap by reformulating
+multi-modal categorization as an XC problem with several millions of labels.
+This presents the twin challenges of developing multi-modal architectures that
+can offer embeddings sufficiently expressive to allow accurate categorization
+over millions of labels, and training and inference routines that scale
+logarithmically in the number of labels. MUFIN develops an architecture based
+on cross-modal attention and trains it in a modular fashion using pre-training
+and positive and negative mining. A novel product-to-product recommendation
+dataset, MM-AmazonTitles-300K, containing over 300K products was curated from
+publicly available amazon.com listings with each product endowed with a title
+and multiple images. On all the datasets, MUFIN offered at least 3% higher
+accuracy than leading text-based, image-based and multi-modal techniques. Code
+for MUFIN is available at https://github.com/Extreme-classification/MUFIN
+ +
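+ For readers unfamiliar with cross-modal attention, the minimal PyTorch block below fuses text token embeddings with image patch embeddings by letting the text attend over the image; it is a generic sketch of the mechanism, not MUFIN's actual architecture or dimensions.
+```python
+# Minimal cross-modal attention: text tokens attend over image patch embeddings.
+import torch
+import torch.nn as nn
+
+class CrossModalAttention(nn.Module):
+    def __init__(self, dim: int = 256, heads: int = 4):
+        super().__init__()
+        self.attn = nn.MultiheadAttention(dim, heads, batch_first=True)
+        self.norm = nn.LayerNorm(dim)
+
+    def forward(self, text_tokens, image_patches):
+        # Queries come from the text; keys and values come from the image.
+        fused, _ = self.attn(text_tokens, image_patches, image_patches)
+        return self.norm(text_tokens + fused)   # residual connection
+
+text = torch.randn(2, 16, 256)    # (batch, text tokens, dim)
+image = torch.randn(2, 49, 256)   # (batch, image patches, dim)
+print(CrossModalAttention()(text, image).shape)  # torch.Size([2, 16, 256])
+```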
+
+
+
+
+ + ☆ SdCT-GAN: Reconstructing CT from Biplanar X-Rays with Self-driven + Generative Adversarial Networks + + +
+ Computed Tomography (CT) is a medical imaging modality that can generate more +informative 3D images than 2D X-rays. However, this advantage comes at the +expense of more radiation exposure, higher costs, and longer acquisition time. +Hence, the reconstruction of 3D CT images using a limited number of 2D X-rays +has gained significant importance as an economical alternative. Nevertheless, +existing methods primarily prioritize minimizing pixel/voxel-level intensity +discrepancies, often neglecting the preservation of textural details in the +synthesized images. This oversight directly impacts the quality of the +reconstructed images and thus affects the clinical diagnosis. To address the +deficits, this paper presents a new self-driven generative adversarial network +model (SdCT-GAN), which is motivated to pay more attention to image details by +introducing a novel auto-encoder structure in the discriminator. In addition, a +Sobel Gradient Guider (SGG) idea is applied throughout the model, where the +edge information from the 2D X-ray image at the input can be integrated. +Moreover, LPIPS (Learned Perceptual Image Patch Similarity) evaluation metric +is adopted that can quantitatively evaluate the fine contours and textures of +reconstructed images better than the existing ones. Finally, the qualitative +and quantitative results of the empirical studies justify the power of the +proposed model compared to mainstream state-of-the-art baselines. + +
+
+
+
+
+ + ☆ Semi-Supervised learning for Face Anti-Spoofing using Apex frame + + +
+ Conventional feature extraction techniques in the face anti-spoofing domain +either analyze the entire video sequence or focus on a specific segment to +improve model performance. However, identifying the optimal frames that provide +the most valuable input for the face anti-spoofing remains a challenging task. +In this paper, we address this challenge by employing Gaussian weighting to +create apex frames for videos. Specifically, an apex frame is derived from a +video by computing a weighted sum of its frames, where the weights are +determined using a Gaussian distribution centered around the video's central +frame. Furthermore, we explore various temporal lengths to produce multiple +unlabeled apex frames using a Gaussian function, without the need for +convolution. By doing so, we leverage the benefits of semi-supervised learning, +which considers both labeled and unlabeled apex frames to effectively +discriminate between live and spoof classes. Our key contribution emphasizes +the apex frame's capacity to represent the most significant moments in the +video, while unlabeled apex frames facilitate efficient semi-supervised +learning, as they enable the model to learn from videos of varying temporal +lengths. Experimental results using four face anti-spoofing databases: CASIA, +REPLAY-ATTACK, OULU-NPU, and MSU-MFSD demonstrate the apex frame's efficacy in +advancing face anti-spoofing techniques. + +
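+ The apex-frame construction described above is simple enough to write out directly: a Gaussian-weighted average of the frames, centred on the middle frame. The sigma value and the random "video" below are illustrative choices only.
+```python
+# Apex frame as a Gaussian-weighted sum of video frames, centred on the middle frame.
+import numpy as np
+
+def apex_frame(frames: np.ndarray, sigma: float = 5.0) -> np.ndarray:
+    """frames: (T, H, W, C) array; returns a single (H, W, C) apex frame."""
+    t = np.arange(len(frames))
+    center = (len(frames) - 1) / 2.0
+    weights = np.exp(-0.5 * ((t - center) / sigma) ** 2)
+    weights /= weights.sum()
+    return np.tensordot(weights, frames, axes=1)
+
+video = np.random.rand(30, 64, 64, 3)   # hypothetical 30-frame clip
+print(apex_frame(video).shape)           # (64, 64, 3)
+```
+ Varying sigma (the temporal length mentioned in the abstract) yields multiple unlabeled apex frames from the same clip.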
+
+
+
+
+ + ☆ Anatomy Completor: A Multi-class Completion Framework for 3D Anatomy + Reconstruction + + +
+ In this paper, we introduce a completion framework to reconstruct the +geometric shapes of various anatomies, including organs, vessels and muscles. +Our work targets a scenario where one or multiple anatomies are missing in the +imaging data due to surgical, pathological or traumatic factors, or simply +because these anatomies are not covered by image acquisition. Automatic +reconstruction of the missing anatomies benefits many applications, such as +organ 3D bio-printing, whole-body segmentation, animation realism, +paleoradiology and forensic imaging. We propose two paradigms based on a 3D +denoising auto-encoder (DAE) to solve the anatomy reconstruction problem: (i) +the DAE learns a many-to-one mapping between incomplete and complete instances; +(ii) the DAE learns directly a one-to-one residual mapping between the +incomplete instances and the target anatomies. We apply a loss aggregation +scheme that enables the DAE to learn the many-to-one mapping more effectively +and further enhances the learning of the residual mapping. On top of this, we +extend the DAE to a multiclass completor by assigning a unique label to each +anatomy involved. We evaluate our method using a CT dataset with whole-body +segmentations. Results show that our method produces reasonable anatomy +reconstructions given instances with different levels of incompleteness (i.e., +one or multiple random anatomies are missing). Codes and pretrained models are +publicly available at https://github.com/Jianningli/medshapenet-feedback/ +tree/main/anatomy-completor + +
+
+ comment: 15 pages +
+
+
+
+
+ + ☆ Efficient Emotional Adaptation for Audio-Driven Talking-Head Generation ICCV 2023 + + +
+ Audio-driven talking-head synthesis is a popular research topic for virtual +human-related applications. However, the inflexibility and inefficiency of +existing methods, which necessitate expensive end-to-end training to transfer +emotions from guidance videos to talking-head predictions, are significant +limitations. In this work, we propose the Emotional Adaptation for Audio-driven +Talking-head (EAT) method, which transforms emotion-agnostic talking-head +models into emotion-controllable ones in a cost-effective and efficient manner +through parameter-efficient adaptations. Our approach utilizes a pretrained +emotion-agnostic talking-head transformer and introduces three lightweight +adaptations (the Deep Emotional Prompts, Emotional Deformation Network, and +Emotional Adaptation Module) from different perspectives to enable precise and +realistic emotion controls. Our experiments demonstrate that our approach +achieves state-of-the-art performance on widely-used benchmarks, including LRW +and MEAD. Additionally, our parameter-efficient adaptations exhibit remarkable +generalization ability, even in scenarios where emotional training videos are +scarce or nonexistent. Project website: https://yuangan.github.io/eat/ + +
+
+ comment: Accepted to ICCV 2023. Project page: https://yuangan.github.io/eat/ +
+
+
+
+
+ + ☆ Text-driven Editing of 3D Scenes without Retraining + + +
+ Numerous diffusion models have recently been applied to image synthesis and +editing. However, editing 3D scenes is still in its early stages. It poses +various challenges, such as the requirement to design specific methods for +different editing types, retraining new models for various 3D scenes, and the +absence of convenient human interaction during editing. To tackle these issues, +we introduce a text-driven editing method, termed DN2N, which allows for the +direct acquisition of a NeRF model with universal editing capabilities, +eliminating the requirement for retraining. Our method employs off-the-shelf +text-based editing models of 2D images to modify the 3D scene images, followed +by a filtering process to discard poorly edited images that disrupt 3D +consistency. We then consider the remaining inconsistency as a problem of +removing noise perturbation, which can be solved by generating training data +with similar perturbation characteristics for training. We further propose +cross-view regularization terms to help the generalized NeRF model mitigate +these perturbations. Our text-driven method allows users to edit a 3D scene +with their desired description, which is more friendly, intuitive, and +practical than prior works. Empirical results show that our method achieves +multiple editing types, including but not limited to appearance editing, +weather transition, material changing, and style transfer. Most importantly, +our method generalizes well with editing abilities shared among a set of model +parameters without requiring a customized editing model for some specific +scenes, thus inferring novel views with editing effects directly from user +input. The project website is available at http://sk-fun.fun/DN2N + +
+
+ comment: Project Website: http://sk-fun.fun/DN2N +
+
+
+
+
+ + ☆ MFPNet: Multi-scale Feature Propagation Network For Lightweight Semantic + Segmentation + + +
+ In contrast to the abundant research focusing on large-scale models,
+progress in lightweight semantic segmentation appears to be advancing at a
+comparatively slower pace. However, existing compact methods often suffer from
+limited feature representation capability due to the shallowness of their
+networks. In this paper, we propose a novel lightweight segmentation
+architecture, called Multi-scale Feature Propagation Network (MFPNet), to
+address this dilemma. Specifically, we design a robust Encoder-Decoder structure
+featuring symmetrical residual blocks that consist of flexible bottleneck
+residual modules (BRMs) to explore deep and rich multi-scale semantic context.
+Furthermore, benefiting from their capacity to model latent long-range
+contextual relationships, we leverage Graph Convolutional Networks (GCNs) to
+facilitate multi-scale feature propagation between the BRM blocks. When
+evaluated on benchmark datasets, our proposed approach shows superior
+segmentation results.
+ +
+
+ comment: 5 pages, 3 figures, 5tables, conference +
+
+
+
+
+ + ☆ Effective Real Image Editing with Accelerated Iterative Diffusion + Inversion ICCV 2023 + + +
+ Despite all recent progress, it is still challenging to edit and manipulate
+natural images with modern generative models. When using a Generative Adversarial
+Network (GAN), one major hurdle is the inversion process mapping a real
+image to its corresponding noise vector in the latent space, since it is
+necessary to be able to reconstruct an image in order to edit its contents. Likewise for
+Denoising Diffusion Implicit Models (DDIM), the linearization assumption in
+each inversion step makes the whole deterministic inversion process unreliable.
+Existing approaches that have tackled the problem of inversion stability often
+incur significant trade-offs in computational efficiency. In this work we
+propose an Accelerated Iterative Diffusion Inversion method, dubbed AIDI, that
+significantly improves reconstruction accuracy with minimal additional overhead
+in space and time complexity. By using a novel blended guidance technique, we
+show that effective results can be obtained on a large range of image editing
+tasks without large classifier-free guidance in inversion. Furthermore, when
+compared with other diffusion-inversion-based works, our proposed process is
+shown to be more robust for fast image editing in the 10- and 20-diffusion-step
+regimes.
+ +
+
+ comment: Accepted to ICCV 2023 (Oral) +
+
+
+
+
+ + ☆ Transformers in Small Object Detection: A Benchmark and Survey of + State-of-the-Art + + +
+ Transformers have rapidly gained popularity in computer vision, especially in +the field of object recognition and detection. Upon examining the outcomes of +state-of-the-art object detection methods, we noticed that transformers +consistently outperformed well-established CNN-based detectors in almost every +video or image dataset. While transformer-based approaches remain at the +forefront of small object detection (SOD) techniques, this paper aims to +explore the performance benefits offered by such extensive networks and +identify potential reasons for their SOD superiority. Small objects have been +identified as one of the most challenging object types in detection frameworks +due to their low visibility. We aim to investigate potential strategies that +could enhance transformers' performance in SOD. This survey presents a taxonomy +of over 60 research studies on developed transformers for the task of SOD, +spanning the years 2020 to 2023. These studies encompass a variety of detection +applications, including small object detection in generic images, aerial +images, medical images, active millimeter images, underwater images, and +videos. We also compile and present a list of 12 large-scale datasets suitable +for SOD that were overlooked in previous studies and compare the performance of +the reviewed studies using popular metrics such as mean Average Precision +(mAP), Frames Per Second (FPS), number of parameters, and more. Researchers can +keep track of newer studies on our web page, which is available at +\url{https://github.com/arekavandi/Transformer-SOD}. + +
+
+
+
+
+ + ♻ ☆ Practical Blind Image Denoising via Swin-Conv-UNet and Data Synthesis SC + + +
+ While recent years have witnessed a dramatic upsurge of exploiting deep
+neural networks toward solving image denoising, existing methods mostly rely on
+simple noise assumptions, such as additive white Gaussian noise (AWGN), JPEG
+compression noise and camera sensor noise, and a general-purpose blind
+denoising method for real images remains lacking. In this paper, we attempt to
+solve this problem from the perspective of network architecture design and
+training data synthesis. Specifically, for the network architecture design, we
+propose a swin-conv block to incorporate the local modeling ability of the residual
+convolutional layer and the non-local modeling ability of the swin transformer block,
+and then plug it as the main building block into the widely-used image-to-image
+translation UNet architecture. For the training data synthesis, we design a
+practical noise degradation model which takes into consideration different
+kinds of noise (including Gaussian, Poisson, speckle, JPEG compression, and
+processed camera sensor noises) and resizing, and also involves a random
+shuffle strategy and a double degradation strategy. Extensive experiments on
+AWGN removal and real image denoising demonstrate that the new network
+architecture design achieves state-of-the-art performance and the new
+degradation model can help to significantly improve the practicability. We
+believe our work can provide useful insights into current denoising research.
+ +
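+ To give a flavour of the shuffled-degradation idea, the toy pipeline below applies Gaussian noise, Poisson noise and JPEG compression to an image in a random order. The parameter choices are illustrative and far simpler than the paper's full degradation model (which also covers speckle noise, processed camera sensor noise, resizing and double degradation).
+```python
+# Toy shuffled noise-degradation pipeline (Gaussian, Poisson, JPEG), illustrative only.
+import io
+import random
+import numpy as np
+from PIL import Image
+
+def add_gaussian(img):
+    return np.clip(img + np.random.normal(0, 10, img.shape), 0, 255)
+
+def add_poisson(img):
+    return np.clip(np.random.poisson(np.maximum(img, 0)), 0, 255).astype(np.float64)
+
+def jpeg_compress(img, quality=30):
+    buf = io.BytesIO()
+    Image.fromarray(img.astype(np.uint8)).save(buf, format="JPEG", quality=quality)
+    buf.seek(0)
+    return np.asarray(Image.open(buf)).astype(np.float64)
+
+clean = np.random.rand(64, 64, 3) * 255
+degradations = [add_gaussian, add_poisson, jpeg_compress]
+random.shuffle(degradations)              # the "random shuffle" strategy, in miniature
+noisy = clean.copy()
+for op in degradations:
+    noisy = op(noisy)
+print([op.__name__ for op in degradations], noisy.shape)
+```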
+
+ comment: Codes: https://github.com/cszn/SCUNet +
+
+
+
+
+ + ♻ ☆ Self-supervised contrastive learning of echocardiogram videos enables + label-efficient cardiac disease diagnosis + + +
+ Advances in self-supervised learning (SSL) have shown that self-supervised +pretraining on medical imaging data can provide a strong initialization for +downstream supervised classification and segmentation. Given the difficulty of +obtaining expert labels for medical image recognition tasks, such an +"in-domain" SSL initialization is often desirable due to its improved label +efficiency over standard transfer learning. However, most efforts toward SSL of +medical imaging data are not adapted to video-based medical imaging modalities. +With this progress in mind, we developed a self-supervised contrastive learning +approach, EchoCLR, catered to echocardiogram videos with the goal of learning +strong representations for efficient fine-tuning on downstream cardiac disease +diagnosis. EchoCLR leverages (i) distinct videos of the same patient as +positive pairs for contrastive learning and (ii) a frame re-ordering pretext +task to enforce temporal coherence. When fine-tuned on small portions of +labeled data (as few as 51 exams), EchoCLR pretraining significantly improved +classification performance for left ventricular hypertrophy (LVH) and aortic +stenosis (AS) over other transfer learning and SSL approaches across internal +and external test sets. For example, when fine-tuning on 10% of available +training data (519 studies), an EchoCLR-pretrained model achieved 0.72 AUROC +(95% CI: [0.69, 0.75]) on LVH classification, compared to 0.61 AUROC (95% CI: +[0.57, 0.64]) with a standard transfer learning approach. Similarly, using 1% +of available training data (53 studies), EchoCLR pretraining achieved 0.82 +AUROC (95% CI: [0.79, 0.84]) on severe AS classification, compared to 0.61 +AUROC (95% CI: [0.58, 0.65]) with transfer learning. EchoCLR is unique in its +ability to learn representations of medical videos and demonstrates that SSL +can enable label-efficient disease classification from small, labeled datasets. + +
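+ The core of the contrastive objective described above can be written as an InfoNCE-style loss in which two clips from the same patient form the positive pair. The sketch below shows only that loss on random embeddings; EchoCLR's frame re-ordering pretext task and encoder are omitted.
+```python
+# InfoNCE-style contrastive loss with same-patient clips as positive pairs (sketch).
+import torch
+import torch.nn.functional as F
+
+def info_nce(z_a, z_b, temperature: float = 0.1):
+    """z_a[i] and z_b[i] are embeddings of two clips from the same patient."""
+    z_a, z_b = F.normalize(z_a, dim=1), F.normalize(z_b, dim=1)
+    logits = z_a @ z_b.T / temperature      # (N, N) similarity matrix
+    targets = torch.arange(z_a.size(0))     # positives lie on the diagonal
+    return F.cross_entropy(logits, targets)
+
+z_a = torch.randn(8, 128)   # clip-1 embeddings for 8 patients
+z_b = torch.randn(8, 128)   # clip-2 embeddings for the same 8 patients
+print(info_nce(z_a, z_b))
+```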
+
+
+
+
+ + ♻ ☆ Segment anything, from space? + + +
+ Recently, the first foundation model developed specifically for image +segmentation tasks was released, termed the "Segment Anything Model" (SAM). +SAM can segment objects in input imagery based on cheap input prompts, such as +one (or more) points, a bounding box, or a mask. The authors examined the +\textit{zero-shot} image segmentation accuracy of SAM on a large number of +vision benchmark tasks and found that SAM usually achieved recognition accuracy +similar to, or sometimes exceeding, vision models that had been trained on the +target tasks. The impressive generalization of SAM for segmentation has major +implications for vision researchers working on natural imagery. In this work, +we examine whether SAM's performance extends to overhead imagery problems, to +help guide the community's response to its development. We examine SAM's +performance on a set of diverse and widely studied benchmark tasks. We find +that SAM does often generalize well to overhead imagery, although it fails in +some cases due to the unique characteristics of overhead imagery and its common +target objects. We report on these unique systematic failure cases for remote +sensing imagery that may motivate useful future research for the community. + 
+
+
+
+
+ + ♻ ☆ Discriminative Class Tokens for Text-to-Image Diffusion Models ICCV 2023 + + +
+ Recent advances in text-to-image diffusion models have enabled the generation +of diverse and high-quality images. While impressive, the images often fall +short of depicting subtle details and are susceptible to errors due to +ambiguity in the input text. One way of alleviating these issues is to train +diffusion models on class-labeled datasets. This approach has two +disadvantages: (i) supervised datasets are generally small compared to +large-scale scraped text-image datasets on which text-to-image models are +trained, affecting the quality and diversity of the generated images, or (ii) +the input is a hard-coded label, as opposed to free-form text, limiting the +control over the generated images. + In this work, we propose a non-invasive fine-tuning technique that +capitalizes on the expressive potential of free-form text while achieving high +accuracy through discriminative signals from a pretrained classifier. This is +done by iteratively modifying the embedding of an added input token of a +text-to-image diffusion model, by steering generated images toward a given +target class according to a classifier. Our method is fast compared to prior +fine-tuning methods and does not require a collection of in-class images or +retraining of a noise-tolerant classifier. We evaluate our method extensively, +showing that the generated images are: (i) more accurate and of higher quality +than standard diffusion models, (ii) can be used to augment training data in a +low-resource setting, and (iii) reveal information about the data used to train +the guiding classifier. The code is available at +\url{https://github.com/idansc/discriminative_class_tokens}. + +
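+ A heavily simplified sketch of the token-optimization loop described above: only the added token's embedding receives gradient updates, driven by a classifier's loss on the generated image. The tiny generator and classifier below are stand-ins for a real diffusion pipeline and pretrained classifier (an assumption for illustration only, not the authors' implementation):
+
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+
+generator = nn.Sequential(nn.Linear(8, 64), nn.ReLU(), nn.Linear(64, 3 * 8 * 8))   # stand-in sampler
+classifier = nn.Sequential(nn.Flatten(), nn.Linear(3 * 8 * 8, 10))                 # stand-in classifier
+for p in list(generator.parameters()) + list(classifier.parameters()):
+    p.requires_grad_(False)                          # both networks stay frozen
+
+token_embedding = torch.zeros(8, requires_grad=True) # embedding of the single added input token
+optimizer = torch.optim.Adam([token_embedding], lr=1e-2)
+target_class = torch.tensor([3])
+
+for step in range(100):
+    image = generator(token_embedding).view(1, 3, 8, 8)      # "generate" an image conditioned on the token
+    loss = F.cross_entropy(classifier(image), target_class)  # steer generations toward the target class
+    optimizer.zero_grad()
+    loss.backward()                                  # gradients flow only into the token embedding
+    optimizer.step()
+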
+
+ comment: ICCV 2023 +
+
+
+
+
+ + ♻ ☆ Comparative Analysis of Deep Learning Architectures for Breast Cancer + Diagnosis Using the BreaKHis Dataset + + +
+ Cancer is an extremely difficult and dangerous health problem because it +manifests in so many different ways and affects so many different organs and +tissues. The primary goal of this research was to evaluate deep learning +models' ability to correctly identify breast cancer cases using the BreaKHis +dataset. The BreaKHis dataset covers a wide range of breast cancer subtypes +through its large collection of histopathological images. In this study, we +use and compare the performance of five well-known deep learning models for +cancer classification: VGG, ResNet, Xception, Inception, and InceptionResNet. +The results placed the Xception model at the top, with an F1 score of 0.9 and +an accuracy of 89%. At the same time, the Inception and InceptionResNet models +both achieved an accuracy of 87%. However, the F1 score for the Inception model was +87, while that for the InceptionResNet model was 86. These results demonstrate +the importance of deep learning methods in making correct breast cancer +diagnoses. This highlights the potential to provide improved diagnostic +services to patients. The findings of this study not only improve current +methods of cancer diagnosis, but also make significant contributions to the +creation of new and improved cancer treatment strategies. In a nutshell, the +results of this study represent a major advancement in the direction of +achieving these vital healthcare goals. + 
+
+ comment: 7 pages, 1 figure, 2 tables +
+
+
+
+
+ + ♻ ☆ Sensors and Systems for Monitoring Mental Fatigue: A systematic review + + +
+ Mental fatigue is a leading cause of motor vehicle accidents, medical errors, +loss of workplace productivity, and student disengagement in e-learning +environments. Development of sensors and systems that can reliably track mental +fatigue can prevent accidents, reduce errors, and help increase workplace +productivity. This review provides a critical summary of theoretical models of +mental fatigue, a description of key enabling sensor technologies, and a +systematic review of recent studies using biosensor-based systems for tracking +mental fatigue in humans. We conducted a systematic search and review of recent +literature which focused on detection and tracking of mental fatigue in humans. +The search yielded 57 studies (N=1082), the majority of which used +electroencephalography (EEG) based sensors for tracking mental fatigue. We +found that EEG-based sensors can provide a moderate to good sensitivity for +fatigue detection. Notably, we found no incremental benefit of using +high-density EEG sensors for application in mental fatigue detection. Given the +findings, we provide a critical discussion on the integration of wearable EEG +and ambient sensors in the context of achieving real-world monitoring. Future +work required to advance and adapt the technologies toward widespread +deployment of wearable sensors and systems for fatigue monitoring in +semi-autonomous and autonomous industries is examined. + 
+
+ comment: 19 Pages, 3 Figures +
+
+
+
+
+ + ♻ ☆ Text2Room: Extracting Textured 3D Meshes from 2D Text-to-Image Models ICCV 2023 + + +
+ We present Text2Room, a method for generating room-scale textured 3D meshes +from a given text prompt as input. To this end, we leverage pre-trained 2D +text-to-image models to synthesize a sequence of images from different poses. +In order to lift these outputs into a consistent 3D scene representation, we +combine monocular depth estimation with a text-conditioned inpainting model. +The core idea of our approach is a tailored viewpoint selection such that the +content of each image can be fused into a seamless, textured 3D mesh. More +specifically, we propose a continuous alignment strategy that iteratively fuses +scene frames with the existing geometry to create a seamless mesh. Unlike +existing works that focus on generating single objects or zoom-out trajectories +from text, our method generates complete 3D scenes with multiple objects and +explicit 3D geometry. We evaluate our approach using qualitative and +quantitative metrics, demonstrating it as the first method to generate +room-scale 3D geometry with compelling textures from only text as input. + +
+
+ comment: Accepted to ICCV 2023 (Oral) video: https://youtu.be/fjRnFL91EZc + project page: https://lukashoel.github.io/text-to-room/ code: + https://github.com/lukasHoel/text2room +
+
+
+
+
+ + ♻ ☆ E-CLIP: Towards Label-efficient Event-based Open-world Understanding by + CLIP + + +
+ Contrastive Language-Image Pre-training (CLIP) has recently shown promising +open-world and few-shot performance on 2D image-based recognition tasks. +However, the transfer capability of CLIP to novel event camera data +still remains under-explored. In particular, due to the modality gap with the +image-text data and the lack of large-scale datasets, achieving this goal is +non-trivial and thus requires significant research innovation. In this paper, +we propose E-CLIP, a novel and effective framework that unleashes the potential +of CLIP for event-based recognition to compensate for the lack of large-scale +event-based datasets. Our work addresses two crucial challenges: 1) how to +generalize CLIP's visual encoder to event data while fully leveraging events' +unique properties, e.g., sparsity and high temporal resolution; 2) how to +effectively align the multi-modal embeddings, i.e., image, text, and events. To +this end, we first introduce a novel event encoder that subtly models the +temporal information from events and meanwhile generates event prompts to +promote the modality bridging. We then design a text encoder that generates +content prompts and utilizes hybrid text prompts to enhance the E-CLIP's +generalization ability across diverse datasets. With the proposed event +encoder, text encoder, and original image encoder, a novel Hierarchical Triple +Contrastive Alignment (HTCA) module is introduced to jointly optimize the +correlation and enable efficient knowledge transfer among the three modalities. +We conduct extensive experiments on two recognition benchmarks, and the results +demonstrate that our E-CLIP outperforms existing methods by large margins of ++3.94% and +4.62% on the N-Caltech dataset in the fine-tuning +and few-shot settings, respectively. Moreover, our E-CLIP can be flexibly extended to the +event retrieval task using either text or image queries, showing plausible +performance. + 
+
+ comment: Journal version with supplementary material 
+
+
+
+
+ + ♻ ☆ Cross-Dimensional Refined Learning for Real-Time 3D Visual Perception + from Monocular Video ICCV 2023 + + +
+ We present a novel real-time capable learning method that jointly perceives a +3D scene's geometry structure and semantic labels. Recent approaches to +real-time 3D scene reconstruction mostly adopt a volumetric scheme, where a +Truncated Signed Distance Function (TSDF) is directly regressed. However, these +volumetric approaches tend to focus on the global coherence of their +reconstructions, which leads to a lack of local geometric detail. To overcome +this issue, we propose to leverage the latent geometric prior knowledge in 2D +image features by explicit depth prediction and anchored feature generation, to +refine the occupancy learning in TSDF volume. Besides, we find that this +cross-dimensional feature refinement methodology can also be adopted for the +semantic segmentation task by utilizing semantic priors. Hence, we proposed an +end-to-end cross-dimensional refinement neural network (CDRNet) to extract both +3D mesh and 3D semantic labeling in real time. The experiment results show that +this method achieves a state-of-the-art 3D perception efficiency on multiple +datasets, which indicates the great potential of our method for industrial +applications. + +
+
+ comment: Accepted to ICCV 2023 Workshops. Project page: + https://hafred.github.io/cdrnet/ 
+
+
+
+
+ + ♻ ☆ A Simple And Effective Filtering Scheme For Improving Neural Fields + + +
+ Recently, neural fields, also known as coordinate-based MLPs, have achieved +impressive results in representing low-dimensional data. Unlike CNN, MLPs are +globally connected and lack local control; adjusting a local region leads to +global changes. Therefore, improving local neural fields usually leads to a +dilemma: filtering out local artifacts can simultaneously smooth away desired +details. Our solution is a new filtering technique that consists of two +counteractive operators: a smoothing operator that provides global smoothing +for better generalization, and conversely a recovering operator that provides +better controllability for local adjustments. We have found that using either +operator alone can lead to an increase in noisy artifacts or oversmoothed +regions. By combining the two operators, smoothing and sharpening can be +adjusted to first smooth the entire region and then recover fine-grained +details in regions overly smoothed. In this way, our filter helps neural fields +remove much noise while enhancing details. We demonstrate the benefits of our +filter on various tasks and show significant improvements over state-of-the-art +methods. Moreover, our filter also provides better performance in terms of +convergence speed and network stability. + +
+
+ comment: Accepted to Computational Visual Media +
+
+
+
+
+ + ♻ ☆ Self-Reference Deep Adaptive Curve Estimation for Low-Light Image + Enhancement + + +
+ In this paper, we propose a 2-stage low-light image enhancement method called +Self-Reference Deep Adaptive Curve Estimation (Self-DACE). In the first stage, +we present an intuitive, lightweight, fast, and unsupervised luminance +enhancement algorithm. The algorithm is based on a novel low-light enhancement +curve that can be used to locally boost image brightness. We also propose a new +loss function with a simplified physical model designed to preserve natural +images' color, structure, and fidelity. We use a vanilla CNN to map each pixel +through deep Adaptive Adjustment Curves (AAC) while preserving the local image +structure. Secondly, we introduce the corresponding denoising scheme to remove +the latent noise in the darkness. We approximately model the noise in the dark +and deploy a Denoising-Net to estimate and remove the noise after the first +stage. Exhaustive qualitative and quantitative analysis shows that our method +outperforms existing state-of-the-art algorithms on multiple real-world +datasets. + +
+
+
+
+
+ + ♻ ☆ Video Action Recognition Collaborative Learning with Dynamics via + PSO-ConvNet Transformer + + +
+ Recognizing human actions in video sequences, known as Human Action +Recognition (HAR), is a challenging task in pattern recognition. While +Convolutional Neural Networks (ConvNets) have shown remarkable success in image +recognition, they are not always directly applicable to HAR, as temporal +features are critical for accurate classification. In this paper, we propose a +novel dynamic PSO-ConvNet model for learning actions in videos, building on our +recent work in image recognition. Our approach leverages a framework where the +weight vector of each neural network represents the position of a particle in +phase space, and particles share their current weight vectors and gradient +estimates of the Loss function. To extend our approach to video, we integrate +ConvNets with state-of-the-art temporal methods such as Transformer and +Recurrent Neural Networks. Our experimental results on the UCF-101 dataset +demonstrate substantial improvements of up to 9% in accuracy, which confirms +the effectiveness of our proposed method. In addition, we conducted experiments +on larger and more variety of datasets including Kinetics-400 and HMDB-51 and +obtained preference for Collaborative Learning in comparison with +Non-Collaborative Learning (Individual Learning). Overall, our dynamic +PSO-ConvNet model provides a promising direction for improving HAR by better +capturing the spatio-temporal dynamics of human actions in videos. The code is +available at +https://github.com/leonlha/Video-Action-Recognition-Collaborative-Learning-with-Dynamics-via-PSO-ConvNet-Transformer. + +
+
+ comment: 18 pages +
+
+
+
+
+ + ♻ ☆ Batch-based Model Registration for Fast 3D Sherd Reconstruction + + +
+ 3D reconstruction techniques have widely been used for digital documentation +of archaeological fragments. However, efficient digital capture of fragments +remains as a challenge. In this work, we aim to develop a portable, +high-throughput, and accurate reconstruction system for efficient digitization +of fragments excavated in archaeological sites. To realize high-throughput +digitization of large numbers of objects, an effective strategy is to perform +scanning and reconstruction in batches. However, effective batch-based scanning +and reconstruction face two key challenges: 1) how to correlate partial scans +of the same object from multiple batch scans, and 2) how to register and +reconstruct complete models from partial scans that exhibit only small +overlaps. To tackle these two challenges, we develop a new batch-based matching +algorithm that pairs the front and back sides of the fragments, and a new +Bilateral Boundary ICP algorithm that can register partial scans sharing very +narrow overlapping regions. Extensive validation in labs and testing in +excavation sites demonstrate that these designs enable efficient batch-based +scanning for fragments. We show that such a batch-based scanning and +reconstruction pipeline can have immediate applications on digitizing sherds in +archaeological excavations. Our project page: +https://jiepengwang.github.io/FIRES/. + +
+
+ comment: Project page: https://jiepengwang.github.io/FIRES/ +
+
+
+
+
+ + ♻ ☆ Perceptual Video Coding for Machines via Satisfied Machine Ratio + Modeling + + +
+ Video Coding for Machines (VCM) aims to compress visual signals for machine +analysis. However, existing methods only consider a few machines, neglecting +the majority. Moreover, the machine perceptual characteristics are not +effectively leveraged, leading to suboptimal compression efficiency. In this +paper, we introduce Satisfied Machine Ratio (SMR) to address these issues. SMR +statistically measures the quality of compressed images and videos for machines +by aggregating satisfaction scores from them. Each score is calculated based on +the difference in machine perceptions between original and compressed images. +Targeting image classification and object detection tasks, we build two +representative machine libraries for SMR annotation and construct a large-scale +SMR dataset to facilitate SMR studies. We then propose an SMR prediction model +based on the correlation between deep features differences and SMR. +Furthermore, we introduce an auxiliary task to increase the prediction accuracy +by predicting the SMR difference between two images in different quality +levels. Extensive experiments demonstrate that using the SMR models +significantly improves compression performance for VCM, and the SMR models +generalize well to unseen machines, traditional and neural codecs, and +datasets. In summary, SMR enables perceptual coding for machines and advances +VCM from specificity to generality. Code is available at +\url{https://github.com/ywwynm/SMR}. + +
+
+
+
+
+ + ♻ ☆ JNMR: Joint Non-linear Motion Regression for Video Frame Interpolation + + +
+ Video frame interpolation (VFI) aims to generate predictive frames by warping +learnable motions from the bidirectional historical references. Most existing +works utilize a spatio-temporal semantic information extractor to realize motion +estimation and interpolation modeling. However, they insufficiently consider +the real mechanistic rationality of generated middle motions. In this paper, we +reformulate VFI as a Joint Non-linear Motion Regression (JNMR) strategy to +model the complicated inter-frame motions. Specifically, the motion +trajectory between the target frame and the multiple reference frames is +regressed by a temporal concatenation of multi-stage quadratic models. ConvLSTM +is adopted to construct this joint distribution of complete motions in the temporal +dimension. Moreover, the feature learning network is designed to optimize for +the joint regression modeling. A coarse-to-fine synthesis enhancement module is +also employed to learn visual dynamics at different resolutions through +repetitive regression and interpolation. Experimental results on VFI demonstrate +the effectiveness and significant improvement of joint motion regression +compared with state-of-the-art methods. The code is available at +https://github.com/ruhig6/JNMR. + 
+
+ comment: Accepted by IEEE Transactions on Image Processing (TIP) +
+
+
+
+
+ + ♻ ☆ Estimating the Power Consumption of Heterogeneous Devices when + performing AI Inference + + +
+ Modern-day life is driven by electronic devices connected to the internet. +The emerging research field of the Internet-of-Things (IoT) has become popular, +just as there has been a steady increase in the number of connected devices. +Since many of these devices are utilised to perform CV tasks, it is essential +to understand their power consumption against performance. We report the power +consumption profile and analysis of the NVIDIA Jetson Nano board while +performing object classification. The authors present an extensive analysis +regarding power consumption per frame and the output in frames per second using +YOLOv5 models. The results show that the YOLOv5n outperforms other YOLOV5 +variants in terms of throughput (i.e. 12.34 fps) and low power consumption +(i.e. 0.154 mWh/frame). + +
+
+
+
+
+
+
+
+ + Information Retrieval 6 + +
+
+
+ + ☆ Learning Personalized User Preference from Cold Start in Multi-turn + Conversations + + +
+ This paper presents a novel teachable conversation interaction system that is +capable of learning user preferences from a cold start by gradually adapting to +personal preferences. In particular, the TAI system is able to automatically +identify and label user preferences in live interactions, manage dialogue flows +for interactive teaching sessions, and reuse learned preferences for preference +elicitation. We develop the TAI system by leveraging BERT encoder models to +encode both dialogue and relevant context information, and build action +prediction (AP), argument filling (AF) and named entity recognition (NER) +models to understand the teaching session. We adopt a seeker-provider +interaction loop mechanism to generate diverse dialogues from a cold start. TAI +is capable of learning user preferences, achieving 0.9122 turn-level +accuracy on an out-of-sample dataset, and has been successfully adopted in +production. + 
+
+ comment: preference, personalization, cold-start, dialogue, LLM. embedding +
+
+
+
+
+ + ☆ Personalized Search Via Neural Contextual Semantic Relevance Ranking + + +
+ Existing neural relevance models do not give enough consideration to query +and item context information, which diversifies the search results to adapt to +personal preferences. To bridge this gap, this paper presents a neural learning +framework to personalize document ranking results by leveraging the signals to +capture how the document fits into users' context. In particular, it models the +relationships between document content and user query context using both +lexical representations and semantic embeddings such that the user's intent can +be better understood by data enrichment of personalized query context +information. Extensive experiments performed on the search dataset demonstrate +the effectiveness of the proposed method. + 
+
+ comment: Contextual, Personalization, Search, Semantics, LLM, embedding +
+
+
+
+
+ + ☆ Duplicate Question Retrieval and Confirmation Time Prediction in + Software Communities + + +
+ Community Question Answering (CQA) in different domains is growing at a large +scale because of the availability of several platforms and huge shareable +information among users. With the rapid growth of such online platforms, a +massive amount of archived data makes it difficult for moderators to retrieve +possible duplicates for a new question and identify and confirm existing +question pairs as duplicates at the right time. This problem is even more +critical in CQAs corresponding to large software systems like askubuntu where +moderators need to be experts to comprehend something as a duplicate. Note that +the prime challenge in such CQA platforms is that the moderators are themselves +experts and are therefore usually extremely busy with their time being +extraordinarily expensive. To facilitate the task of the moderators, in this +work, we have tackled two significant issues for the askubuntu CQA platform: +(1) retrieval of duplicate questions given a new question and (2) duplicate +question confirmation time prediction. In the first task, we focus on +retrieving duplicate questions from a question pool for a particular newly +posted question. In the second task, we solve a regression problem to rank a +pair of questions that could potentially take a long time to get confirmed as +duplicates. For duplicate question retrieval, we propose a Siamese neural +network based approach by exploiting both text and network-based features, +which outperforms several state-of-the-art baseline techniques. Our method +outperforms DupPredictor and DUPE by 5% and 7% respectively. For duplicate +confirmation time prediction, we have used both the standard machine learning +models and neural network along with the text and graph-based features. We +obtain Spearman's rank correlation of 0.20 and 0.213 (statistically +significant) for text and graph based features respectively. + +
+
+ comment: Full paper accepted at ASONAM 2023: The 2023 IEEE/ACM International + Conference on Advances in Social Networks Analysis and Mining +
+
+
+
+
+ + ☆ Streamlined Data Fusion: Unleashing the Power of Linear Combination with + Minimal Relevance Judgments + + +
+ Linear combination is a potent data fusion method in information retrieval +tasks, thanks to its ability to adjust weights for diverse scenarios. However, +achieving optimal weight training has traditionally required manual relevance +judgments on a large percentage of documents, a labor-intensive and expensive +process. In this study, we investigate the feasibility of obtaining +near-optimal weights using a mere 20\%-50\% of relevant documents. Through +experiments on four TREC datasets, we find that weights trained with multiple +linear regression using this reduced set closely rival those obtained with +TREC's official "qrels." Our findings unlock the potential for more efficient +and affordable data fusion, empowering researchers and practitioners to reap +its full benefits with significantly less effort. + +
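+ The weight-training step described above can be approximated with ordinary multiple linear regression over per-system retrieval scores, fitted on the reduced set of judged documents (a sketch assuming scikit-learn; the score matrix and judgments below are illustrative, not TREC data):
+
+import numpy as np
+from sklearn.linear_model import LinearRegression
+
+# Rows are judged (query, document) pairs from the reduced 20%-50% subset;
+# columns are normalized scores from each component retrieval system.
+scores = np.array([[0.82, 0.40, 0.71],
+                   [0.10, 0.05, 0.22],
+                   [0.55, 0.61, 0.48],
+                   [0.33, 0.12, 0.09]])
+relevance = np.array([1, 0, 1, 0])                          # manual relevance judgments
+
+weights = LinearRegression().fit(scores, relevance).coef_   # per-system fusion weights
+
+def fuse(system_scores, weights):
+    """Linear-combination fusion: weighted sum of the component systems' scores."""
+    return system_scores @ weights
+
+print(fuse(np.array([0.70, 0.20, 0.90]), weights))
+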
+
+ comment: 12 pages, 8 figures +
+
+
+
+
+ + ☆ Multi-modal Extreme Classification + + +
+ This paper develops the MUFIN technique for extreme classification (XC) tasks +with millions of labels where datapoints and labels are endowed with visual and +textual descriptors. Applications of MUFIN to product-to-product recommendation +and bid query prediction over several millions of products are presented. +Contemporary multi-modal methods frequently rely on purely embedding-based +methods. On the other hand, XC methods utilize classifier architectures to +offer superior accuracies compared to embedding-only methods but mostly focus on +text-based categorization tasks. MUFIN bridges this gap by reformulating +multi-modal categorization as an XC problem with several millions of labels. +This presents the twin challenges of developing multi-modal architectures that +can offer embeddings sufficiently expressive to allow accurate categorization +over millions of labels; and training and inference routines that scale +logarithmically in the number of labels. MUFIN develops an architecture based +on cross-modal attention and trains it in a modular fashion using pre-training +and positive and negative mining. A novel product-to-product recommendation +dataset MM-AmazonTitles-300K containing over 300K products was curated from +publicly available amazon.com listings with each product endowed with a title +and multiple images. On all datasets, MUFIN offered at least 3% higher +accuracy than leading text-based, image-based and multi-modal techniques. Code +for MUFIN is available at https://github.com/Extreme-classification/MUFIN + 
+
+
+
+
+ + ☆ A multiple k-means cluster ensemble framework for clustering citation + trajectories + + +
+ Citation maturity time varies for different articles. However, the impact of +all articles is measured in a fixed window. Clustering their citation +trajectories helps understand the knowledge diffusion process and reveals that +not all articles gain immediate success after publication. Moreover, clustering +trajectories is necessary for paper impact recommendation algorithms. It is a +challenging problem because citation time series exhibit significant +variability due to non-linear and non-stationary characteristics. Prior works +propose a set of arbitrary thresholds and a fixed rule-based approach. All +methods are primarily parameter-dependent. Consequently, this leads to +inconsistencies while defining similar trajectories and ambiguities regarding +their specific number. Most studies only capture extreme trajectories. Thus, a +generalised clustering framework is required. This paper proposes a feature-based +multiple k-means cluster ensemble framework. 195,783 and 41,732 well-cited +articles from the Microsoft Academic Graph data are considered for +clustering short-term (10-year) and long-term (30-year) trajectories, +respectively. It has linear run time. Four distinct trajectories are obtained: +Early Rise Rapid Decline (2.2%), Early Rise Slow Decline (45%), Delayed Rise No +Decline (53%), and Delayed Rise Slow Decline (0.8%). Individual trajectory +differences for two different spans are studied. Most papers exhibit Early Rise +Slow Decline and Delayed Rise No Decline patterns. The growth and decay times, +cumulative citation distribution, and peak characteristics of individual +trajectories are redefined empirically. A detailed comparative study reveals +our proposed methodology can detect all distinct trajectory classes. + 
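+ One way to realize a multiple k-means cluster ensemble of the kind described is to run k-means repeatedly on trajectory features and fuse the runs through a co-association matrix followed by consensus clustering (a generic sketch assuming scikit-learn and SciPy; the features and number of runs are illustrative, not the paper's exact pipeline):
+
+import numpy as np
+from sklearn.cluster import KMeans
+from scipy.cluster.hierarchy import linkage, fcluster
+
+def ensemble_kmeans(features, n_clusters=4, n_runs=20, seed=0):
+    """Cluster ensemble: average co-association over multiple k-means runs."""
+    n = features.shape[0]
+    coassoc = np.zeros((n, n))
+    for r in range(n_runs):
+        labels = KMeans(n_clusters=n_clusters, n_init=10,
+                        random_state=seed + r).fit_predict(features)
+        coassoc += (labels[:, None] == labels[None, :])
+    coassoc /= n_runs
+    dist = 1.0 - coassoc                          # co-association similarity -> distance
+    condensed = dist[np.triu_indices(n, k=1)]     # condensed form for hierarchical linkage
+    return fcluster(linkage(condensed, method='average'),
+                    t=n_clusters, criterion='maxclust')
+
+# e.g. features = per-trajectory descriptors such as peak year, growth rate, decay rate
+features = np.random.default_rng(0).random((100, 3))
+labels = ensemble_kmeans(features)
+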
+
+ comment: 29 pages +
+
+
+
+
+
+
+
+ + Machine Learning 45 + +
+
+
+ + ☆ Learning Energy-Based Models by Cooperative Diffusion Recovery + Likelihood + + +
+ Training energy-based models (EBMs) with maximum likelihood estimation on +high-dimensional data can be both challenging and time-consuming. As a result, +there is a noticeable gap in sample quality between EBMs and other generative +frameworks like GANs and diffusion models. To close this gap, inspired by the +recent efforts of learning EBMs by maximizing diffusion recovery likelihood +(DRL), we propose cooperative diffusion recovery likelihood (CDRL), an +effective approach to tractably learn and sample from a series of EBMs defined +on increasingly noisy versions of a dataset, paired with an initializer model +for each EBM. At each noise level, the initializer model learns to amortize the +sampling process of the EBM, and the two models are jointly estimated within a +cooperative training framework. Samples from the initializer serve as starting +points that are refined by a few sampling steps from the EBM. With the refined +samples, the EBM is optimized by maximizing recovery likelihood, while the +initializer is optimized by learning from the difference between the refined +samples and the initial samples. We develop a new noise schedule and a variance +reduction technique to further improve the sample quality. Combining these +advances, we significantly boost the FID scores compared to existing EBM +methods on CIFAR-10 and ImageNet 32x32, with a 2x speedup over DRL. In +addition, we extend our method to compositional generation and image inpainting +tasks, and showcase the compatibility of CDRL with classifier-free guidance for +conditional generation, achieving similar trade-offs between sample quality and +sample diversity as in diffusion models. + 
+
+
+
+
+ + ☆ Faster, Lighter, More Accurate: A Deep Learning Ensemble for Content + Moderation ICML + + +
+ To address the increasing need for efficient and accurate content moderation, +we propose an efficient and lightweight deep classification ensemble structure. +Our approach is based on a combination of simple visual features, designed for +high-accuracy classification of violent content with low false positives. Our +ensemble architecture utilizes a set of lightweight models with narrowed-down +color features, and we apply it to both images and videos. + We evaluated our approach using a large dataset of explosion and blast +contents and compared its performance to popular deep learning models such as +ResNet-50. Our evaluation results demonstrate significant improvements in +prediction accuracy, while benefiting from 7.64x faster inference and lower +computation cost. + While our approach is tailored to explosion detection, it can be applied to +other similar content moderation and violence detection use cases as well. +Based on our experiments, we propose a "think small, think many" philosophy in +classification scenarios. We argue that transforming a single, large, +monolithic deep model into a verification-based step model ensemble of multiple +small, simple, and lightweight models with narrowed-down visual features can +possibly lead to predictions with higher accuracy. + +
+
+ comment: 6 pages, 22nd IEEE International Conference on Machine Learning and + Applications (IEEE ICMLA'23), December 15-17, 2023, Jacksonville Riverfront, + Florida, USA. arXiv admin note: substantial text overlap with + arXiv:2103.10350 +
+
+
+
+
+ + ☆ Outlier Robust Adversarial Training ACML 2023 + + +
+ Supervised learning models are challenged by the intrinsic complexities of +training data such as outliers and minority subpopulations and intentional +attacks at inference time with adversarial samples. While traditional robust +learning methods and the recent adversarial training approaches are designed to +handle each of the two challenges, to date, no work has been done to develop +models that are robust with regard to the low-quality training data and the +potential adversarial attack at inference time simultaneously. It is for this +reason that we introduce Outlier Robust Adversarial Training (ORAT) in this +work. ORAT is based on a bi-level optimization formulation of adversarial +training with a robust rank-based loss function. Theoretically, we show that +the learning objective of ORAT satisfies the $\mathcal{H}$-consistency in +binary classification, which establishes it as a proper surrogate to +adversarial 0/1 loss. Furthermore, we analyze its generalization ability and +provide uniform convergence rates in high probability. ORAT can be optimized +with a simple algorithm. Experimental evaluations on three benchmark datasets +demonstrate the effectiveness and robustness of ORAT in handling outliers and +adversarial attacks. Our code is available at +https://github.com/discovershu/ORAT. + +
+
+ comment: Accepted by The 15th Asian Conference on Machine Learning (ACML 2023) +
+
+
+
+
+ + ☆ DAD++: Improved Data-free Test Time Adversarial Defense + + +
+ With the increasing deployment of deep neural networks in safety-critical +applications such as self-driving cars, medical imaging, anomaly detection, +etc., adversarial robustness has become a crucial concern in the reliability of +these networks in real-world scenarios. A plethora of works based on +adversarial training and regularization-based techniques have been proposed to +make these deep networks robust against adversarial attacks. However, these +methods require either retraining models or training them from scratch, making +them infeasible to defend pre-trained models when access to training data is +restricted. To address this problem, we propose a test time Data-free +Adversarial Defense (DAD) containing detection and correction frameworks. +Moreover, to further improve the efficacy of the correction framework in cases +when the detector is under-confident, we propose a soft-detection scheme +(dubbed as "DAD++"). We conduct a wide range of experiments and ablations on +several datasets and network architectures to show the efficacy of our proposed +approach. Furthermore, we demonstrate the applicability of our approach in +imparting adversarial defense at test time under data-free (or data-efficient) +applications/setups, such as Data-free Knowledge Distillation and Source-free +Unsupervised Domain Adaptation, as well as Semi-supervised classification +frameworks. We observe that in all the experiments and applications, our DAD++ +gives an impressive performance against various adversarial attacks with a +minimal drop in clean accuracy. The source code is available at: +https://github.com/vcl-iisc/Improved-Data-free-Test-Time-Adversarial-Defense + +
+
+ comment: IJCV Journal (Under Review) +
+
+
+
+
+ + ☆ Signal Temporal Logic Neural Predictive Control ICRA2024 + + +
+ Ensuring safety and meeting temporal specifications are critical challenges +for long-term robotic tasks. Signal temporal logic (STL) has been widely used +to systematically and rigorously specify these requirements. However, +traditional methods of finding the control policy under those STL requirements +are computationally complex and not scalable to high-dimensional or systems +with complex nonlinear dynamics. Reinforcement learning (RL) methods can learn +the policy to satisfy the STL specifications via hand-crafted or STL-inspired +rewards, but might encounter unexpected behaviors due to ambiguity and sparsity +in the reward. In this paper, we propose a method to directly learn a neural +network controller to satisfy the requirements specified in STL. Our controller +learns to roll out trajectories to maximize the STL robustness score in +training. In testing, similar to Model Predictive Control (MPC), the learned +controller predicts a trajectory within a planning horizon to ensure the +satisfaction of the STL requirement in deployment. A backup policy is designed +to ensure safety when our controller fails. Our approach can adapt to various +initial conditions and environmental parameters. We conduct experiments on six +tasks, where our method with the backup policy outperforms the classical +methods (MPC, STL-solver), model-free and model-based RL methods in STL +satisfaction rate, especially on tasks with complex STL specifications while +being 10X-100X faster than the classical methods. + +
+
+ comment: Accepted by IEEE Robotics and Automation Letters (RA-L) and ICRA2024 +
+
+
+
+
+ + ☆ The online learning architecture with edge computing for high-level + control for assisting patients + + +
+ The prevalence of mobility impairments due to conditions such as spinal cord +injuries, strokes, and degenerative diseases is on the rise globally. +Lower-limb exoskeletons have been increasingly recognized as a viable solution +for enhancing mobility and rehabilitation for individuals with such +impairments. However, existing exoskeleton control systems often suffer from +limitations such as latency, lack of adaptability, and computational +inefficiency. To address these challenges, this paper introduces a novel online +adversarial learning architecture integrated with edge computing for high-level +lower-limb exoskeleton control. In the proposed architecture, sensor data from +the user is processed in real-time through edge computing nodes, which then +interact with an online adversarial learning model. This model adapts to the +user's specific needs and controls the exoskeleton with minimal latency. +Experimental evaluations demonstrate significant improvements in control +accuracy and adaptability, as well as enhanced quality-of-service (QoS) +metrics. These findings indicate that the integration of online adversarial +learning with edge computing offers a robust and efficient approach for the +next generation of lower-limb exoskeleton control systems. + +
+
+ comment: 10 pages +
+
+
+
+
+ + ☆ Nonlinear Granger Causality using Kernel Ridge Regression + + +
+ I introduce a novel algorithm and accompanying Python library, named +mlcausality, designed for the identification of nonlinear Granger causal +relationships. This novel algorithm uses a flexible plug-in architecture that +enables researchers to employ any nonlinear regressor as the base prediction +model. Subsequently, I conduct a comprehensive performance analysis of +mlcausality when the prediction regressor is the kernel ridge regressor with +the radial basis function kernel. The results demonstrate that mlcausality +employing kernel ridge regression achieves competitive AUC scores across a +diverse set of simulated data. Furthermore, mlcausality with kernel ridge +regression yields more finely calibrated $p$-values in comparison to rival +algorithms. This enhancement enables mlcausality to attain superior accuracy +scores when using intuitive $p$-value-based thresholding criteria. Finally, +mlcausality with the kernel ridge regression exhibits significantly reduced +computation times compared to existing nonlinear Granger causality algorithms. +In fact, in numerous instances, this innovative approach achieves superior +solutions within computational timeframes that are an order of magnitude +shorter than those required by competing algorithms. + +
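+ The underlying test can be sketched as fitting kernel ridge regressions with and without the candidate cause's lagged values and comparing out-of-sample prediction errors (a simplified illustration assuming scikit-learn and SciPy; mlcausality's actual statistic and calibration may differ):
+
+import numpy as np
+from scipy import stats
+from sklearn.kernel_ridge import KernelRidge
+
+def granger_krr(x, y, lags=2, alpha=1.0):
+    """Test whether x Granger-causes y using kernel ridge regression with an RBF kernel."""
+    n = len(y)
+    Y = np.array([y[t] for t in range(lags, n)])
+    past_y = np.array([y[t - lags:t] for t in range(lags, n)])                          # restricted model features
+    past_xy = np.array([np.r_[y[t - lags:t], x[t - lags:t]] for t in range(lags, n)])   # full model features
+
+    split = len(Y) // 2                           # fit on the first half, score on the second
+    def test_errors(X):
+        model = KernelRidge(kernel='rbf', alpha=alpha).fit(X[:split], Y[:split])
+        return Y[split:] - model.predict(X[split:])
+
+    e_restricted, e_full = test_errors(past_y), test_errors(past_xy)
+    # one-sided test: does adding x's past significantly reduce the squared errors?
+    _, p_value = stats.wilcoxon(e_restricted ** 2, e_full ** 2, alternative='greater')
+    return p_value
+
+rng = np.random.default_rng(0)
+x = rng.normal(size=500)
+y = np.r_[0.0, 0.8 * x[:-1]] + 0.1 * rng.normal(size=500)   # y is driven by lagged x
+print(granger_krr(x, y))
+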
+
+
+
+
+ + ☆ Convex Q Learning in a Stochastic Environment: Extended Version + + +
+ The paper introduces the first formulation of convex Q-learning for Markov +decision processes with function approximation. The algorithms and theory rest +on a relaxation of a dual of Manne's celebrated linear programming +characterization of optimal control. The main contributions firstly concern +properties of the relaxation, described as a deterministic convex program: we +identify conditions for a bounded solution, and a significant relationship +between the solution to the new convex program, and the solution to standard +Q-learning. The second set of contributions concern algorithm design and +analysis: (i) A direct model-free method for approximating the convex program +for Q-learning shares properties with its ideal. In particular, a bounded +solution is ensured subject to a simple property of the basis functions; (ii) +The proposed algorithms are convergent and new techniques are introduced to +obtain the rate of convergence in a mean-square sense; (iii) The approach can +be generalized to a range of performance criteria, and it is found that +variance can be reduced by considering ``relative'' dynamic programming +equations; (iv) The theory is illustrated with an application to a classical +inventory control problem. + +
+
+ comment: Extended version of "Convex Q-learning in a stochastic environment", + IEEE Conference on Decision and Control, 2023 (to appear) +
+
+
+
+
+ + ☆ Is Learning in Biological Neural Networks based on Stochastic Gradient + Descent? An analysis using stochastic processes + + +
+ In recent years, there has been an intense debate about how learning in +biological neural networks (BNNs) differs from learning in artificial neural +networks. It is often argued that the updating of connections in the brain +relies only on local information, and therefore a stochastic gradient-descent +type optimization method cannot be used. In this paper, we study a stochastic +model for supervised learning in BNNs. We show that a (continuous) gradient +step occurs approximately when each learning opportunity is processed by many +local updates. This result suggests that stochastic gradient descent may indeed +play a role in optimizing BNNs. + +
+
+
+
+
+ + ☆ Adaptive conformal classification with noisy labels + + +
+ This paper develops novel conformal prediction methods for classification +tasks that can automatically adapt to random label contamination in the +calibration sample, enabling more informative prediction sets with stronger +coverage guarantees compared to state-of-the-art approaches. This is made +possible by a precise theoretical characterization of the effective coverage +inflation (or deflation) suffered by standard conformal inferences in the +presence of label contamination, which is then made actionable through new +calibration algorithms. Our solution is flexible and can leverage different +modeling assumptions about the label contamination process, while requiring no +knowledge about the data distribution or the inner workings of the +machine-learning classifier. The advantages of the proposed methods are +demonstrated through extensive simulations and an application to object +classification with the CIFAR-10H image data set. + +
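+ For reference, the standard split-conformal classification recipe that the proposed calibration algorithms build on forms prediction sets from an empirical quantile of calibration nonconformity scores; a generic sketch assuming NumPy and softmax probabilities as input (the contamination-aware correction itself is not reproduced here):
+
+import numpy as np
+
+def conformal_sets(cal_probs, cal_labels, test_probs, alpha=0.1):
+    """Split-conformal prediction sets with nonconformity score s(x, y) = 1 - p_y(x)."""
+    n = len(cal_labels)
+    scores = 1.0 - cal_probs[np.arange(n), cal_labels]      # calibration nonconformity scores
+    k = int(np.ceil((n + 1) * (1 - alpha)))                 # finite-sample corrected rank
+    qhat = np.sort(scores)[min(k, n) - 1]
+    return test_probs >= 1.0 - qhat                         # boolean mask: labels kept in each set
+
+rng = np.random.default_rng(0)
+cal_probs = rng.dirichlet(np.ones(10), size=500)
+cal_labels = rng.integers(0, 10, size=500)
+test_probs = rng.dirichlet(np.ones(10), size=5)
+print(conformal_sets(cal_probs, cal_labels, test_probs, alpha=0.1))
+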
+
+ comment: 35 pages (98 pages including references and appendices) +
+
+
+
+
+ + ☆ A supervised generative optimization approach for tabular data + + +
+ Synthetic data generation has emerged as a crucial topic for financial +institutions, driven by multiple factors, such as privacy protection and data +augmentation. Many algorithms have been proposed for synthetic data generation +but reaching the consensus on which method we should use for the specific data +sets and use cases remains challenging. Moreover, the majority of existing +approaches are ``unsupervised'' in the sense that they do not take into account +the downstream task. To address these issues, this work presents a novel +synthetic data generation framework. The framework integrates a supervised +component tailored to the specific downstream task and employs a meta-learning +approach to learn the optimal mixture distribution of existing synthetic +distributions. + +
+
+
+
+
+ + ☆ Generalization error bounds for iterative learning algorithms with + bounded updates + + +
+ This paper explores the generalization characteristics of iterative learning +algorithms with bounded updates for non-convex loss functions, employing +information-theoretic techniques. Our key contribution is a novel bound for the +generalization error of these algorithms with bounded updates, extending beyond +the scope of previous works that only focused on Stochastic Gradient Descent +(SGD). Our approach introduces two main novelties: 1) we reformulate the mutual +information as the uncertainty of updates, providing a new perspective, and 2) +instead of using the chaining rule of mutual information, we employ a variance +decomposition technique to decompose information across iterations, allowing +for a simpler surrogate process. We analyze our generalization bound under +various settings and demonstrate improved bounds when the model dimension +increases at the same rate as the number of training data samples. To bridge +the gap between theory and practice, we also examine the previously observed +scaling behavior in large language models. Ultimately, our work takes a further +step for developing practical generalization theories. + +
+
+
+
+
+ + ☆ Spatiotemporal Graph Neural Networks with Uncertainty Quantification for + Traffic Incident Risk Prediction + + +
+ Predicting traffic incident risks at granular spatiotemporal levels is +challenging. The datasets predominantly feature zero values, indicating no +incidents, with sporadic high-risk values for severe incidents. Notably, a +majority of current models, especially deep learning methods, focus solely on +estimating risk values, overlooking the uncertainties arising from the +inherently unpredictable nature of incidents. To tackle this challenge, we +introduce the Spatiotemporal Zero-Inflated Tweedie Graph Neural Networks +(STZITD-GNNs). Our model merges the reliability of traditional statistical +models with the flexibility of graph neural networks, aiming to precisely +quantify uncertainties associated with road-level traffic incident risks. This +model strategically employs a compound model from the Tweedie family, as a +Poisson distribution to model risk frequency and a Gamma distribution to +account for incident severity. Furthermore, a zero-inflated component helps to +identify the non-incident risk scenarios. As a result, the STZITD-GNNs +effectively capture the dataset's skewed distribution, placing emphasis on +infrequent but impactful severe incidents. Empirical tests using real-world +traffic data from London, UK, demonstrate that our model excels beyond current +benchmarks. The forte of STZITD-GNN resides not only in its accuracy but also +in its adeptness at curtailing uncertainties, delivering robust predictions +over short (7 days) and extended (14 days) timeframes. + +
+
+
+
+
+ + ☆ Mutation-based Fault Localization of Deep Neural Networks + + +
+ Deep neural networks (DNNs) are susceptible to bugs, just like other types of +software systems. A significant uptick in using DNN, and its applications in +wide-ranging areas, including safety-critical systems, warrant extensive +research on software engineering tools for improving the reliability of +DNN-based systems. One such tool that has gained significant attention in the +recent years is DNN fault localization. This paper revisits mutation-based +fault localization in the context of DNN models and proposes a novel technique, +named deepmufl, applicable to a wide range of DNN models. We have implemented +deepmufl and have evaluated its effectiveness using 109 bugs obtained from +StackOverflow. Our results show that deepmufl detects 53/109 of the bugs by +ranking the buggy layer in top-1 position, outperforming state-of-the-art +static and dynamic DNN fault localization systems that are also designed to +target the class of bugs supported by deepmufl. Moreover, we observed that we +can halve the fault localization time for a pre-trained model using mutation +selection, yet losing only 7.55% of the bugs localized in top-1 position. + +
+
+ comment: 38th IEEE/ACM International Conference on Automated Software + Engineering (ASE 2023) +
+
+
+
+
+ + ☆ Federated Learning Incentive Mechanism under Buyers' Auction Market + + +
+ Auction-based Federated Learning (AFL) enables open collaboration among +self-interested data consumers and data owners. Existing AFL approaches commonly +assume a sellers' market, in which the service clients acting as +sellers are treated as scarce resources, so that the aggregation servers acting as +buyers need to compete for the bids. Yet, as the technology progresses, an +increasing number of qualified clients are now capable of performing federated +learning tasks, leading to a shift from a sellers' market to a buyers' market. In +this paper, we shift the angle by adapting the procurement auction framework, +aiming to explain the pricing behavior under a buyers' market. Our modeling +starts with the basic setting under complete information, then moves further to the +scenario where the sellers' information is not fully observable. In order to +select clients with high reliability and data quality, and to prevent +external attacks, we utilize a blockchain-based reputation mechanism. The +experimental results validate the effectiveness of our approach. + 
+
+
+
+
+ + ☆ Machine Learning for maximizing the memristivity of single and coupled + quantum memristors + + +
+ We propose machine learning (ML) methods to characterize the memristive +properties of single and coupled quantum memristors. We show that maximizing +the memristivity leads to large values in the degree of entanglement of two +quantum memristors, unveiling the close relationship between quantum +correlations and memory. Our results strengthen the possibility of using +quantum memristors as key components of neuromorphic quantum computing. + +
+
+
+
+
+ + ☆ SA-Solver: Stochastic Adams Solver for Fast Sampling of Diffusion Models + + +
+ Diffusion Probabilistic Models (DPMs) have achieved considerable success in +generation tasks. As sampling from DPMs is equivalent to solving diffusion SDE +or ODE which is time-consuming, numerous fast sampling methods built upon +improved differential equation solvers are proposed. The majority of such +techniques consider solving the diffusion ODE due to its superior efficiency. +However, stochastic sampling could offer additional advantages in generating +diverse and high-quality data. In this work, we engage in a comprehensive +analysis of stochastic sampling from two aspects: variance-controlled diffusion +SDE and linear multi-step SDE solver. Based on our analysis, we propose +SA-Solver, which is an improved efficient stochastic Adams method for solving +diffusion SDE to generate data with high quality. Our experiments show that +SA-Solver achieves: 1) improved or comparable performance compared with the +existing state-of-the-art sampling methods for few-step sampling; 2) SOTA FID +scores on substantial benchmark datasets under a suitable number of function +evaluations (NFEs). + +
+
+
+
+
+ + ☆ Linear Speedup of Incremental Aggregated Gradient Methods on Streaming + Data + + +
+ This paper considers a type of incremental aggregated gradient (IAG) method +for large-scale distributed optimization. The IAG method is well suited for the +parameter server architecture as the latter can easily aggregate potentially +staled gradients contributed by workers. Although the convergence of IAG in the +case of deterministic gradient is well known, there are only a few results for +the case of its stochastic variant based on streaming data. Considering +strongly convex optimization, this paper shows that the streaming IAG method +achieves linear speedup when the workers are updating frequently enough, even +if the data sample distribution across workers are heterogeneous. We show that +the expected squared distance to optimal solution decays at O((1+T)/(nt)), +where $n$ is the number of workers, t is the iteration number, and T/n is the +update frequency of workers. Our analysis involves careful treatments of the +conditional expectations with staled gradients and a recursive system with both +delayed and noise terms, which are new to the analysis of IAG-type algorithms. +Numerical results are presented to verify our findings. + +
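+ A toy single-machine simulation of the incremental aggregated gradient update, where only one worker refreshes its stored gradient per step and the server aggregates possibly stale gradients (NumPy; the least-squares worker objectives and step size are illustrative assumptions, not the paper's general setting):
+
+import numpy as np
+
+rng = np.random.default_rng(0)
+n_workers, dim, lr = 8, 5, 0.01
+# each worker i holds a strongly convex objective f_i(theta) = 0.5 * ||A_i theta - b_i||^2
+A = rng.normal(size=(n_workers, 20, dim))
+b = rng.normal(size=(n_workers, 20))
+
+theta = np.zeros(dim)
+stored = np.zeros((n_workers, dim))      # last gradient reported by each worker (possibly stale)
+
+for t in range(5000):
+    i = t % n_workers                    # only one worker refreshes its gradient at this step
+    stored[i] = A[i].T @ (A[i] @ theta - b[i])
+    theta -= lr * stored.sum(axis=0) / n_workers    # step along the aggregated (partly stale) gradient
+
+print(theta)
+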
+
+ comment: 8 pages, 3 figures +
+
+
+
+
+ + ☆ AVARS -- Alleviating Unexpected Urban Road Traffic Congestion using UAVs + + +
+ Reducing unexpected urban traffic congestion caused by en-route events (e.g., +road closures, car crashes, etc.) often requires fast and accurate reactions to +choose the best-fit traffic signals. Traditional traffic light control systems, +such as SCATS and SCOOT, are not efficient as their traffic data provided by +induction loops has a low update frequency (i.e., longer than 1 minute). +Moreover, the traffic light signal plans used by these systems are selected +from a limited set of candidate plans pre-programmed prior to unexpected +events' occurrence. Recent research demonstrates that camera-based traffic +light systems controlled by deep reinforcement learning (DRL) algorithms are +more effective in reducing traffic congestion, in which the cameras can provide +high-frequency high-resolution traffic data. However, these systems are costly +to deploy in big cities due to the excessive potential upgrades required to +road infrastructure. In this paper, we argue that Unmanned Aerial Vehicles +(UAVs) can play a crucial role in dealing with unexpected traffic congestion +because UAVs with onboard cameras can be economically deployed when and where +unexpected congestion occurs. Then, we propose a system called "AVARS" that +explores the potential of using UAVs to reduce unexpected urban traffic +congestion using DRL-based traffic light signal control. This approach is +validated on a widely used open-source traffic simulator with practical UAV +settings, including its traffic monitoring ranges and battery lifetime. Our +simulation results show that AVARS can effectively recover the unexpected +traffic congestion in Dublin, Ireland, back to its original un-congested level +within the typical battery life duration of a UAV. + +
+
+
+
+
+ + ☆ Continual Robot Learning using Self-Supervised Task Inference + + +
+ Endowing robots with the human ability to learn a growing set of skills over +the course of a lifetime as opposed to mastering single tasks is an open +problem in robot learning. While multi-task learning approaches have been +proposed to address this problem, they pay little attention to task inference. +In order to continually learn new tasks, the robot first needs to infer the +task at hand without requiring predefined task representations. In this paper, +we propose a self-supervised task inference approach. Our approach learns +action and intention embeddings from self-organization of the observed movement +and effect parts of unlabeled demonstrations and a higher-level behavior +embedding from self-organization of the joint action-intention embeddings. We +construct a behavior-matching self-supervised learning objective to train a +novel Task Inference Network (TINet) to map an unlabeled demonstration to its +nearest behavior embedding, which we use as the task representation. A +multi-task policy is built on top of the TINet and trained with reinforcement +learning to optimize performance over tasks. We evaluate our approach in the +fixed-set and continual multi-task learning settings with a humanoid robot and +compare it to different multi-task learning baselines. The results show that +our approach outperforms the other baselines, with the difference being more +pronounced in the challenging continual learning setting, and can infer tasks +from incomplete demonstrations. Our approach is also shown to generalize to +unseen tasks based on a single demonstration in one-shot task generalization +experiments. + +
+
+ comment: Accepted for publication in IEEE Transactions on Cognitive and + Developmental Systems +
+
+
+
+
+ + ☆ LMBiS-Net: A Lightweight Multipath Bidirectional Skip Connection based + CNN for Retinal Blood Vessel Segmentation + + +
+ Blinding eye diseases are often correlated with altered retinal morphology,
+which can be clinically identified by segmenting retinal structures in fundus
+images. However, current methodologies often fall short in accurately
+segmenting delicate vessels. Although deep learning has shown promise in
+medical image segmentation, its reliance on repeated convolution and pooling
+operations can hinder the representation of edge information, ultimately
+limiting overall segmentation accuracy. In this paper, we propose a lightweight
+pixel-level CNN named LMBiS-Net for the segmentation of retinal vessels with an
+exceptionally low number of learnable parameters \textbf{(only 0.172 M)}. The
+network uses multipath feature extraction blocks and incorporates bidirectional
+skip connections for the information flow between the encoder and decoder.
+Additionally, we have optimized the efficiency of the model by carefully
+selecting the number of filters to avoid filter overlap. This optimization
+significantly reduces training time and enhances computational efficiency. To
+assess the robustness and generalizability of LMBiS-Net, we performed
+comprehensive evaluations on various aspects of retinal images. Specifically,
+the model was subjected to rigorous tests to accurately segment retinal
+vessels, which play a vital role in ophthalmological diagnosis and treatment.
+By focusing on the retinal blood vessels, we were able to thoroughly analyze
+the performance and effectiveness of the LMBiS-Net model. The results of our
+tests demonstrate that LMBiS-Net is not only robust and generalizable but also
+capable of maintaining high levels of segmentation accuracy. These
+characteristics highlight the potential of LMBiS-Net as an efficient tool for
+high-speed and accurate segmentation of retinal images in various clinical
+applications.
+
+
+
+
+
+ + ☆ A multiple k-means cluster ensemble framework for clustering citation + trajectories + + +
+ Citation maturity time varies for different articles. However, the impact of
+all articles is measured in a fixed window. Clustering their citation
+trajectories helps understand the knowledge diffusion process and reveals that
+not all articles gain immediate success after publication. Moreover, clustering
+trajectories is necessary for paper impact recommendation algorithms. It is a
+challenging problem because citation time series exhibit significant
+variability due to non-linear and non-stationary characteristics. Prior works
+propose a set of arbitrary thresholds and a fixed rule-based approach. All
+methods are primarily parameter-dependent. Consequently, this leads to
+inconsistencies while defining similar trajectories and ambiguities regarding
+their specific number. Most studies only capture extreme trajectories. Thus, a
+generalised clustering framework is required. This paper proposes a
+feature-based multiple k-means cluster ensemble framework. 195,783 and 41,732
+well-cited articles from the Microsoft Academic Graph data are considered for
+clustering short-term (10-year) and long-term (30-year) trajectories,
+respectively. It has linear run time. Four distinct trajectories are obtained:
+Early Rise Rapid Decline (2.2%), Early Rise Slow Decline (45%), Delayed Rise No
+Decline (53%), and Delayed Rise Slow Decline (0.8%). Individual trajectory
+differences for two different spans are studied. Most papers exhibit Early Rise
+Slow Decline and Delayed Rise No Decline patterns. The growth and decay times,
+cumulative citation distribution, and peak characteristics of individual
+trajectories are redefined empirically. A detailed comparative study reveals
+our proposed methodology can detect all distinct trajectory classes.
+
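As a concrete illustration of the cluster-ensemble idea described above, the following sketch repeats k-means, accumulates a co-association matrix, and extracts a consensus partition. It is a generic prototype under stated assumptions, not the paper's pipeline: the trajectory features are random placeholders and k=4 is only borrowed from the reported number of trajectory classes.

```python
# Hedged sketch of a multiple k-means cluster ensemble via a co-association matrix.
import numpy as np
from sklearn.cluster import KMeans, AgglomerativeClustering  # scikit-learn >= 1.2

def cluster_ensemble(features, n_runs=20, k=4, seed=0):
    n = len(features)
    co_assoc = np.zeros((n, n))
    rng = np.random.RandomState(seed)
    for _ in range(n_runs):
        labels = KMeans(n_clusters=k, n_init=10,
                        random_state=rng.randint(1 << 30)).fit_predict(features)
        # Count how often each pair of articles lands in the same cluster.
        co_assoc += (labels[:, None] == labels[None, :]).astype(float)
    co_assoc /= n_runs
    # Consensus step: cluster the items using 1 - co-association as a distance.
    consensus = AgglomerativeClustering(n_clusters=k, metric="precomputed",
                                        linkage="average")
    return consensus.fit_predict(1.0 - co_assoc)

X = np.random.rand(100, 5)            # 100 articles, 5 toy trajectory features
print(np.bincount(cluster_ensemble(X)))
```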
+
+ comment: 29 pages +
+
+
+
+
+ + ☆ Distance-Restricted Folklore Weisfeiler-Leman GNNs with Provable Cycle + Counting Power + + +
+ The ability of graph neural networks (GNNs) to count certain graph +substructures, especially cycles, is important for the success of GNNs on a +wide range of tasks. It has been recently used as a popular metric for +evaluating the expressive power of GNNs. Many of the proposed GNN models with +provable cycle counting power are based on subgraph GNNs, i.e., extracting a +bag of subgraphs from the input graph, generating representations for each +subgraph, and using them to augment the representation of the input graph. +However, those methods require heavy preprocessing, and suffer from high time +and memory costs. In this paper, we overcome the aforementioned limitations of +subgraph GNNs by proposing a novel class of GNNs -- $d$-Distance-Restricted +FWL(2) GNNs, or $d$-DRFWL(2) GNNs. $d$-DRFWL(2) GNNs use node pairs whose +mutual distances are at most $d$ as the units for message passing to balance +the expressive power and complexity. By performing message passing among +distance-restricted node pairs in the original graph, $d$-DRFWL(2) GNNs avoid +the expensive subgraph extraction operations in subgraph GNNs, making both the +time and space complexity lower. We theoretically show that the discriminative +power of $d$-DRFWL(2) GNNs strictly increases as $d$ increases. More +importantly, $d$-DRFWL(2) GNNs have provably strong cycle counting power even +with $d=2$: they can count all 3, 4, 5, 6-cycles. Since 6-cycles (e.g., benzene +rings) are ubiquitous in organic molecules, being able to detect and count them +is crucial for achieving robust and generalizable performance on molecular +tasks. Experiments on both synthetic datasets and molecular datasets verify our +theory. To the best of our knowledge, our model is the most efficient GNN model +to date (both theoretically and empirically) that can count up to 6-cycles. + +
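The pair-selection step implied by the abstract can be sketched directly: only node pairs within shortest-path distance d are kept as message-passing units. The snippet below shows just this preprocessing idea (using networkx), not the authors' GNN implementation.

```python
# Enumerate node pairs whose mutual distance is at most d; these would serve as
# the message-passing units instead of all O(n^2) pairs.
import networkx as nx

def distance_restricted_pairs(G, d=2):
    pairs = []
    for u, lengths in nx.all_pairs_shortest_path_length(G, cutoff=d):
        for v, dist in lengths.items():
            if u < v:                      # keep each unordered pair once
                pairs.append((u, v, dist))
    return pairs

G = nx.cycle_graph(6)                      # e.g. the carbon skeleton of a benzene ring
# 12 of the 15 possible pairs survive here; on large sparse graphs the pruning
# removes the vast majority of pairs.
print(distance_restricted_pairs(G, d=2))
```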
+
+
+
+
+ + ☆ A Review of Machine Learning-based Security in Cloud Computing + + +
+ Cloud Computing (CC) is revolutionizing the way IT resources are delivered to +users, allowing them to access and manage their systems with increased +cost-effectiveness and simplified infrastructure. However, with the growth of +CC comes a host of security risks, including threats to availability, +integrity, and confidentiality. To address these challenges, Machine Learning +(ML) is increasingly being used by Cloud Service Providers (CSPs) to reduce the +need for human intervention in identifying and resolving security issues. With +the ability to analyze vast amounts of data, and make high-accuracy +predictions, ML can transform the way CSPs approach security. In this paper, we +will explore some of the most recent research in the field of ML-based security +in Cloud Computing. We will examine the features and effectiveness of a range +of ML algorithms, highlighting their unique strengths and potential +limitations. Our goal is to provide a comprehensive overview of the current +state of ML in cloud security and to shed light on the exciting possibilities +that this emerging field has to offer. + +
+
+ comment: This work has been submitted to the IEEE for possible publication. + Copyright may be transferred without notice, after which this version may no + longer be accessible +
+
+
+
+
+ + ♻ ☆ Massively Scalable Inverse Reinforcement Learning in Google Maps + + +
+ Optimizing for humans' latent preferences remains a grand challenge in route +recommendation. Prior research has provided increasingly general techniques +based on inverse reinforcement learning (IRL), yet no approach has been +successfully scaled to world-sized routing problems with hundreds of millions +of states and demonstration trajectories. In this paper, we provide methods for +scaling IRL using graph compression, spatial parallelization, and problem +initialization based on dominant eigenvectors. We revisit classic algorithms +and study them in a large-scale setting, and make the key observation that +there exists a trade-off between the use of cheap, deterministic planners and +expensive yet robust stochastic policies. We leverage this insight in Receding +Horizon Inverse Planning (RHIP), a new generalization of classic IRL algorithms +that provides fine-grained control over performance trade-offs via its planning +horizon. Our contributions culminate in a policy that achieves a 16-24% +improvement in global route quality, and to the best of our knowledge, +represents the largest instance of IRL in a real-world setting to date. +Benchmark results show critical benefits to more sustainable modes of +transportation, where factors beyond journey time play a substantial role. We +conclude by conducting an ablation study of key components, presenting negative +results from alternative eigenvalue solvers, and identifying opportunities to +further improve scalability via IRL-specific batching strategies. + +
+
+
+
+
+ + ♻ ☆ Improved Aircraft Environmental Impact Segmentation via Metric Learning + + +
+ Accurate modeling of aircraft environmental impact is pivotal to the design
+of operational procedures and policies to mitigate negative aviation
+environmental impact. Aircraft environmental impact segmentation is a process
+which clusters aircraft types that have similar environmental impact
+characteristics based on a set of aircraft features. This practice helps model
+a large population of aircraft types with insufficient aircraft noise and
+performance models and contributes to better understanding of aviation
+environmental impact. Because it measures the similarity between aircraft
+types, the distance metric is the kernel of aircraft segmentation. Traditional
+ways of aircraft segmentation use plain distance metrics and assign equal
+weight to all features in an unsupervised clustering process. In this work, we
+utilize weakly-supervised metric learning and partial information on aircraft
+fuel burn, emissions, and noise to learn weighted distance metrics for aircraft
+environmental impact segmentation. We show in a comprehensive case study that
+the tailored distance metrics can indeed make aircraft segmentation better
+reflect the actual environmental impact of aircraft. The metric learning
+approach can help refine a number of similar data-driven analytical studies in
+aviation.
+
+
+ comment: 32 pages, 11 figures +
+
+
+
+
+ + ♻ ☆ Deep Metric Learning for the Hemodynamics Inference with + Electrocardiogram Signals + + +
+ Heart failure is a debilitating condition that affects millions of people +worldwide and has a significant impact on their quality of life and mortality +rates. An objective assessment of cardiac pressures remains an important method +for the diagnosis and treatment prognostication for patients with heart +failure. Although cardiac catheterization is the gold standard for estimating +central hemodynamic pressures, it is an invasive procedure that carries +inherent risks, making it a potentially dangerous procedure for some patients. +Approaches that leverage non-invasive signals - such as electrocardiogram (ECG) +- have the promise to make the routine estimation of cardiac pressures feasible +in both inpatient and outpatient settings. Prior models trained to estimate +intracardiac pressures (e.g., mean pulmonary capillary wedge pressure (mPCWP)) +in a supervised fashion have shown good discriminatory ability but have been +limited to the labeled dataset from the heart failure cohort. To address this +issue and build a robust representation, we apply deep metric learning (DML) +and propose a novel self-supervised DML with distance-based mining that +improves the performance of a model with limited labels. We use a dataset that +contains over 5.4 million ECGs without concomitant central pressure labels to +pre-train a self-supervised DML model which showed improved classification of +elevated mPCWP compared to self-supervised contrastive baselines. Additionally, +the supervised DML model that uses ECGs with access to 8,172 mPCWP labels +demonstrated significantly better performance on the mPCWP regression task +compared to the supervised baseline. Moreover, our data suggest that DML yields +models that are performant across patient subgroups, even when some patient +subgroups are under-represented in the dataset. Our code is available at +https://github.com/mandiehyewon/ssldml + +
+
+
+
+
+ + ♻ ☆ ResNet After All? Neural ODEs and Their Numerical Solution + + +
+ A key appeal of the recently proposed Neural Ordinary Differential Equation
+(ODE) framework is that it seems to provide a continuous-time extension of
+discrete residual neural networks. As we show herein, though, trained Neural
+ODE models actually depend on the specific numerical method used during
+training. If the trained model is supposed to be a flow generated from an ODE,
+it should be possible to choose another numerical solver with equal or smaller
+numerical error without loss of performance. We observe that if training relies
+on a solver with overly coarse discretization, then testing with another solver
+of equal or smaller numerical error results in a sharp drop in accuracy. In
+such cases, the combination of vector field and numerical method cannot be
+interpreted as a flow generated from an ODE, which arguably poses a fatal
+breakdown of the Neural ODE concept. We observe, however, that there exists a
+critical step size beyond which the training yields a valid ODE vector field.
+We propose a method that monitors the behavior of the ODE solver during
+training to adapt its step size, aiming to ensure a valid ODE without
+unnecessarily increasing computational cost. We verify this adaptation
+algorithm on a common benchmark dataset as well as a synthetic dataset.
+
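The solver-dependence issue can be illustrated numerically: if a vector field really defines an ODE flow, integrating it with different solvers of comparable or smaller error should give (almost) the same answer. The toy check below uses a hand-written harmonic oscillator as a stand-in for a trained Neural ODE; it is only meant to convey the consistency test, not the paper's adaptation algorithm.

```python
# Compare a coarse explicit Euler integration against a fine RK4 integration
# of the same vector field; a large gap signals solver-dependent behaviour.
import numpy as np

def f(t, x):
    return np.array([x[1], -x[0]])       # toy stand-in vector field

def integrate(f, x0, t1, h, method="euler"):
    x, t = np.array(x0, dtype=float), 0.0
    while t < t1 - 1e-12:
        h_eff = min(h, t1 - t)
        if method == "euler":
            x = x + h_eff * f(t, x)
        else:                             # classic fourth-order Runge-Kutta
            k1 = f(t, x)
            k2 = f(t + h_eff / 2, x + h_eff / 2 * k1)
            k3 = f(t + h_eff / 2, x + h_eff / 2 * k2)
            k4 = f(t + h_eff, x + h_eff * k3)
            x = x + h_eff / 6 * (k1 + 2 * k2 + 2 * k3 + k4)
        t += h_eff
    return x

coarse = integrate(f, [1.0, 0.0], 2 * np.pi, h=0.5, method="euler")
fine = integrate(f, [1.0, 0.0], 2 * np.pi, h=0.01, method="rk4")
print("discrepancy:", np.linalg.norm(coarse - fine))
```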
+
+
+
+
+ + ♻ ☆ Discover, Explanation, Improvement: An Automatic Slice Detection + Framework for Natural Language Processing + + +
+ Pretrained natural language processing (NLP) models have achieved high
+overall performance, but they still make systematic errors. Instead of manual
+error analysis, research on slice detection models (SDM), which automatically
+identify underperforming groups of datapoints, has attracted growing attention
+in Computer Vision for both understanding model behaviors and informing future
+model training and design. However, little research on SDMs, or quantitative
+evaluation of their effectiveness, has been conducted on NLP tasks. Our paper
+fills this gap by proposing a benchmark named "Discover, Explain, Improve
+(DEIM)" for classification NLP tasks along with a new SDM, Edisa. Edisa
+discovers coherent and underperforming groups of datapoints; DEIM then unites
+them under human-understandable concepts and provides comprehensive evaluation
+tasks and corresponding quantitative metrics. The evaluation in DEIM shows that
+Edisa can accurately select error-prone datapoints with informative semantic
+features that summarize error patterns. Detecting difficult datapoints directly
+boosts model performance without tuning any original model parameters, showing
+that discovered slices are actionable for users.
+
+
+ comment: 15 pages, 5 figures, accepted by Transactions of the Association for + Computational Linguistics +
+
+
+
+
+ + ♻ ☆ Domain-adapted Learning and Imitation: DRL for Power Arbitrage + + +
+ In this paper, we discuss the Dutch power market, which is comprised of a +day-ahead market and an intraday balancing market that operates like an +auction. Due to fluctuations in power supply and demand, there is often an +imbalance that leads to different prices in the two markets, providing an +opportunity for arbitrage. To address this issue, we restructure the problem +and propose a collaborative dual-agent reinforcement learning approach for this +bi-level simulation and optimization of European power arbitrage trading. We +also introduce two new implementations designed to incorporate domain-specific +knowledge by imitating the trading behaviours of power traders. By utilizing +reward engineering to imitate domain expertise, we are able to reform the +reward system for the RL agent, which improves convergence during training and +enhances overall performance. Additionally, the tranching of orders increases +bidding success rates and significantly boosts profit and loss (P&L). Our study +demonstrates that by leveraging domain expertise in a general learning problem, +the performance can be improved substantially, and the final integrated +approach leads to a three-fold improvement in cumulative P&L compared to the +original agent. Furthermore, our methodology outperforms the highest benchmark +policy by around 50% while maintaining efficient computational performance. + +
+
+
+
+
+ + ♻ ☆ Bayesian Numerical Integration with Neural Networks + + +
+ Bayesian probabilistic numerical methods for numerical integration offer +significant advantages over their non-Bayesian counterparts: they can encode +prior information about the integrand, and can quantify uncertainty over +estimates of an integral. However, the most popular algorithm in this class, +Bayesian quadrature, is based on Gaussian process models and is therefore +associated with a high computational cost. To improve scalability, we propose +an alternative approach based on Bayesian neural networks which we call +Bayesian Stein networks. The key ingredients are a neural network architecture +based on Stein operators, and an approximation of the Bayesian posterior based +on the Laplace approximation. We show that this leads to orders of magnitude +speed-ups on the popular Genz functions benchmark, and on challenging problems +arising in the Bayesian analysis of dynamical systems, and the prediction of +energy production for a large-scale wind farm. + +
+
+
+
+
+ + ♻ ☆ Attacking c-MARL More Effectively: A Data Driven Approach + + +
+ In recent years, a proliferation of methods has been developed for
+cooperative multi-agent reinforcement learning (c-MARL). However, the
+robustness of c-MARL agents against adversarial attacks has been rarely
+explored. In this paper, we propose to evaluate the robustness of c-MARL
+agents via a model-based approach, named c-MBA. Our proposed formulation can
+craft much stronger adversarial state perturbations of c-MARL agents to lower
+total team rewards than existing model-free approaches. In addition, we
+propose the first victim-agent selection strategy and the first data-driven
+approach to defining targeted failure states, each of which allows us to
+develop even stronger adversarial attacks without expert knowledge of the
+underlying environment. Our numerical experiments on two representative MARL
+benchmarks illustrate the advantage of our approach over other baselines: our
+model-based attack consistently outperforms other baselines in all tested
+environments.
+
+
+
+
+
+ + ♻ ☆ Efficient Generator of Mathematical Expressions for Symbolic Regression ECML + + +
+ We propose an approach to symbolic regression based on a novel variational
+autoencoder for generating hierarchical structures, HVAE. It combines simple
+atomic units with shared weights to recursively encode and decode the
+individual nodes in the hierarchy. Encoding is performed bottom-up and decoding
+top-down. We empirically show that HVAE can be trained efficiently with small
+corpora of mathematical expressions and can accurately encode expressions into
+a smooth low-dimensional latent space. The latter can be efficiently explored
+with various optimization methods to address the task of symbolic regression.
+Indeed, random search through the latent space of HVAE performs better than
+random search through expressions generated by manually crafted probabilistic
+grammars for mathematical expressions. Finally, the EDHiE system for symbolic
+regression, which applies an evolutionary algorithm to the latent space of
+HVAE, reconstructs equations from a standard symbolic regression benchmark
+better than a state-of-the-art system based on a similar combination of deep
+learning and evolutionary algorithms.
+
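The "random search through the latent space" step can be sketched generically. In the snippet below the decoder and the scoring function are toy stand-ins (a linear expression template scored by RMSE), not the trained HVAE components; only the search pattern itself is illustrated.

```python
# Hedged sketch of random search through a learned latent space.
import numpy as np

def random_latent_search(decoder, score_fn, latent_dim, n_samples=500, seed=0):
    rng = np.random.default_rng(seed)
    best_expr, best_score = None, np.inf
    for _ in range(n_samples):
        z = rng.normal(size=latent_dim)     # sample a latent point
        expr = decoder(z)                   # decode it into an expression
        score = score_fn(expr)              # evaluate the expression against data
        if score < best_score:
            best_expr, best_score = expr, score
    return best_expr, best_score

decoder = lambda z: f"{z[0]:.2f} * x + {z[1]:.2f}"          # toy "decoder"
xs = np.linspace(-1.0, 1.0, 50)
ys = 2.0 * xs + 1.0                                          # target: y = 2x + 1
score_fn = lambda e: float(np.sqrt(np.mean((eval(e, {"x": xs}) - ys) ** 2)))
print(random_latent_search(decoder, score_fn, latent_dim=2))
```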
+
+ comment: 35 pages, 11 tables, 7 multi-part figures, Machine learning + (Springer) and journal track of ECML/PKDD 2023 +
+
+
+
+
+ + ♻ ☆ An Effective Transformer-based Contextual Model and Temporal Gate + Pooling for Speaker Identification + + +
+ Wav2vec2 has achieved success in applying Transformer architecture and
+self-supervised learning to speech recognition. Recently, these have come to be
+used not only for speech recognition but also across speech processing more
+broadly. This paper introduces an effective end-to-end speaker identification
+model that applies a Transformer-based contextual model. We explored the
+relationship between the hyper-parameters and the performance in order to
+discern the structure of an effective model. Furthermore, we propose a pooling
+method, Temporal Gate Pooling, with powerful learning ability for speaker
+identification. We applied a Conformer as the encoder and BEST-RQ for
+pre-training and conducted an evaluation utilizing the speaker identification
+task of VoxCeleb1. The proposed method has achieved an accuracy of 87.1% with
+28.5M parameters, demonstrating comparable precision to wav2vec2 with 317.7M
+parameters. Code is
+available at https://github.com/HarunoriKawano/speaker-identification-with-tgp.
+
+
+ comment: 5 pages, 3 figures +
+
+
+
+
+ + ♻ ☆ Comparative Analysis of Deep Learning Architectures for Breast Cancer + Diagnosis Using the BreaKHis Dataset + + +
+ Cancer is an extremely difficult and dangerous health problem because it
+manifests in so many different ways and affects so many different organs and
+tissues. The primary goal of this research was to evaluate deep learning
+models' ability to correctly identify breast cancer cases using the BreakHis
+dataset. The BreakHis dataset covers a wide range of breast cancer subtypes
+through its huge collection of histopathological images. In this study, we
+use and compare the performance of five well-known deep learning models for
+cancer classification: VGG, ResNet, Xception, Inception, and InceptionResNet.
+The results placed the Xception model at the top, with an F1 score of 0.9 and
+an accuracy of 89%. The Inception and InceptionResNet models both achieved an
+accuracy of 87%, while the F1 scores for the Inception and InceptionResNet
+models were 87 and 86, respectively. These results demonstrate the importance
+of deep learning methods in making correct breast cancer diagnoses. This
+highlights the potential to provide improved diagnostic services to patients.
+The findings of this study not only improve current methods of cancer
+diagnosis, but also make significant contributions to the creation of new and
+improved cancer treatment strategies. In a nutshell, the results of this study
+represent a major advancement in the direction of achieving these vital
+healthcare goals.
+
+
+ comment: 7 pages, 1 figure, 2 tables +
+
+
+
+
+ + ♻ ☆ Provably Convergent Schrödinger Bridge with Applications to + Probabilistic Time Series Imputation ICML 2023 + + +
+ The Schr\"odinger bridge problem (SBP) is gaining increasing attention in +generative modeling and showing promising potential even in comparison with the +score-based generative models (SGMs). SBP can be interpreted as an +entropy-regularized optimal transport problem, which conducts projections onto +every other marginal alternatingly. However, in practice, only approximated +projections are accessible and their convergence is not well understood. To +fill this gap, we present a first convergence analysis of the Schr\"odinger +bridge algorithm based on approximated projections. As for its practical +applications, we apply SBP to probabilistic time series imputation by +generating missing values conditioned on observed data. We show that optimizing +the transport cost improves the performance and the proposed algorithm achieves +the state-of-the-art result in healthcare and environmental data while +exhibiting the advantage of exploring both temporal and feature patterns in +probabilistic time series imputation. + +
+
+ comment: Accepted by ICML 2023 +
+
+
+
+
+ + ♻ ☆ Variational Hierarchical Mixtures for Probabilistic Learning of Inverse + Dynamics + + +
+ Well-calibrated probabilistic regression models are a crucial learning +component in robotics applications as datasets grow rapidly and tasks become +more complex. Unfortunately, classical regression models are usually either +probabilistic kernel machines with a flexible structure that does not scale +gracefully with data or deterministic and vastly scalable automata, albeit with +a restrictive parametric form and poor regularization. In this paper, we +consider a probabilistic hierarchical modeling paradigm that combines the +benefits of both worlds to deliver computationally efficient representations +with inherent complexity regularization. The presented approaches are +probabilistic interpretations of local regression techniques that approximate +nonlinear functions through a set of local linear or polynomial units. +Importantly, we rely on principles from Bayesian nonparametrics to formulate +flexible models that adapt their complexity to the data and can potentially +encompass an infinite number of components. We derive two efficient variational +inference techniques to learn these representations and highlight the +advantages of hierarchical infinite local regression models, such as dealing +with non-smooth functions, mitigating catastrophic forgetting, and enabling +parameter sharing and fast predictions. Finally, we validate this approach on +large inverse dynamics datasets and test the learned models in real-world +control scenarios. + +
+
+ comment: arXiv admin note: text overlap with arXiv:2011.05217 +
+
+
+
+
+ + ♻ ☆ Deep incremental learning models for financial temporal tabular datasets + with distribution shifts + + +
+ We present a robust deep incremental learning framework for regression tasks +on financial temporal tabular datasets which is built upon the incremental use +of commonly available tabular and time series prediction models to adapt to +distributional shifts typical of financial datasets. The framework uses a +simple basic building block (decision trees) to build self-similar models of +any required complexity to deliver robust performance under adverse situations +such as regime changes, fat-tailed distributions, and low signal-to-noise +ratios. As a detailed study, we demonstrate our scheme using XGBoost models +trained on the Numerai dataset and show that a two layer deep ensemble of +XGBoost models over different model snapshots delivers high quality predictions +under different market regimes. We also show that the performance of XGBoost +models with different number of boosting rounds in three scenarios (small, +standard and large) is monotonically increasing with respect to model size and +converges towards the generalisation upper bound. We also evaluate the +robustness of the model under variability of different hyperparameters, such as +model complexity and data sampling settings. Our model has low hardware +requirements as no specialised neural architectures are used and each base +model can be independently trained in parallel. + +
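The two-layer ensemble structure described above can be prototyped with generic tools. The sketch below trains several base gradient-boosted models on consecutive slices of a time-ordered dataset and averages their predictions; scikit-learn's GradientBoostingRegressor stands in for XGBoost, and the synthetic data replaces the Numerai features, so treat this as an illustration of the layering pattern only.

```python
# Rough sketch of a two-layer ensemble of gradient-boosted trees.
import numpy as np
from sklearn.ensemble import GradientBoostingRegressor

rng = np.random.default_rng(0)
X = rng.normal(size=(2000, 10))
y = 0.5 * X[:, 0] + np.sin(X[:, 1]) + rng.normal(scale=0.1, size=2000)

# Layer 1: independent base models on consecutive "eras" of the data.
base_models = []
for idx in np.array_split(np.arange(len(X)), 4):
    model = GradientBoostingRegressor(n_estimators=100, random_state=0)
    base_models.append(model.fit(X[idx], y[idx]))

# Layer 2: equal-weight aggregation of the snapshot predictions.
def ensemble_predict(X_new):
    return np.mean([m.predict(X_new) for m in base_models], axis=0)

print(ensemble_predict(X[:3]))
```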
+
+
+
+
+ + ♻ ☆ QNNRepair: Quantized Neural Network Repair + + +
+ We present QNNRepair, the first method in the literature for repairing +quantized neural networks (QNNs). QNNRepair aims to improve the accuracy of a +neural network model after quantization. It accepts the full-precision and +weight-quantized neural networks and a repair dataset of passing and failing +tests. At first, QNNRepair applies a software fault localization method to +identify the neurons that cause performance degradation during neural network +quantization. Then, it formulates the repair problem into a linear programming +problem of solving neuron weights parameters, which corrects the QNN's +performance on failing tests while not compromising its performance on passing +tests. We evaluate QNNRepair with widely used neural network architectures such +as MobileNetV2, ResNet, and VGGNet on popular datasets, including +high-resolution images. We also compare QNNRepair with the state-of-the-art +data-free quantization method SQuant. According to the experiment results, we +conclude that QNNRepair is effective in improving the quantized model's +performance in most cases. Its repaired models have 24% higher accuracy than +SQuant's in the independent validation set, especially for the ImageNet +dataset. + +
+
+
+
+
+ + ♻ ☆ Sample-efficient Real-time Planning with Curiosity Cross-Entropy Method + and Contrastive Learning + + +
+ Model-based reinforcement learning (MBRL) with real-time planning has shown +great potential in locomotion and manipulation control tasks. However, the +existing planning methods, such as the Cross-Entropy Method (CEM), do not scale +well to complex high-dimensional environments. One of the key reasons for +underperformance is the lack of exploration, as these planning methods only aim +to maximize the cumulative extrinsic reward over the planning horizon. +Furthermore, planning inside the compact latent space in the absence of +observations makes it challenging to use curiosity-based intrinsic motivation. +We propose Curiosity CEM (CCEM), an improved version of the CEM algorithm for +encouraging exploration via curiosity. Our proposed method maximizes the sum of +state-action Q values over the planning horizon, in which these Q values +estimate the future extrinsic and intrinsic reward, hence encouraging reaching +novel observations. In addition, our model uses contrastive representation +learning to efficiently learn latent representations. Experiments on +image-based continuous control tasks from the DeepMind Control suite show that +CCEM is by a large margin more sample-efficient than previous MBRL algorithms +and compares favorably with the best model-free RL methods. + +
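The planning loop that CCEM builds on can be written in a few lines. The sketch below is a plain cross-entropy method planner; the scoring function is a toy placeholder for the sum of extrinsic and intrinsic (curiosity) Q-values used in the paper, and all hyper-parameters are illustrative.

```python
# Minimal CEM planning loop: sample action sequences, keep elites, refit, repeat.
import numpy as np

def cem_plan(score_fn, horizon=10, action_dim=2, iters=5, pop=500,
             elite_frac=0.1, seed=0):
    rng = np.random.default_rng(seed)
    mu = np.zeros((horizon, action_dim))
    sigma = np.ones((horizon, action_dim))
    n_elite = max(1, int(pop * elite_frac))
    for _ in range(iters):
        samples = rng.normal(mu, sigma, size=(pop, horizon, action_dim))
        scores = np.array([score_fn(s) for s in samples])
        elites = samples[np.argsort(scores)[-n_elite:]]   # highest-scoring sequences
        mu, sigma = elites.mean(axis=0), elites.std(axis=0) + 1e-6
    return mu                                             # planned sequence; execute mu[0]

target = np.ones((10, 2))                                  # toy objective
plan = cem_plan(lambda a: -np.sum((a - target) ** 2))
print(plan[0])
```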
+
+ comment: 7 pages, 4 figures +
+
+
+
+
+ + ♻ ☆ Why Do Facial Deepfake Detectors Fail? CCS 2023 + + +
+ Recent rapid advancements in deepfake technology have allowed the creation of +highly realistic fake media, such as video, image, and audio. These materials +pose significant challenges to human authentication, such as impersonation, +misinformation, or even a threat to national security. To keep pace with these +rapid advancements, several deepfake detection algorithms have been proposed, +leading to an ongoing arms race between deepfake creators and deepfake +detectors. Nevertheless, these detectors are often unreliable and frequently +fail to detect deepfakes. This study highlights the challenges they face in +detecting deepfakes, including (1) the pre-processing pipeline of artifacts and +(2) the fact that generators of new, unseen deepfake samples have not been +considered when building the defense models. Our work sheds light on the need +for further research and development in this field to create more robust and +reliable detectors. + +
+
+ comment: 5 pages, ACM ASIACCS 2023 +
+
+
+
+
+ + ♻ ☆ Improved theoretical guarantee for rank aggregation via spectral method + + +
+ Given pairwise comparisons between multiple items, how to rank them so that +the ranking matches the observations? This problem, known as rank aggregation, +has found many applications in sports, recommendation systems, and other web +applications. As it is generally NP-hard to find a global ranking that +minimizes the mismatch (known as the Kemeny optimization), we focus on the +Erd\"os-R\'enyi outliers (ERO) model for this ranking problem. Here, each +pairwise comparison is a corrupted copy of the true score difference. We +investigate spectral ranking algorithms that are based on unnormalized and +normalized data matrices. The key is to understand their performance in +recovering the underlying scores of each item from the observed data. This +reduces to deriving an entry-wise perturbation error bound between the top +eigenvectors of the unnormalized/normalized data matrix and its population +counterpart. By using the leave-one-out technique, we provide a sharper +$\ell_{\infty}$-norm perturbation bound of the eigenvectors and also derive an +error bound on the maximum displacement for each item, with only $\Omega(n\log +n)$ samples. Our theoretical analysis improves upon the state-of-the-art +results in terms of sample complexity, and our numerical experiments confirm +these theoretical findings. + +
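To convey the "top eigenvector recovers the scores" intuition, the small experiment below applies the classic Perron-eigenvector method to a noisy reciprocal comparison matrix. It is a textbook cousin of the estimators analysed in the paper, not the ERO-model estimator itself, and the noise model is purely illustrative.

```python
# Spectral ranking illustration: leading eigenvector of a noisy comparison matrix.
import numpy as np

rng = np.random.default_rng(0)
n = 30
w = np.exp(rng.normal(size=n))                 # positive ground-truth scores
A = w[:, None] / w[None, :]                    # ideal comparisons A_ij = w_i / w_j
A *= np.exp(0.05 * rng.normal(size=(n, n)))    # mild multiplicative noise

vals, vecs = np.linalg.eig(A)
lead = np.abs(np.real(vecs[:, np.argmax(np.real(vals))]))  # Perron eigenvector
print("correlation of log-scores with truth:",
      np.corrcoef(np.log(lead), np.log(w))[0, 1])          # close to 1
```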
+
+ comment: 29 pages, 6 figures +
+
+
+
+
+ + ♻ ☆ Deepfake in the Metaverse: Security Implications for Virtual Gaming, + Meetings, and Offices CCS 2023 + + +
+ The metaverse has gained significant attention from various industries due to +its potential to create a fully immersive and interactive virtual world. +However, the integration of deepfakes in the metaverse brings serious security +implications, particularly with regard to impersonation. This paper examines +the security implications of deepfakes in the metaverse, specifically in the +context of gaming, online meetings, and virtual offices. The paper discusses +how deepfakes can be used to impersonate in gaming scenarios, how online +meetings in the metaverse open the door for impersonation, and how virtual +offices in the metaverse lack physical authentication, making it easier for +attackers to impersonate someone. The implications of these security concerns +are discussed in relation to the confidentiality, integrity, and availability +(CIA) triad. The paper further explores related issues such as the darkverse, +and digital cloning, as well as regulatory and privacy concerns associated with +addressing security threats in the virtual world. + +
+
+ comment: 3 pages. Published to ACM ASIACCS 2023 workshop - The 2nd security + implications of Deepfakes and Cheapfakes +
+
+
+
+
+ + ♻ ☆ Objective-Agnostic Enhancement of Molecule Properties via Multi-Stage + VAE + + +
+ Variational autoencoder (VAE) is a popular method for drug discovery and +various architectures and pipelines have been proposed to improve its +performance. However, VAE approaches are known to suffer from poor manifold +recovery when the data lie on a low-dimensional manifold embedded in a higher +dimensional ambient space [Dai and Wipf, 2019]. The consequences of it in drug +discovery are somewhat under-explored. In this paper, we explore applying a +multi-stage VAE approach, that can improve manifold recovery on a synthetic +dataset, to the field of drug discovery. We experimentally evaluate our +multi-stage VAE approach using the ChEMBL dataset and demonstrate its ability +to improve the property statistics of generated molecules substantially from +pre-existing methods without incorporating property predictors into the +training pipeline. We further fine-tune our models on two curated and much +smaller molecule datasets that target different proteins. Our experiments show +an increase in the number of active molecules generated by the multi-stage VAE +in comparison to their one-stage equivalent. For each of the two tasks, our +baselines include methods that use learned property predictors to incorporate +target metrics directly into the training objective and we discuss +complications that arise with this methodology. + +
+
+ comment: arXiv admin note: text overlap with arXiv:2212.02750 +
+
+
+
+
+ + ♻ ☆ Fat-Shattering Dimension of $k$-fold Aggregations + + +
+ We provide estimates on the fat-shattering dimension of aggregation rules of +real-valued function classes. The latter consists of all ways of choosing $k$ +functions, one from each of the $k$ classes, and computing a pointwise function +of them, such as the median, mean, and maximum. The bound is stated in terms of +the fat-shattering dimensions of the component classes. For linear and affine +function classes, we provide a considerably sharper upper bound and a matching +lower bound, achieving, in particular, an optimal dependence on $k$. Along the +way, we improve several known results in addition to pointing out and +correcting a number of erroneous claims in the literature. + +
+
+
+
+
+
+
+
+ + Multimedia 4 + +
+
+
+ + ☆ SpeechMirror: A Multimodal Visual Analytics System for Personalized + Reflection of Online Public Speaking Effectiveness + + +
+ As communications are increasingly taking place virtually, the ability to +present well online is becoming an indispensable skill. Online speakers are +facing unique challenges in engaging with remote audiences. However, there has +been a lack of evidence-based analytical systems for people to comprehensively +evaluate online speeches and further discover possibilities for improvement. +This paper introduces SpeechMirror, a visual analytics system facilitating +reflection on a speech based on insights from a collection of online speeches. +The system estimates the impact of different speech techniques on effectiveness +and applies them to a speech to give users awareness of the performance of +speech techniques. A similarity recommendation approach based on speech factors +or script content supports guided exploration to expand knowledge of +presentation evidence and accelerate the discovery of speech delivery +possibilities. SpeechMirror provides intuitive visualizations and interactions +for users to understand speech factors. Among them, SpeechTwin, a novel +multimodal visual summary of speech, supports rapid understanding of critical +speech factors and comparison of different speech samples, and SpeechPlayer +augments the speech video by integrating visualization of the speaker's body +language with interaction, for focused analysis. The system utilizes +visualizations suited to the distinct nature of different speech factors for +user comprehension. The proposed system and visualization techniques were +evaluated with domain experts and amateurs, demonstrating usability for users +with low visualization literacy and its efficacy in assisting users to develop +insights for potential improvement. + +
+
+ comment: Main paper (11 pages, 6 figures) and Supplemental document (11 pages, + 11 figures). Accepted by VIS 2023 +
+
+
+
+
+ + ☆ Multimodal Fish Feeding Intensity Assessment in Aquaculture + + +
+ Fish feeding intensity assessment (FFIA) aims to evaluate the intensity
+change of fish appetite during the feeding process, which is vital in
+industrial aquaculture applications. The main challenges surrounding FFIA are
+two-fold. 1) robustness: existing work has mainly leveraged single-modality
+(e.g., vision, audio) methods, which have a high sensitivity to input noise. 2)
+efficiency: FFIA models are generally expected to be employed on devices. This
+presents a challenge in terms of computational efficiency. In this work, we
+first introduce an audio-visual dataset, called AV-FFIA. AV-FFIA consists of
+27,000 labeled audio and video clips that capture different levels of fish
+feeding intensity. To our knowledge, AV-FFIA is the first large-scale
+multimodal dataset for FFIA research. Then, we introduce a multi-modal approach
+for FFIA by leveraging single-modality pre-trained models and modality-fusion
+methods, with benchmark studies on AV-FFIA. Our experimental results indicate
+that the multi-modal approach substantially outperforms the single-modality
+based approach, especially in noisy environments. While multimodal approaches
+provide a performance gain for FFIA, they inherently increase the computational
+cost. To overcome this issue, we further present a novel unified model, termed
+as U-FFIA. U-FFIA is a single model capable of processing audio, visual, or
+audio-visual modalities, by leveraging modality dropout during training and
+knowledge distillation from single-modality pre-trained models. We demonstrate
+that U-FFIA can achieve performance better than or on par with the
+state-of-the-art modality-specific FFIA models, with significantly lower
+computational overhead. Our proposed U-FFIA approach enables a more robust and
+efficient method for FFIA, with the potential to contribute to improved
+management practices and sustainability in aquaculture.
+
+
+
+
+
+ + ☆ Spatial Perceptual Quality Aware Adaptive Volumetric Video Streaming + + +
+ Volumetric video offers a highly immersive viewing experience, but poses +challenges in ensuring quality of experience (QoE) due to its high bandwidth +requirements. In this paper, we explore the effect of viewing distance +introduced by six degrees of freedom (6DoF) spatial navigation on user's +perceived quality. By considering human visual resolution limitations, we +propose a visual acuity model that describes the relationship between the +virtual viewing distance and the tolerable boundary point cloud density. The +proposed model satisfies spatial visual requirements during 6DoF exploration. +Additionally, it dynamically adjusts quality levels to balance perceptual +quality and bandwidth consumption. Furthermore, we present a QoE model to +represent user's perceived quality at different viewing distances precisely. +Extensive experimental results demonstrate that, the proposed scheme can +effectively improve the overall average QoE by up to 26% over real networks and +user traces, compared to existing baselines. + +
+
+ comment: Accepted by IEEE Globecom 2023
+
+
+
+
+ + ♻ ☆ Symbolic Music Representations for Classification Tasks: A Systematic + Evaluation + + +
+ Music Information Retrieval (MIR) has seen a recent surge in deep +learning-based approaches, which often involve encoding symbolic music (i.e., +music represented in terms of discrete note events) in an image-like or +language like fashion. However, symbolic music is neither an image nor a +sentence, and research in the symbolic domain lacks a comprehensive overview of +the different available representations. In this paper, we investigate matrix +(piano roll), sequence, and graph representations and their corresponding +neural architectures, in combination with symbolic scores and performances on +three piece-level classification tasks. We also introduce a novel graph +representation for symbolic performances and explore the capability of graph +representations in global classification tasks. Our systematic evaluation shows +advantages and limitations of each input representation. Our results suggest +that the graph representation, as the newest and least explored among the three +approaches, exhibits promising performance, while being more light-weight in +training. + +
+
+ comment: To be published in the Proceedings of the 24th International Society + for Music Information Retrieval Conference (ISMIR 2023), Milan, Italy +
+
+
+
+
+
+
+
+
+ +
+
+
+ + Computation and Language 26 + +
+
+
+ + ☆ Distributional Data Augmentation Methods for Low Resource Language AAAI 2023 + + +
+ Text augmentation is a technique for constructing synthetic data from an +under-resourced corpus to improve predictive performance. Synthetic data +generation is common in numerous domains. However, recently text augmentation +has emerged in natural language processing (NLP) to improve downstream tasks. +One of the current state-of-the-art text augmentation techniques is easy data +augmentation (EDA), which augments the training data by injecting and replacing +synonyms and randomly permuting sentences. One major obstacle with EDA is the +need for versatile and complete synonym dictionaries, which cannot be easily +found in low-resource languages. To improve the utility of EDA, we propose two +extensions, easy distributional data augmentation (EDDA) and type specific +similar word replacement (TSSR), which uses semantic word context information +and part-of-speech tags for word replacement and augmentation. In an extensive +empirical evaluation, we show the utility of the proposed methods, measured by +F1 score, on two representative datasets in Swedish as an example of a +low-resource language. With the proposed methods, we show that augmented data +improve classification performances in low-resource settings. + +
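The distributional replacement idea can be sketched without a synonym dictionary: pick a nearest neighbour in an embedding space, optionally constrained to the same part-of-speech tag (the TSSR idea in spirit). The embedding table and POS tags below are tiny hand-made stand-ins for real distributional resources, not the paper's models.

```python
# Toy sketch of distribution-based, POS-constrained word replacement.
import numpy as np

emb = {
    "good": np.array([0.90, 0.10]), "great": np.array([0.85, 0.15]),
    "bad": np.array([-0.90, 0.10]), "movie": np.array([0.10, 0.90]),
    "film": np.array([0.12, 0.88]),
}
pos = {"good": "ADJ", "great": "ADJ", "bad": "ADJ", "movie": "NOUN", "film": "NOUN"}

def cosine(a, b):
    return float(a @ b / (np.linalg.norm(a) * np.linalg.norm(b)))

def replace_word(word, same_pos=True):
    if word not in emb:
        return word                       # out-of-vocabulary words are left unchanged
    candidates = [w for w in emb
                  if w != word and (not same_pos or pos[w] == pos[word])]
    return max(candidates, key=lambda w: cosine(emb[word], emb[w]))

print([replace_word(w) for w in "good movie".split()])   # ['great', 'film']
```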
+
+ comment: AAAI 2023 Workshop on Knowledge Augmented Methods for NLP +
+
+
+
+
+ + ☆ Reverse-Engineering Decoding Strategies Given Blackbox Access to a + Language Generation System + + +
+ Neural language models are increasingly deployed into APIs and websites that +allow a user to pass in a prompt and receive generated text. Many of these +systems do not reveal generation parameters. In this paper, we present methods +to reverse-engineer the decoding method used to generate text (i.e., top-$k$ or +nucleus sampling). Our ability to discover which decoding strategy was used has +implications for detecting generated text. Additionally, the process of +discovering the decoding strategy can reveal biases caused by selecting +decoding settings which severely truncate a model's predicted distributions. We +perform our attack on several families of open-source language models, as well +as on production systems (e.g., ChatGPT). + +
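One signal such an attack can exploit is easy to visualise: repeatedly sample the next token for a fixed prompt and inspect the empirical support. A hard cap on the number of distinct tokens suggests top-k truncation, while the cumulative probability of the observed support hints at nucleus (top-p) truncation. In the hedged sketch below, `sample_next_token` is a toy stand-in for a black-box generation API, not any specific production system.

```python
# Probe the empirical next-token support of a black-box sampler.
import random
from collections import Counter

def sample_next_token(prompt):
    # Toy black box that secretly applies top-k sampling with k = 3.
    vocab = ["the", "a", "an", "this", "that"]
    probs = [0.5, 0.3, 0.2, 0.0, 0.0]
    return random.choices(vocab, probs)[0]

def probe_support(prompt, n_samples=2000):
    return Counter(sample_next_token(prompt) for _ in range(n_samples))

counts = probe_support("Once upon a time")
print("distinct next tokens observed:", len(counts))
print(counts.most_common())
```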
+
+ comment: 6 pages, 4 figures, 3 tables. Also, 5 page appendix. Accepted to INLG + 2023 +
+
+
+
+
+ + ☆ Speech Emotion Recognition with Distilled Prosodic and Linguistic Affect + Representations + + +
+ We propose EmoDistill, a novel speech emotion recognition (SER) framework +that leverages cross-modal knowledge distillation during training to learn +strong linguistic and prosodic representations of emotion from speech. During +inference, our method only uses a stream of speech signals to perform unimodal +SER thus reducing computation overhead and avoiding run-time transcription and +prosodic feature extraction errors. During training, our method distills +information at both embedding and logit levels from a pair of pre-trained +Prosodic and Linguistic teachers that are fine-tuned for SER. Experiments on +the IEMOCAP benchmark demonstrate that our method outperforms other unimodal +and multimodal techniques by a considerable margin, and achieves +state-of-the-art performance of 77.49% unweighted accuracy and 78.91% weighted +accuracy. Detailed ablation studies demonstrate the impact of each component of +our method. + +
+
+ comment: Under review +
+
+
+
+
+ + ☆ Leveraging Large Language Models for Exploiting ASR Uncertainty + + +
+ While large language models excel in a variety of natural language processing +(NLP) tasks, to perform well on spoken language understanding (SLU) tasks, they +must either rely on off-the-shelf automatic speech recognition (ASR) systems +for transcription, or be equipped with an in-built speech modality. This work +focuses on the former scenario, where LLM's accuracy on SLU tasks is +constrained by the accuracy of a fixed ASR system on the spoken input. +Specifically, we tackle speech-intent classification task, where a high +word-error-rate can limit the LLM's ability to understand the spoken intent. +Instead of chasing a high accuracy by designing complex or specialized +architectures regardless of deployment costs, we seek to answer how far we can +go without substantially changing the underlying ASR and LLM, which can +potentially be shared by multiple unrelated tasks. To this end, we propose +prompting the LLM with an n-best list of ASR hypotheses instead of only the +error-prone 1-best hypothesis. We explore prompt-engineering to explain the +concept of n-best lists to the LLM; followed by the finetuning of Low-Rank +Adapters on the downstream tasks. Our approach using n-best lists proves to be +effective on a device-directed speech detection task as well as on a keyword +spotting task, where systems using n-best list prompts outperform those using +1-best ASR hypothesis; thus paving the way for an efficient method to exploit +ASR uncertainty via LLMs for speech-based applications. + +
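The core prompting idea lends itself to a short sketch: present the LLM with the n-best ASR hypotheses rather than only the 1-best transcript. The prompt wording below is an illustrative assumption, not the paper's template.

```python
# Build a prompt from an n-best list of ASR hypotheses.
def build_nbest_prompt(nbest, task="intent classification"):
    lines = [f"Below are {len(nbest)} speech recognition hypotheses for the same "
             "utterance, ordered from most to least likely:"]
    lines += [f"{i + 1}. {hyp}" for i, hyp in enumerate(nbest)]
    lines.append("Any hypothesis may contain recognition errors. "
                 f"Perform {task} for the user's request and answer with one label.")
    return "\n".join(lines)

nbest = ["play some jass music", "play some jazz music", "play some chas music"]
print(build_nbest_prompt(nbest))
```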
+
+
+
+
+ + ☆ Neurons in Large Language Models: Dead, N-gram, Positional + + +
+ We analyze a family of large language models in a manner lightweight enough
+to be done on a single GPU. Specifically, we focus on the OPT family of models
+ranging from 125m to 66b parameters and rely only on whether an FFN neuron is
+activated or not. First, we find that the early part of the network is sparse
+and represents many discrete features. Here, many neurons (more than 70% in
+some layers of the 66b model) are "dead", i.e. they never activate on a large
+collection of diverse data. At the same time, many of the alive neurons are
+reserved for discrete features and act as token and n-gram detectors.
+Interestingly, their corresponding FFN updates not only promote next token
+candidates as could be expected, but also explicitly focus on removing the
+information about the tokens that trigger them, i.e., the current input. To the
+best of our knowledge, this is the first example of mechanisms specialized at
+removing (rather than adding) information from the residual stream. With scale,
+models become more sparse in the sense that they have more dead neurons and
+token detectors. Finally, some neurons are positional: whether they are
+activated depends largely (or solely) on position and less so (or not at all)
+on textual data. We find that smaller models have sets of neurons acting as
+position range indicators while larger models operate in a less explicit
+manner.
+
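A rough version of such a dead-neuron probe can be written with forward hooks. The sketch below records which FFN units of OPT-125m never produce a positive fc1 output on a tiny text sample (OPT uses ReLU, so this equals "never activates"). The module paths follow the Hugging Face OPT layout and should be checked against the model version in use; and a unit that is silent on two sentences is not necessarily dead, whereas the paper uses a large, diverse data collection.

```python
# Hedged sketch: which FFN units never fire on a small sample of text?
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

name = "facebook/opt-125m"
tok = AutoTokenizer.from_pretrained(name)
model = AutoModelForCausalLM.from_pretrained(name).eval()

ffn_dim = model.config.ffn_dim
ever_fired = [torch.zeros(ffn_dim, dtype=torch.bool)
              for _ in model.model.decoder.layers]

def make_hook(layer_idx):
    def hook(module, inputs, output):
        fired = (output > 0).reshape(-1, output.shape[-1]).any(dim=0)
        ever_fired[layer_idx] |= fired
    return hook

for i, layer in enumerate(model.model.decoder.layers):
    layer.fc1.register_forward_hook(make_hook(i))

texts = ["The quick brown fox jumps over the lazy dog.",
         "def fib(n): return n if n < 2 else fib(n - 1) + fib(n - 2)"]
with torch.no_grad():
    for t in texts:
        model(**tok(t, return_tensors="pt"))

for i, fired in enumerate(ever_fired):
    pct = 100.0 * (~fired).float().mean().item()
    print(f"layer {i:2d}: {pct:.1f}% of FFN units silent on this tiny sample")
```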
+
+
+
+
+ + ☆ FaNS: a Facet-based Narrative Similarity Metric + + +
+ Similar Narrative Retrieval is a crucial task since narratives are essential +for explaining and understanding events, and multiple related narratives often +help to create a holistic view of the event of interest. To accurately identify +semantically similar narratives, this paper proposes a novel narrative +similarity metric called Facet-based Narrative Similarity (FaNS), based on the +classic 5W1H facets (Who, What, When, Where, Why, and How), which are extracted +by leveraging the state-of-the-art Large Language Models (LLMs). Unlike +existing similarity metrics that only focus on overall lexical/semantic match, +FaNS provides a more granular matching along six different facets independently +and then combines them. To evaluate FaNS, we created a comprehensive dataset by +collecting narratives from AllSides, a third-party news portal. Experimental +results demonstrate that the FaNS metric exhibits a higher correlation (37\% +higher) than traditional text similarity metrics that directly measure the +lexical/semantic match between narratives, demonstrating its effectiveness in +comparing the finer details between a pair of narratives. + +
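The facet-wise combination can be illustrated with a minimal scorer: compare two narratives facet by facet (5W1H) and average the per-facet scores instead of computing one overall text similarity. Facet extraction (done with LLMs in the paper) and the string-overlap scorer below are simplified stand-ins.

```python
# Facet-by-facet similarity sketch over the classic 5W1H facets.
from difflib import SequenceMatcher

FACETS = ["who", "what", "when", "where", "why", "how"]

def facet_similarity(facets_a, facets_b):
    scores = {f: SequenceMatcher(None,
                                 facets_a.get(f, "").lower(),
                                 facets_b.get(f, "").lower()).ratio()
              for f in FACETS}
    scores["overall"] = sum(scores[f] for f in FACETS) / len(FACETS)
    return scores

n1 = {"who": "the city council", "what": "approved a new transit budget",
      "when": "Tuesday", "where": "Springfield", "why": "to expand bus service",
      "how": "by a 7-2 vote"}
n2 = {"who": "city council members", "what": "passed the transit budget",
      "when": "on Tuesday", "where": "Springfield", "why": "to grow bus coverage",
      "how": "with a 7-2 vote"}
print(facet_similarity(n1, n2))
```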
+
+
+
+
+ + ☆ MMHQA-ICL: Multimodal In-context Learning for Hybrid Question Answering + over Text, Tables and Images + + +
+ In the real world, knowledge often exists in a multimodal and heterogeneous
+form. Question answering over hybrid data types, including text, tables, and
+images, is a challenging task (MMHQA). Recently, with the rise of large
+language models (LLM), in-context learning (ICL) has become the most popular
+way to solve QA problems. We propose the MMHQA-ICL framework to address this
+problem, which includes a stronger heterogeneous data retriever and an image
+captioning module. Most importantly, we propose a Type-specific In-context
+Learning Strategy for MMHQA, enabling LLMs to leverage their powerful
+performance in this task. We are the first to use an end-to-end LLM prompting
+method for this task. Experimental results demonstrate that our framework
+outperforms all baselines and methods trained on the full dataset, achieving
+state-of-the-art results under the few-shot setting on the MultimodalQA
+dataset.
+
+
+
+
+
+ + ☆ SeaEval for Multilingual Foundation Models: From Cross-Lingual Alignment + to Cultural Reasoning + + +
+ We present SeaEval, a benchmark for multilingual foundation models. In +addition to characterizing how these models understand and reason with natural +language, we also investigate how well they comprehend cultural practices, +nuances, and values. Alongside standard accuracy metrics, we investigate the +brittleness of foundation models in the dimensions of semantics and +multilinguality. Our analyses span both open-sourced and closed models, leading +to empirical results across classic NLP tasks, reasoning, and cultural +comprehension. Key findings indicate (1) Most models exhibit varied behavior +when given paraphrased instructions. (2) Many models still suffer from exposure +bias (e.g., positional bias, majority label bias). (3) For questions rooted in +factual, scientific, and commonsense knowledge, consistent responses are +expected across multilingual queries that are semantically equivalent. Yet, +most models surprisingly demonstrate inconsistent performance on these queries. +(4) Multilingually-trained models have not attained "balanced multilingual" +capabilities. Our endeavors underscore the need for more generalizable semantic +representations and enhanced multilingual contextualization. SeaEval can serve +as a launchpad for more thorough investigations and evaluations for +multilingual and multicultural scenarios. + +
+
+ comment: 15 pages, 7 figures +
+
+
+
+
+ + ☆ Data Augmentation for Conversational AI + + +
+ Advancements in conversational systems have revolutionized information
+access, surpassing the limitations of single queries. However, developing
+dialogue systems requires a large amount of training data, which is a challenge
+in low-resource domains and languages. Traditional data collection methods like
+crowd-sourcing are labor-intensive and time-consuming, making them ineffective
+in this context. Data augmentation (DA) is an effective approach to alleviate
+the data scarcity problem in conversational systems. This tutorial provides a
+comprehensive and up-to-date overview of DA approaches in the context of
+conversational systems. It highlights recent advances in conversation
+augmentation, open domain and task-oriented conversation generation, and
+different paradigms of evaluating these models. We also discuss current
+challenges and future directions in order to help researchers and practitioners
+to further advance the field in this area.
+
+
+
+
+
+ + ☆ Towards Better Multi-modal Keyphrase Generation via Visual Entity + Enhancement and Multi-granularity Image Noise Filtering + + +
+ Multi-modal keyphrase generation aims to produce a set of keyphrases that +represent the core points of the input text-image pair. In this regard, +dominant methods mainly focus on multi-modal fusion for keyphrase generation. +Nevertheless, there are still two main drawbacks: 1) only a limited number of +sources, such as image captions, can be utilized to provide auxiliary +information. However, they may not be sufficient for the subsequent keyphrase +generation. 2) the input text and image are often not perfectly matched, and +thus the image may introduce noise into the model. To address these +limitations, in this paper, we propose a novel multi-modal keyphrase generation +model, which not only enriches the model input with external knowledge, but +also effectively filters image noise. First, we introduce external visual +entities of the image as the supplementary input to the model, which benefits +the cross-modal semantic alignment for keyphrase generation. Second, we +simultaneously calculate an image-text matching score and image region-text +correlation scores to perform multi-granularity image noise filtering. +Particularly, we introduce the correlation scores between image regions and +ground-truth keyphrases to refine the calculation of the previously-mentioned +correlation scores. To demonstrate the effectiveness of our model, we conduct +several groups of experiments on the benchmark dataset. + Experimental results and in-depth analyses show that our model achieves the +state-of-the-art performance. Our code is available on +https://github.com/DeepLearnXMU/MM-MKP. + +
+
+ comment: Accepted In Proceedings of the 31st ACM International Conference on + Multimedia (MM' 23) +
+
+
+
+
+ + ☆ EPA: Easy Prompt Augmentation on Large Language Models via Multiple + Sources and Multiple Targets + + +
+ Large language models (LLMs) have shown promising performance on various NLP
+tasks via task prompting. Their performance can be further improved by
+appending task demonstrations to the head of the prompt, and more
+demonstrations usually lead to better performance. However, asking users to
+write the demonstrations can be cumbersome. As a simple yet cost-effective
+workaround, this paper proposes a novel method called EPA (\textbf{E}asy
+\textbf{P}rompt \textbf{A}ugmentation)\footnote{While this paper considers
+augmenting prompts via demonstrations, we name it EPA as the name EDA is
+already taken by a well-known NLP method \citep{wei-zou-2019-eda}.} that
+effectively minimizes user efforts in writing demonstrations while improving
+the model performance at the same time. EPA achieves these goals by
+automatically augmenting the demonstrations with multiple sources/targets,
+which are paraphrases of each other. This is well motivated as augmenting
+data via paraphrasing effectively improves neural language models. EPA thus
+employs paraphrasing as an augmentation method for in-context learning.
+Extensive experiments indicate that EPA effectively improves both NLU and NLG
+tasks, ranging from natural language inference to machine translation in
+translating tens of languages.\footnote{Code and data will be released upon
+publication.}
+
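The augmentation pattern itself is simple to sketch: each demonstration carries several paraphrases of its source and of its target, and every source/target combination becomes an extra in-context example. The paraphrases below are hand-written stand-ins; in practice they would come from a paraphraser or multi-reference data.

```python
# Expand demonstrations into all source/target paraphrase combinations.
def augment_demonstrations(demos):
    augmented = []
    for sources, targets in demos:
        for s in sources:
            for t in targets:
                augmented.append((s, t))
    return augmented

demos = [
    (["How old are you?", "What is your age?"],             # source paraphrases (English)
     ["Quel âge avez-vous ?", "Quel est votre âge ?"]),      # target paraphrases (French)
]
for src, tgt in augment_demonstrations(demos):               # 1 demo -> 4 in-context examples
    print(f"English: {src}\nFrench: {tgt}\n")
```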
+
+
+
+
+ + ☆ Toward Reproducing Network Research Results Using Large Language Models + + +
+ Reproducing research results in the networking community is important for +both academia and industry. The current best practice typically resorts to +three approaches: (1) looking for publicly available prototypes; (2) contacting +the authors to get a private prototype; and (3) manually implementing a +prototype following the description of the publication. However, most published +network research does not have public prototypes, and private prototypes are +hard to obtain. As such, most reproduction efforts are spent on manual +implementation based on the publications, which is both time- and labor-consuming and error-prone. In this paper, we boldly propose reproducing network +research results using the emerging large language models (LLMs). In +particular, we first prove its feasibility with a small-scale experiment, in +which four students with essential networking knowledge each reproduce a +different networking system published in prominent conferences and journals by +prompt engineering ChatGPT. We report the experiment's observations and lessons +and discuss future open research questions of this proposal. This work raises +no ethical issues. + +
+
+
+
+
+ + ☆ Analysis of Disinformation and Fake News Detection Using Fine-Tuned + Large Language Model + + +
+ The paper considers the possibility of fine-tuning the Llama 2 large language +model (LLM) for disinformation analysis and fake news detection. For +fine-tuning, a PEFT/LoRA-based approach was used. In the study, the model was +fine-tuned for the following tasks: analysing a text to reveal +disinformation and propaganda narratives, fact checking, fake news detection, +manipulation analytics, and extracting named entities with their sentiments. The +obtained results show that the fine-tuned Llama 2 model can perform a deep +analysis of texts and reveal complex styles and narratives. Extracted +sentiments for named entities can be used as predictive features in +supervised machine learning models. + +
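For readers unfamiliar with the PEFT/LoRA setup mentioned above, a minimal configuration with the Hugging Face `peft` library might look like the sketch below. The base checkpoint, rank, and target modules are illustrative assumptions, not the values used in the paper.

```python
# Illustrative PEFT/LoRA setup for Llama 2 (hyper-parameters are assumptions).
from transformers import AutoModelForCausalLM, AutoTokenizer
from peft import LoraConfig, get_peft_model

base = "meta-llama/Llama-2-7b-hf"              # gated checkpoint; requires access
tokenizer = AutoTokenizer.from_pretrained(base)
model = AutoModelForCausalLM.from_pretrained(base)

lora_cfg = LoraConfig(
    r=16, lora_alpha=32, lora_dropout=0.05,
    target_modules=["q_proj", "v_proj"],       # attach adapters to attention projections
    task_type="CAUSAL_LM",
)
model = get_peft_model(model, lora_cfg)
model.print_trainable_parameters()             # only the small LoRA adapters are trained
# ...then fine-tune with a standard Trainer on the task-specific instruction data.
```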
+
+
+
+
+ + ☆ Code-Style In-Context Learning for Knowledge-Based Question Answering + + +
+ Current methods for Knowledge-Based Question Answering (KBQA) usually rely on +complex training techniques and model frameworks, leading to many limitations +in practical applications. Recently, the emergence of In-Context Learning (ICL) +capabilities in Large Language Models (LLMs) provides a simple and +training-free semantic parsing paradigm for KBQA: Given a small number of +questions and their labeled logical forms as demo examples, LLMs can understand +the task intent and generate the logic form for a new question. However, +current powerful LLMs have little exposure to logic forms during pre-training, +resulting in a high format error rate. To solve this problem, we propose a +code-style in-context learning method for KBQA, which converts the generation +process of unfamiliar logical forms into the more familiar code generation +process for LLMs. Experimental results on three mainstream datasets show that +our method dramatically mitigates the formatting error problem in generating +logic forms while achieving a new SOTA on WebQSP, GrailQA, and GraphQ under the +few-shot setting. + +
+
+ comment: work in progress +
+
+
+
+
+ + ☆ Embedding structure matters: Comparing methods to adapt multilingual + vocabularies to new languages + + +
+ Pre-trained multilingual language models underpin a large portion of modern +NLP tools outside of English. A strong baseline for specializing these models +for specific languages is Language-Adaptive Pre-Training (LAPT). However, +retaining a large cross-lingual vocabulary and embedding matrix comes at +considerable excess computational cost during adaptation. In this study, we +propose several simple techniques to replace a cross-lingual vocabulary with a +compact, language-specific one. Namely, we address strategies for +re-initializing the token embedding matrix after vocabulary specialization. We +then provide a systematic experimental comparison of our techniques, in +addition to the recently-proposed Focus method. We demonstrate that: 1) +Embedding-replacement techniques in the monolingual transfer literature are +inadequate for adapting multilingual models. 2) Replacing cross-lingual +vocabularies with smaller specialized ones provides an efficient method to +improve performance in low-resource languages. 3) Simple embedding +re-initialization techniques based on script-wise sub-distributions rival +techniques such as Focus, which rely on similarity scores obtained from an +auxiliary model. + +
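One way to read the "script-wise sub-distribution" re-initialization mentioned above is as sampling new token embeddings from statistics of the original embeddings that share the same script. A rough numpy sketch under that assumption follows; `script_of` is a hypothetical token-to-script helper, not part of the paper's code.

```python
# Sketch: build a smaller, language-specific embedding matrix from a multilingual one.
# `script_of` (token -> script name) is an assumed helper, e.g. built on unicodedata.
import numpy as np

def reinit_embeddings(old_emb, old_vocab, new_vocab, script_of, rng=np.random):
    new_emb = np.empty((len(new_vocab), old_emb.shape[1]), dtype=old_emb.dtype)
    by_script = {}
    for tok, idx in old_vocab.items():            # group original embeddings by script
        by_script.setdefault(script_of(tok), []).append(old_emb[idx])
    stats = {s: (np.mean(v, axis=0), np.std(v, axis=0)) for s, v in by_script.items()}
    for tok, idx in new_vocab.items():
        if tok in old_vocab:                      # shared tokens keep their embedding
            new_emb[idx] = old_emb[old_vocab[tok]]
        else:                                     # new tokens sample from their script's stats
            mean, std = stats.get(script_of(tok), (old_emb.mean(0), old_emb.std(0)))
            new_emb[idx] = rng.normal(mean, std)
    return new_emb
```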
+
+
+
+
+ + ☆ FIAT: Fusing learning paradigms with Instruction-Accelerated Tuning + + +
+ Learning paradigms for large language models (LLMs) currently tend to fall +within either in-context learning (ICL) or full fine-tuning. Each of these +comes with their own trade-offs based on available data, model size, compute +cost, ease-of-use, and final quality with neither solution performing well +across-the-board. In this article, we first describe ICL and fine-tuning +paradigms in a way that highlights their natural connections. Based on these +connections, we propose a new learning paradigm called FIAT that fuses the best +of these paradigms together, enabling prompt-engineered instructions and +chain-of-thought reasoning with the very largest models while also using +similar methods to perform parameter updates on a modestly-sized LLM with +parameter-efficient tuning. We evaluate FIAT's effectiveness on a variety of +multilingual tasks and observe that FIAT performs better than both ICL and +fine-tuning at scales ranging from 100-10,000 training examples. We hope that +FIAT provides a practical way of harnessing the full potential of LLMs without +needing to make a hard choice between learning paradigms. + +
+
+
+
+
+ + ☆ MADLAD-400: A Multilingual And Document-Level Large Audited Dataset + + +
+ We introduce MADLAD-400, a manually audited, general domain 3T token +monolingual dataset based on CommonCrawl, spanning 419 languages. We discuss +the limitations revealed by self-auditing MADLAD-400, and the role data +auditing had in the dataset creation process. We then train and release a +10.7B-parameter multilingual machine translation model on 250 billion tokens +covering over 450 languages using publicly available data, and find that it is +competitive with models that are significantly larger, and report the results +on different domains. In addition, we train an 8B-parameter language model, and +assess the results on few-shot translation. We make the baseline models +available to the research community. + +
+
+ comment: Preprint +
+
+
+
+
+ + ☆ Exploring Large Language Models for Communication Games: An Empirical + Study on Werewolf + + +
+ Communication games, which we refer to as incomplete information games that +heavily depend on natural language communication, hold significant research +value in fields such as economics, social science, and artificial intelligence. +In this work, we explore the problem of how to engage large language models +(LLMs) in communication games, and in response, propose a tuning-free +framework. Our approach keeps LLMs frozen, and relies on retrieval of and +reflection on past communications and experiences for improvement. An empirical +study on the representative and widely-studied communication game, +``Werewolf'', demonstrates that our framework can effectively play the Werewolf +game without tuning the parameters of the LLMs. More importantly, strategic +behaviors begin to emerge in our experiments, suggesting that it will be a +fruitful journey to engage LLMs in communication games and associated domains. + +
+
+ comment: 23 pages, 5 figures and 4 tables +
+
+
+
+
+ + ☆ Efficient Finetuning Large Language Models For Vietnamese Chatbot + + +
+ Large language models (LLMs), such as GPT-4, PaLM, and LLaMa, have been shown +to achieve remarkable performance across a variety of natural language tasks. +Recent advancements in instruction tuning equip LLMs with the ability to follow +users' instructions and produce human-like responses. However, the high costs +associated with training and implementing LLMs pose challenges to academic +research. Furthermore, the availability of pretrained LLMs and instruction-tuning +datasets for the Vietnamese language is limited. To tackle these concerns, we +leverage large-scale instruction-following datasets from open-source projects, +namely Alpaca, GPT4All, and Chat-Doctor, which cover the general domain and the +specific medical domain. To the best of our knowledge, these are the first +instructional datasets for Vietnamese. Subsequently, we utilize +parameter-efficient tuning through Low-Rank Adaptation (LoRA) on two open LLMs: +Bloomz (Multilingual) and GPTJ-6B (Vietnamese), resulting in four models: +Bloomz-Chat, Bloomz-Doctor, GPTJ-Chat, GPTJ-Doctor. Finally, we assess the +effectiveness of our methodology on a per-sample basis, taking into +consideration the helpfulness, relevance, accuracy, and level of detail of their +responses. This evaluation process entails the utilization of GPT-4 as an +automated scoring mechanism. Despite utilizing a low-cost setup, our method +demonstrates about 20-30\% improvement over the original models in our +evaluation tasks. + +
+
+ comment: arXiv admin note: text overlap with arXiv:2304.08177, + arXiv:2303.16199 by other authors +
+
+
+
+
+ + ♻ ☆ LLMatic: Neural Architecture Search via Large Language Models and + Quality Diversity Optimization + + +
+ Large Language Models (LLMs) have emerged as powerful tools capable of +accomplishing a broad spectrum of tasks. Their abilities span numerous areas, +and one area where they have made a significant impact is in the domain of code +generation. In this context, we view LLMs as mutation and crossover tools. +Meanwhile, Quality-Diversity (QD) algorithms are known to discover diverse and +robust solutions. By merging the code-generating abilities of LLMs with the +diversity and robustness of QD solutions, we introduce LLMatic, a Neural +Architecture Search (NAS) algorithm. While LLMs struggle to conduct NAS +directly through prompts, LLMatic uses a procedural approach, leveraging QD for +prompts and network architecture to create diverse and highly performant +networks. We test LLMatic on the CIFAR-10 image classification benchmark, +demonstrating that it can produce competitive networks with just $2,000$ +searches, even without prior knowledge of the benchmark domain or exposure to +any previous top-performing models for the benchmark. + +
+
+
+
+
+ + ♻ ☆ Large Language Models Can be Lazy Learners: Analyze Shortcuts in + In-Context Learning + + +
+ Large language models (LLMs) have recently shown great potential for +in-context learning, where LLMs learn a new task simply by conditioning on a +few input-label pairs (prompts). Despite their potential, our understanding of +the factors influencing end-task performance and the robustness of in-context +learning remains limited. This paper aims to bridge this knowledge gap by +investigating the reliance of LLMs on shortcuts or spurious correlations within +prompts. Through comprehensive experiments on classification and extraction +tasks, we reveal that LLMs are "lazy learners" that tend to exploit shortcuts +in prompts for downstream tasks. Additionally, we uncover a surprising finding +that larger models are more likely to utilize shortcuts in prompts during +inference. Our findings provide a new perspective on evaluating robustness in +in-context learning and pose new challenges for detecting and mitigating the +use of shortcuts in prompts. + +
+
+
+
+
+ + ♻ ☆ Deep Emotion Recognition in Textual Conversations: A Survey + + +
+ While Emotion Recognition in Conversations (ERC) has seen a tremendous +advancement in the last few years, new applications and implementation +scenarios present novel challenges and opportunities. These range from +leveraging the conversational context, speaker and emotion dynamics modelling, +to interpreting common sense expressions, informal language and sarcasm, +addressing challenges of real time ERC, recognizing emotion causes, different +taxonomies across datasets, multilingual ERC to interpretability. This survey +starts by introducing ERC, elaborating on the challenges and opportunities +pertaining to this task. It proceeds with a description of the emotion +taxonomies and a variety of ERC benchmark datasets employing such taxonomies. +This is followed by descriptions of the most prominent works in ERC with +explanations of the Deep Learning architectures employed. Then, it provides +advisable ERC practices towards better frameworks, elaborating on methods to +deal with subjectivity in annotations and modelling and methods to deal with +the typically unbalanced ERC datasets. Finally, it presents systematic review +tables comparing several works regarding the methods used and their +performance. The survey highlights the advantage of leveraging techniques to +address unbalanced data, the exploration of mixed emotions and the benefits of +incorporating annotation subjectivity in the learning phase. + +
+
+
+
+
+ + ♻ ☆ Can ChatGPT Forecast Stock Price Movements? Return Predictability and + Large Language Models + + +
+ We examine the potential of ChatGPT and other large language models in +predicting stock market returns using news headlines. We use ChatGPT to assess +whether each headline is good, bad, or neutral for firms' stock prices. We +document a significantly positive correlation between ChatGPT scores and +subsequent daily stock returns. We find that ChatGPT outperforms traditional +sentiment analysis methods. More basic models such as GPT-1, GPT-2, and BERT +cannot accurately forecast returns, indicating return predictability is an +emerging capacity of complex language models. Long-short strategies based on +ChatGPT-4 deliver the highest Sharpe ratio. Furthermore, we find predictability +in both small and large stocks, suggesting market underreaction to company +news. Predictability is stronger among smaller stocks and stocks with bad news, +consistent with limits-to-arbitrage also playing an important role. Finally, we +propose a new method to evaluate and understand the models' reasoning +capabilities. Overall, our results suggest that incorporating advanced language +models into the investment decision-making process can yield more accurate +predictions and enhance the performance of quantitative trading strategies. + +
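A bare-bones version of the headline-scoring setup described above might look like the sketch below; the prompt wording and the `chat` wrapper are assumptions for illustration, not the authors' exact prompts or pipeline.

```python
# Illustrative headline scoring for a long-short signal (assumed prompt and wrapper).
def score_headline(chat, firm, headline):
    prompt = (
        f"Is this headline good, bad, or neutral news for the stock price of {firm}? "
        f"Answer with one word.\n\nHeadline: {headline}"
    )
    answer = chat(prompt).strip().lower()
    return {"good": 1, "bad": -1}.get(answer, 0)   # +1 long, -1 short, 0 skip

def daily_signal(chat, firm, headlines):
    # average headline scores for a firm on a given day; sign drives the position
    scores = [score_headline(chat, firm, h) for h in headlines]
    return sum(scores) / len(scores) if scores else 0.0
```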
+
+ comment: Previously posted in SSRN + https://papers.ssrn.com/sol3/papers.cfm?abstract_id=4412788 +
+
+
+
+
+ + ♻ ☆ Predicting Word Learning in Children from the Performance of Computer + Vision Systems + + +
+ For human children as well as machine learning systems, a key challenge in +learning a word is linking the word to the visual phenomena it describes. We +explore this aspect of word learning by using the performance of computer +vision systems as a proxy for the difficulty of learning a word from visual +cues. We show that the age at which children acquire different categories of +words is correlated with the performance of visual classification and +captioning systems, over and above the expected effects of word frequency. The +performance of the computer vision systems is correlated with human judgments +of the concreteness of words, which are in turn a predictor of children's word +learning, suggesting that these models are capturing the relationship between +words and visual phenomena. + +
+
+ comment: CogSci 2023 +
+
+
+
+
+ + ♻ ☆ Towards Trustworthy Explanation: On Causal Rationalization ICML + + +
+ With recent advances in natural language processing, rationalization becomes +an essential self-explaining diagram to disentangle the black box by selecting +a subset of input texts to account for the major variation in prediction. Yet, +existing association-based approaches on rationalization cannot identify true +rationales when two or more snippets are highly inter-correlated and thus +provide a similar contribution to prediction accuracy, so-called spuriousness. +To address this limitation, we novelly leverage two causal desiderata, +non-spuriousness and efficiency, into rationalization from the causal inference +perspective. We formally define a series of probabilities of causation based on +a newly proposed structural causal model of rationalization, with its +theoretical identification established as the main component of learning +necessary and sufficient rationales. The superior performance of the proposed +causal rationalization is demonstrated on real-world review and medical +datasets with extensive experiments compared to state-of-the-art methods. + +
+
+ comment: In Proceedings of the 40th International Conference on Machine + Learning (ICML) GitHub Repository: + https://github.com/onepounchman/Causal-Retionalization +
+
+
+
+
+ + ♻ ☆ Evaluating Large Language Models on Graphs: Performance Insights and + Comparative Analysis + + +
+ Large Language Models (LLMs) have garnered considerable interest within both +academia and industry. Yet, the application of LLMs to graph data remains +under-explored. In this study, we evaluate the capabilities of four LLMs in +addressing several analytical problems with graph data. We employ four distinct +evaluation metrics: Comprehension, Correctness, Fidelity, and Rectification. +Our results show that: 1) LLMs effectively comprehend graph data in natural +language and reason with graph topology. 2) GPT models can generate logical and +coherent results, outperforming alternatives in correctness. 3) All examined +LLMs face challenges in structural reasoning, with techniques like zero-shot +chain-of-thought and few-shot prompting showing diminished efficacy. 4) GPT +models often produce erroneous answers in multi-answer tasks, raising concerns +about fidelity. 5) GPT models exhibit elevated confidence in their outputs, +potentially hindering their rectification capacities. Notably, GPT-4 has +demonstrated the capacity to rectify responses from GPT-3.5-turbo and its own +previous iterations. The code is available at: +https://github.com/Ayame1006/LLMtoGraph. + +
+
+ comment: 12 pages, 1 figure +
+
+
+
+
+
+
+
+ + Computer Vision and Pattern Recognition 5 + +
+
+
+ + ☆ How to Evaluate Semantic Communications for Images with ViTScore Metric? + + +
+ Semantic communications (SC) are expected to be a new paradigm shift that +catalyzes next-generation communication, whose main concerns shift from +accurate bit transmission to effective semantic information exchange in +communications. However, the previous and widely-used metrics for images are +not applicable for evaluating image semantic similarity in SC. Classical +metrics to measure the similarity between two images usually rely on the pixel +level or the structural level, such as the PSNR and the MS-SSIM. +Straightforwardly using some tailored metrics based on deep-learning methods in +the CV community, such as the LPIPS, is infeasible for SC. To tackle this, inspired +by BERTScore in the NLP community, we propose a novel metric for evaluating image +semantic similarity, named Vision Transformer Score (ViTScore). We prove +theoretically that ViTScore has 3 important properties, including symmetry, +boundedness, and normalization, which make ViTScore convenient and intuitive +for image measurement. To evaluate the performance of ViTScore, we compare +ViTScore with 3 typical metrics (PSNR, MS-SSIM, and LPIPS) through 5 classes of +experiments. Experimental results demonstrate that ViTScore can better evaluate +image semantic similarity than the other 3 typical metrics, which indicates +that ViTScore is an effective performance metric when deployed in SC scenarios. + +
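Read as a BERTScore analogue over image patches, a metric of this kind could be sketched roughly as below. This is our approximation from the description above, not the official ViTScore implementation, and the checkpoint is an arbitrary public ViT.

```python
# Sketch of a BERTScore-style score over ViT patch embeddings (an approximation
# of the idea; the released ViTScore may differ in detail).
import torch
from transformers import ViTImageProcessor, ViTModel

processor = ViTImageProcessor.from_pretrained("google/vit-base-patch16-224-in21k")
vit = ViTModel.from_pretrained("google/vit-base-patch16-224-in21k").eval()

@torch.no_grad()
def vit_score(img_a, img_b):                       # PIL images or numpy arrays
    def patches(img):
        inputs = processor(images=img, return_tensors="pt")
        hidden = vit(**inputs).last_hidden_state[0, 1:]   # drop the [CLS] token
        return torch.nn.functional.normalize(hidden, dim=-1)
    a, b = patches(img_a), patches(img_b)
    sim = a @ b.T                                  # pairwise cosine similarities
    recall = sim.max(dim=1).values.mean()          # best match for each patch of A
    precision = sim.max(dim=0).values.mean()       # best match for each patch of B
    return (2 * precision * recall / (precision + recall)).item()
```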
+
+
+
+
+ + ☆ Semi-supervised Instance Segmentation with a Learned Shape Prior + + +
+ To date, most instance segmentation approaches are based on supervised +learning that requires a considerable amount of annotated object contours as +training ground truth. Here, we propose a framework that searches for the +target object based on a shape prior. The shape prior model is learned with a +variational autoencoder that requires only a very limited amount of training +data: In our experiments, a few dozen object shape patches from the target +dataset, as well as purely synthetic shapes, were sufficient to achieve results +on par with supervised methods with full access to training data on two out of +three cell segmentation datasets. Our method with a synthetic shape prior was +superior to pre-trained supervised models with access to limited +domain-specific training data on all three datasets. Since the learning of +prior models requires shape patches, whether from real or synthetic data, we call +this framework semi-supervised learning. + +
+
+
+
+
+ + ☆ SortedAP: Rethinking evaluation metrics for instance segmentation + + +
+ Designing metrics for evaluating instance segmentation revolves around +comprehensively considering object detection and segmentation accuracy. +However, other important properties, such as sensitivity, continuity, and +equality, are overlooked in the current study. In this paper, we reveal that +most existing metrics have a limited resolution of segmentation quality. They +are only conditionally sensitive to the change of masks or false predictions. +For certain metrics, the score can change drastically in a narrow range which +could provide a misleading indication of the quality gap between results. +Therefore, we propose a new metric called sortedAP, which strictly decreases +with both object- and pixel-level imperfections and has an uninterrupted +penalization scale over the entire domain. We provide the evaluation toolkit +and experiment code at https://www.github.com/looooongChen/sortedAP. + +
+
+
+
+
+ + ☆ AnyPose: Anytime 3D Human Pose Forecasting via Neural Ordinary + Differential Equations + + +
+ Anytime 3D human pose forecasting is crucial to synchronous real-world +human-machine interaction, where the term ``anytime" corresponds to predicting +human pose at any real-valued time step. However, to the best of our knowledge, +all the existing methods in human pose forecasting perform predictions at +preset, discrete time intervals. Therefore, we introduce AnyPose, a lightweight +continuous-time neural architecture that models human behavior dynamics with +neural ordinary differential equations. We validate our framework on the +Human3.6M, AMASS, and 3DPW dataset and conduct a series of comprehensive +analyses towards comparison with existing methods and the intersection of human +pose and neural ordinary differential equations. Our results demonstrate that +AnyPose exhibits high-performance accuracy in predicting future poses and takes +significantly lower computational time than traditional methods in solving +anytime prediction tasks. + +
+
+
+
+
+ + ♻ ☆ Unifying Synergies between Self-supervised Learning and Dynamic + Computation BMVC 2023 + + +
+ Computationally expensive training strategies make self-supervised learning +(SSL) impractical for resource-constrained industrial settings. Techniques like +knowledge distillation (KD), dynamic computation (DC), and pruning are often +used to obtain a lightweight model, which usually involves multiple epochs of +fine-tuning (or distilling steps) of a large pre-trained model, making it more +computationally challenging. In this work, we present a novel perspective on the +interplay between SSL and DC paradigms. In particular, we show that it is +feasible to simultaneously learn a dense and gated sub-network from scratch in +an SSL setting without any additional fine-tuning or pruning steps. The +co-evolution during pre-training of both the dense and gated encoders offers a good +accuracy-efficiency trade-off and therefore yields a generic and multi-purpose +architecture for application-specific industrial settings. Extensive +experiments on several image classification benchmarks including CIFAR-10/100, +STL-10 and ImageNet-100, demonstrate that the proposed training strategy +provides a dense and corresponding gated sub-network that achieves on-par +performance compared with the vanilla self-supervised setting, but at a +significant reduction in computation in terms of FLOPs, under a range of target +budgets (td). + +
+
+ comment: Accepted in BMVC 2023 +
+
+
+
+
+
+
+
+ + Information Retrieval 7 + +
+
+
+ + ☆ RecAD: Towards A Unified Library for Recommender Attack and Defense + + +
+ In recent years, recommender systems have become a ubiquitous part of our +daily lives, while they suffer from a high risk of being attacked due to the +growing commercial and social values. Despite significant research progress in +recommender attack and defense, there is a lack of a widely-recognized +benchmarking standard in the field, leading to unfair performance comparison +and limited credibility of experiments. To address this, we propose RecAD, a +unified library aiming at establishing an open benchmark for recommender attack +and defense. RecAD takes an initial step to set up a unified benchmarking +pipeline for reproducible research by integrating diverse datasets, standard +source codes, hyper-parameter settings, running logs, attack knowledge, attack +budget, and evaluation results. The benchmark is designed to be comprehensive +and sustainable, covering both attack, defense, and evaluation tasks, enabling +more researchers to easily follow and contribute to this promising field. RecAD +will drive more solid and reproducible research on recommender systems attack +and defense, reduce the redundant efforts of researchers, and ultimately +increase the credibility and practical value of recommender attack and defense. +The project is released at https://github.com/gusye1234/recad. + +
+
+
+
+
+ + ☆ Exploring Music Genre Classification: Algorithm Analysis and Deployment + Architecture + + +
+ Music genre classification has become increasingly critical with the advent +of various streaming applications. Nowadays, we find it impossible to imagine +using the artist's name and song title to search for music in a sophisticated +music app. It is always difficult to classify music correctly because the +information linked to music, such as region, artist, album, or non-album, is so +variable. This paper presents a study on music genre classification using a +combination of Digital Signal Processing (DSP) and Deep Learning (DL) +techniques. A novel algorithm is proposed that utilizes both DSP and DL methods +to extract relevant features from audio signals and classify them into various +genres. The algorithm was tested on the GTZAN dataset and achieved high +accuracy. An end-to-end deployment architecture is also proposed for +integration into music-related applications. The performance of the algorithm +is analyzed and future directions for improvement are discussed. The proposed +DSP and DL-based music genre classification algorithm and deployment +architecture demonstrate a promising approach for music genre classification. + +
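The DSP-plus-DL recipe described above usually boils down to extracting spectral features and training a classifier on them. A minimal sketch along those lines, using the GTZAN setting mentioned in the abstract, is shown below; it is an illustration of the general recipe, not the paper's algorithm, and the feature/classifier choices are assumptions.

```python
# Minimal DSP-feature + learned-classifier pipeline for genre classification
# (an illustration of the general recipe, not the proposed algorithm).
import librosa
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.neural_network import MLPClassifier

def mfcc_features(path, n_mfcc=20):
    y, sr = librosa.load(path, duration=30)              # GTZAN clips are ~30 s
    mfcc = librosa.feature.mfcc(y=y, sr=sr, n_mfcc=n_mfcc)
    # summarize each coefficient's trajectory with its mean and std
    return np.concatenate([mfcc.mean(axis=1), mfcc.std(axis=1)])

def train_genre_classifier(files, labels):               # assumed lists of paths/genres
    X = np.stack([mfcc_features(f) for f in files])
    X_tr, X_te, y_tr, y_te = train_test_split(X, labels, test_size=0.2, random_state=0)
    clf = MLPClassifier(hidden_layer_sizes=(128,), max_iter=500).fit(X_tr, y_tr)
    print("held-out accuracy:", clf.score(X_te, y_te))
    return clf
```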
+
+
+
+
+ + ☆ CPMR: Context-Aware Incremental Sequential Recommendation with + Pseudo-Multi-Task Learning CIKM 2023 + + +
+ The motivations of users to make interactions can be divided into static +preference and dynamic interest. To accurately model user representations over +time, recent studies in sequential recommendation utilize information +propagation and evolution to mine from batches of arriving interactions. +However, they ignore the fact that people are easily influenced by the recent +actions of other users in the contextual scenario, and applying evolution +across all historical interactions dilutes the importance of recent ones, thus +failing to model the evolution of dynamic interest accurately. To address this +issue, we propose a Context-Aware Pseudo-Multi-Task Recommender System (CPMR) +to model the evolution in both historical and contextual scenarios by creating +three representations for each user and item under different dynamics: static +embedding, historical temporal states, and contextual temporal states. To +dually improve the performance of temporal states evolution and incremental +recommendation, we design a Pseudo-Multi-Task Learning (PMTL) paradigm by +stacking the incremental single-target recommendations into one multi-target +task for joint optimization. Within the PMTL paradigm, CPMR employs a +shared-bottom network to conduct the evolution of temporal states across +historical and contextual scenarios, as well as the fusion of them at the +user-item level. In addition, CPMR incorporates one real tower for incremental +predictions, and two pseudo towers dedicated to updating the respective +temporal states based on new batches of interactions. Experimental results on +four benchmark recommendation datasets show that CPMR consistently outperforms +state-of-the-art baselines and achieves significant gains on three of them. The +code is available at: https://github.com/DiMarzioBian/CPMR. + +
+
+ comment: Accepted by CIKM 2023 +
+
+
+
+
+ + ☆ A Comprehensive Survey on Deep Learning Techniques in Educational Data + Mining + + +
+ Educational Data Mining (EDM) has emerged as a vital field of research, which +harnesses the power of computational techniques to analyze educational data. +With the increasing complexity and diversity of educational data, Deep Learning +techniques have shown significant advantages in addressing the challenges +associated with analyzing and modeling this data. This survey aims to +systematically review the state-of-the-art in EDM with Deep Learning. We begin +by providing a brief introduction to EDM and Deep Learning, highlighting their +relevance in the context of modern education. Next, we present a detailed +review of Deep Learning techniques applied in four typical educational +scenarios, including knowledge tracing, undesirable student detecting, +performance prediction, and personalized recommendation. Furthermore, a +comprehensive overview of public datasets and processing tools for EDM is +provided. Finally, we point out emerging trends and future directions in this +research area. + +
+
+ comment: 21 pages, 5 figures +
+
+
+
+
+ + ☆ Data Augmentation for Conversational AI + + +
+ Advancements in conversational systems have revolutionized information +access, surpassing the limitations of single queries. However, developing +dialogue systems requires a large amount of training data, which is a challenge +in low-resource domains and languages. Traditional data collection methods like +crowd-sourcing are labor-intensive and time-consuming, making them ineffective +in this context. Data augmentation (DA) is an effective approach to alleviating +the data scarcity problem in conversational systems. This tutorial provides a +comprehensive and up-to-date overview of DA approaches in the context of +conversational systems. It highlights recent advances in conversation +augmentation, open domain and task-oriented conversation generation, and +different paradigms of evaluating these models. We also discuss current +challenges and future directions to help researchers and practitioners further advance this field. + +
+
+
+
+
+ + ☆ Analysis of Disinformation and Fake News Detection Using Fine-Tuned + Large Language Model + + +
+ The paper considers the possibility of fine-tuning the Llama 2 large language +model (LLM) for disinformation analysis and fake news detection. For +fine-tuning, a PEFT/LoRA-based approach was used. In the study, the model was +fine-tuned for the following tasks: analysing a text to reveal +disinformation and propaganda narratives, fact checking, fake news detection, +manipulation analytics, and extracting named entities with their sentiments. The +obtained results show that the fine-tuned Llama 2 model can perform a deep +analysis of texts and reveal complex styles and narratives. Extracted +sentiments for named entities can be used as predictive features in +supervised machine learning models. + +
+
+
+
+
+ + ♻ ☆ EulerNet: Adaptive Feature Interaction Learning via Euler's Formula for + CTR Prediction SIGIR'23 + + +
+ Learning effective high-order feature interactions is very crucial in the CTR +prediction task. However, it is very time-consuming to calculate high-order +feature interactions with massive features in online e-commerce platforms. Most +existing methods manually design a maximal order and further filter out the +useless interactions from them. Although they reduce the high computational +costs caused by the exponential growth of high-order feature combinations, they +still suffer from the degradation of model capability due to the suboptimal +learning of the restricted feature orders. The solution to maintain the model +capability and meanwhile keep it efficient is a technical challenge, which has +not been adequately addressed. To address this issue, we propose an adaptive +feature interaction learning model, named as EulerNet, in which the feature +interactions are learned in a complex vector space by conducting space mapping +according to Euler's formula. EulerNet converts the exponential powers of +feature interactions into simple linear combinations of the modulus and phase +of the complex features, making it possible to adaptively learn the high-order +feature interactions in an efficient way. Furthermore, EulerNet incorporates +the implicit and explicit feature interactions into a unified architecture, +which achieves the mutual enhancement and largely boosts the model +capabilities. Such a network can be fully learned from data, with no need of +pre-designed form or order for feature interactions. Extensive experiments +conducted on three public datasets have demonstrated the effectiveness and +efficiency of our approach. Our code is available at: +https://github.com/RUCAIBox/EulerNet. + +
+
+ comment: 10 pages, 7 figures, accepted for publication in SIGIR'23 +
+
+
+
+
+
+
+
+ + Multimedia 3 + +
+
+
+ + ☆ Towards Better Multi-modal Keyphrase Generation via Visual Entity + Enhancement and Multi-granularity Image Noise Filtering + + +
+ Multi-modal keyphrase generation aims to produce a set of keyphrases that +represent the core points of the input text-image pair. In this regard, +dominant methods mainly focus on multi-modal fusion for keyphrase generation. +Nevertheless, there are still two main drawbacks: 1) only a limited number of +sources, such as image captions, can be utilized to provide auxiliary +information. However, they may not be sufficient for the subsequent keyphrase +generation. 2) the input text and image are often not perfectly matched, and +thus the image may introduce noise into the model. To address these +limitations, in this paper, we propose a novel multi-modal keyphrase generation +model, which not only enriches the model input with external knowledge, but +also effectively filters image noise. First, we introduce external visual +entities of the image as the supplementary input to the model, which benefits +the cross-modal semantic alignment for keyphrase generation. Second, we +simultaneously calculate an image-text matching score and image region-text +correlation scores to perform multi-granularity image noise filtering. +Particularly, we introduce the correlation scores between image regions and +ground-truth keyphrases to refine the calculation of the previously-mentioned +correlation scores. To demonstrate the effectiveness of our model, we conduct +several groups of experiments on the benchmark dataset. + Experimental results and in-depth analyses show that our model achieves the +state-of-the-art performance. Our code is available on +https://github.com/DeepLearnXMU/MM-MKP. + +
+
+ comment: Accepted In Proceedings of the 31st ACM International Conference on + Multimedia (MM' 23) +
+
+
+
+
+ + ♻ ☆ AudioLDM 2: Learning Holistic Audio Generation with Self-supervised + Pretraining + + +
+ Although audio generation shares commonalities across different types of +audio, such as speech, music, and sound effects, designing models for each type +requires careful consideration of specific objectives and biases that can +significantly differ from those of other types. To bring us closer to a unified +perspective of audio generation, this paper proposes a framework that utilizes +the same learning method for speech, music, and sound effect generation. Our +framework introduces a general representation of audio, called "language of +audio" (LOA). Any audio can be translated into LOA based on AudioMAE, a +self-supervised pre-trained representation learning model. In the generation +process, we translate any modalities into LOA by using a GPT-2 model, and we +perform self-supervised audio generation learning with a latent diffusion model +conditioned on LOA. The proposed framework naturally brings advantages such as +in-context learning abilities and reusable self-supervised pretrained AudioMAE +and latent diffusion models. Experiments on the major benchmarks of +text-to-audio, text-to-music, and text-to-speech demonstrate state-of-the-art +or competitive performance against previous approaches. Our code, pretrained +model, and demo are available at https://audioldm.github.io/audioldm2. + +
+
+ comment: AudioLDM 2 project page is https://audioldm.github.io/audioldm2 +
+
+
+
+
+ + ♻ ☆ AudioLDM: Text-to-Audio Generation with Latent Diffusion Models ICML 2023 + + +
+ Text-to-audio (TTA) systems have recently gained attention for their ability to +synthesize general audio based on text descriptions. However, previous studies +in TTA suffer from limited generation quality and high computational costs. In this +study, we propose AudioLDM, a TTA system that is built on a latent space to +learn the continuous audio representations from contrastive language-audio +pretraining (CLAP) latents. The pretrained CLAP models enable us to train LDMs +with audio embedding while providing text embedding as a condition during +sampling. By learning the latent representations of audio signals and their +compositions without modeling the cross-modal relationship, AudioLDM is +advantageous in both generation quality and computational efficiency. Trained +on AudioCaps with a single GPU, AudioLDM achieves state-of-the-art TTA +performance measured by both objective and subjective metrics (e.g., Fréchet +distance). Moreover, AudioLDM is the first TTA system that enables various +text-guided audio manipulations (e.g., style transfer) in a zero-shot fashion. +Our implementation and demos are available at https://audioldm.github.io. + +
+
+ comment: Accepted by ICML 2023. Demo and implementation at + https://audioldm.github.io. Evaluation toolbox at + https://github.com/haoheliu/audioldm_eval +
+
+
+
+
+
+
+
+
+ +
+
+
+ + Computation and Language 40 + +
+
+
+ + ☆ Measuring and Improving Chain-of-Thought Reasoning in Vision-Language + Models + + +
+ Vision-language models (VLMs) have recently demonstrated strong efficacy as +visual assistants that can parse natural queries about the visual content and +generate human-like outputs. In this work, we explore the ability of these +models to demonstrate human-like reasoning based on the perceived information. +To address a crucial concern regarding the extent to which their reasoning +capabilities are fully consistent and grounded, we also measure the reasoning +consistency of these models. We achieve this by proposing a chain-of-thought +(CoT) based consistency measure. However, such an evaluation requires a +benchmark that encompasses both high-level inference and detailed reasoning +chains, which is costly. We tackle this challenge by proposing a +LLM-Human-in-the-Loop pipeline, which notably reduces cost while simultaneously +ensuring the generation of a high-quality dataset. Based on this pipeline and +the existing coarse-grained annotated dataset, we build the CURE benchmark to +measure both the zero-shot reasoning performance and consistency of VLMs. We +evaluate existing state-of-the-art VLMs, and find that even the best-performing +model is unable to demonstrate strong visual reasoning capabilities and +consistency, indicating that substantial efforts are required to enable VLMs to +perform visual reasoning as systematically and consistently as humans. As an +early step, we propose a two-stage training framework aimed at improving both +the reasoning performance and consistency of VLMs. The first stage involves +employing supervised fine-tuning of VLMs using step-by-step reasoning samples +automatically generated by LLMs. In the second stage, we further augment the +training process by incorporating feedback provided by LLMs to produce +reasoning chains that are highly consistent and grounded. We empirically +highlight the effectiveness of our framework in both reasoning performance and +consistency. + +
+
+ comment: The data is released at + \url{https://github.com/Yangyi-Chen/CoTConsistency} +
+
+
+
+
+ + ☆ CSPRD: A Financial Policy Retrieval Dataset for Chinese Stock Market + + +
+ In recent years, great advances in pre-trained language models (PLMs) have +sparked considerable research focus and achieved promising performance on the +approach of dense passage retrieval, which aims at retrieving relevant passages +from a massive corpus for given questions. However, most existing datasets +mainly benchmark the models with factoid queries of general commonsense, while +specialised fields such as finance and economics remain unexplored due to the +deficiency of large-scale and high-quality datasets with expert annotations. In +this work, we propose a new task, policy retrieval, by introducing the Chinese +Stock Policy Retrieval Dataset (CSPRD), which provides 700+ prospectus passages +labeled by experienced experts with relevant articles from 10k+ entries in our +collected Chinese policy corpus. Experiments on lexical, embedding and +fine-tuned bi-encoder models show the effectiveness of our proposed CSPRD yet +also suggest ample potential for improvement. Our best-performing baseline +achieves 56.1% MRR@10, 28.5% NDCG@10, 37.5% Recall@10 and 80.6% Precision@10 on +the dev set. + +
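For reference, the cutoff metrics reported above can be computed along the following lines; this is a generic sketch of rank-based evaluation, not the CSPRD evaluation script.

```python
# Generic rank-based retrieval metrics at a cutoff (illustrative only).
def mrr_at_k(ranked_ids, relevant_ids, k=10):
    for rank, doc_id in enumerate(ranked_ids[:k], start=1):
        if doc_id in relevant_ids:
            return 1.0 / rank
    return 0.0

def recall_at_k(ranked_ids, relevant_ids, k=10):
    hits = sum(1 for doc_id in ranked_ids[:k] if doc_id in relevant_ids)
    return hits / max(len(relevant_ids), 1)

def evaluate(runs, k=10):      # runs: list of (ranked_ids, relevant_ids), one per query
    n = len(runs)
    return {
        f"MRR@{k}": sum(mrr_at_k(r, g, k) for r, g in runs) / n,
        f"Recall@{k}": sum(recall_at_k(r, g, k) for r, g in runs) / n,
    }
```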
+
+
+
+
+ + ☆ MoEController: Instruction-based Arbitrary Image Manipulation with + Mixture-of-Expert Controllers + + +
+ Diffusion-model-based text-guided image generation has recently made +astounding progress, producing fascinating results in open-domain image +manipulation tasks. Few models, however, currently have complete zero-shot +capabilities for both global and local image editing due to the complexity and +diversity of image manipulation tasks. In this work, we propose a method with +mixture-of-expert (MOE) controllers to align the text-guided capacity of +diffusion models with different kinds of human instructions, enabling our model +to handle various open-domain image manipulation tasks with natural language +instructions. First, we use large language models (ChatGPT) and conditional +image synthesis models (ControlNet) to generate a large-scale global image +transfer dataset in addition to the instruction-based local image editing +dataset. Then, using an MOE technique and task-specific adaptation training on +a large-scale dataset, our conditional diffusion model can edit images globally +and locally. Extensive experiments demonstrate that our approach performs +surprisingly well on various image manipulation tasks when dealing with +open-domain images and arbitrary human instructions. Please refer to our +project page: [https://oppo-mente-lab.github.io/moe_controller/] + +
+
+ comment: 5 pages,6 figures +
+
+
+
+
+ + ☆ Beyond Static Datasets: A Deep Interaction Approach to LLM Evaluation + + +
+ Large Language Models (LLMs) have made progress in various real-world tasks, +which raises the need for evaluating LLMs. Existing LLM +evaluation methods are mainly based on supervised signals, which depend on static +datasets and cannot evaluate the ability of LLMs in dynamic real-world +scenarios where deep interaction widely exists. Other LLM evaluation methods +are human-based, which is costly and time-consuming and incapable of +large-scale evaluation of LLMs. To address the issues above, we propose a novel +Deep Interaction-based LLM-evaluation framework. In our proposed framework, +LLMs' performances in real-world domains can be evaluated from their deep +interaction with other LLMs in elaborately designed evaluation tasks. +Furthermore, our proposed framework is a general evaluation method that can be +applied to a host of real-world tasks such as machine translation and code +generation. We demonstrate the effectiveness of our proposed method through +extensive experiments on four elaborately designed evaluation tasks. + +
+
+
+
+
+ + ☆ Encoding Multi-Domain Scientific Papers by Ensembling Multiple CLS + Tokens + + +
+ Many useful tasks on scientific documents, such as topic classification and +citation prediction, involve corpora that span multiple scientific domains. +Typically, such tasks are accomplished by representing the text with a vector +embedding obtained from a Transformer's single CLS token. In this paper, we +argue that using multiple CLS tokens could make a Transformer better specialize +to multiple scientific domains. We present Multi2SPE: it encourages each of +multiple CLS tokens to learn diverse ways of aggregating token embeddings, then +sums them up together to create a single vector representation. We also propose +our new multi-domain benchmark, Multi-SciDocs, to test scientific paper vector +encoders under multi-domain settings. We show that Multi2SPE reduces error by +up to 25 percent in multi-domain citation prediction, while requiring only a +negligible amount of computation in addition to one BERT forward pass. + +
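The aggregation step described above, in which several CLS-style tokens have their final states summed into one paper vector, can be sketched as a small pooling module. This is an approximation from the abstract, not the Multi2SPE code, and it assumes the extra CLS tokens occupy the first input positions.

```python
# Sketch: pool several [CLS]-style summary tokens into one document embedding.
import torch
import torch.nn as nn

class MultiCLSPooler(nn.Module):
    def __init__(self, num_cls: int = 4):
        super().__init__()
        self.num_cls = num_cls

    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
        # hidden_states: (batch, seq_len, dim); the first `num_cls` positions are
        # assumed to hold the extra CLS tokens inserted at input time.
        return hidden_states[:, : self.num_cls, :].sum(dim=1)

# usage: prepend `num_cls` special tokens to each input, run the Transformer once,
# then pool: embedding = MultiCLSPooler(4)(encoder_outputs.last_hidden_state)
```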
+
+
+
+
+ + ☆ Fuzzy Fingerprinting Transformer Language-Models for Emotion Recognition + in Conversations + + +
+ Fuzzy Fingerprints have been successfully used as an interpretable text +classification technique, but, like most other techniques, have been largely +surpassed in performance by Large Pre-trained Language Models, such as BERT or +RoBERTa. These models deliver state-of-the-art results in several Natural +Language Processing tasks, namely Emotion Recognition in Conversations (ERC), +but suffer from the lack of interpretability and explainability. In this paper, +we propose to combine the two approaches to perform ERC, as a means to obtain +simpler and more interpretable Large Language Models-based classifiers. We +propose to feed the utterances and their previous conversational turns to a +pre-trained RoBERTa, obtaining contextual embedding utterance representations, +that are then supplied to an adapted Fuzzy Fingerprint classification module. +We validate our approach on the widely used DailyDialog ERC benchmark dataset, +in which we obtain state-of-the-art level results using a much lighter model. + +
+
+ comment: FUZZ-IEEE 2023 +
+
+
+
+
+ + ☆ From Sparse to Dense: GPT-4 Summarization with Chain of Density + Prompting + + +
+ Selecting the ``right'' amount of information to include in a summary is a +difficult task. A good summary should be detailed and entity-centric without +being overly dense and hard to follow. To better understand this tradeoff, we +solicit increasingly dense GPT-4 summaries with what we refer to as a ``Chain +of Density'' (CoD) prompt. Specifically, GPT-4 generates an initial +entity-sparse summary before iteratively incorporating missing salient entities +without increasing the length. Summaries generated by CoD are more abstractive, +exhibit more fusion, and have less of a lead bias than GPT-4 summaries +generated by a vanilla prompt. We conduct a human preference study on 100 CNN +DailyMail articles and find that humans prefer GPT-4 summaries that are +more dense than those generated by a vanilla prompt and almost as dense as +human-written summaries. Qualitative analysis supports the notion that there +exists a tradeoff between informativeness and readability. 500 annotated CoD +summaries, as well as an extra 5,000 unannotated summaries, are freely +available on HuggingFace +(https://huggingface.co/datasets/griffin/chain_of_density). + +
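The iterative densification loop can be approximated as below; the prompt wording is a paraphrase of the idea and `ask_gpt4` is an assumed chat-completion wrapper, not the authors' released prompt.

```python
# Sketch of a "Chain of Density"-style loop (illustrative prompts only).
def chain_of_density(article, ask_gpt4, steps=5):
    summaries = []
    summary = ask_gpt4(
        f"Write a short, entity-sparse summary (about 80 words) of:\n\n{article}")
    summaries.append(summary)
    for _ in range(steps - 1):
        summary = ask_gpt4(
            "Identify 1-3 informative entities from the article that are missing "
            "from the previous summary, then rewrite the summary to include them "
            "WITHOUT increasing its length.\n\n"
            f"Article:\n{article}\n\nPrevious summary:\n{summary}")
        summaries.append(summary)
    return summaries   # increasingly dense summaries of roughly constant length
```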
+
+ comment: preprint +
+
+
+
+
+ + ☆ UQ at #SMM4H 2023: ALEX for Public Health Analysis with Social Media + + +
+ As social media becomes increasingly popular, more and more activities +related to public health emerge. Current techniques for public health analysis +involve popular models such as BERT and large language models (LLMs). However, +the cost of training in-domain LLMs for public health is especially +high. Furthermore, such in-domain datasets from social media are +generally imbalanced. To tackle these challenges, the data imbalance issue can +be overcome by data augmentation and balanced training. Moreover, the ability +of the LLMs can be effectively utilized by prompting the model properly. In +this paper, a novel ALEX framework is proposed to improve the performance of +public health analysis on social media by adopting an LLM explanation +mechanism. Results show that our ALEX model achieved the best performance among all +submissions in both Task 2 and Task 4 with a high score in Task 1 in Social +Media Mining for Health 2023 (SMM4H)[1]. Our code has been released at +https://github.com/YanJiangJerry/ALEX. + +
+
+
+
+
+ + ☆ The CALLA Dataset: Probing LLMs' Interactive Knowledge Acquisition from + Chinese Medical Literature + + +
+ The application of Large Language Models (LLMs) to the medical domain has +stimulated the interest of researchers. Recent studies have focused on +constructing Instruction Fine-Tuning (IFT) data through medical knowledge +graphs to enrich the interactive medical knowledge of LLMs. However, the +medical literature, a rich source of medical knowledge, remains +unexplored. Our work introduces the CALLA dataset to probe LLMs' interactive +knowledge acquisition from Chinese medical literature. It assesses the +proficiency of LLMs in mastering medical knowledge through a free-dialogue +fact-checking task. We identify a phenomenon called the ``fact-following +response'', where LLMs tend to affirm facts mentioned in questions and display +a reluctance to challenge them. To eliminate the inaccurate evaluation caused +by this phenomenon, for the golden fact, we artificially construct test data +from two perspectives: one consistent with the fact and one inconsistent with +the fact. Drawing from the probing experiment on the CALLA dataset, we conclude +that IFT data highly correlated with the medical literature corpus serves as a +potent catalyst for LLMs, enabling them to skillfully employ the medical +knowledge acquired during the pre-training phase within interactive scenarios, +enhancing accuracy. Furthermore, we design a framework for automatically +constructing IFT data based on medical literature and discuss some real-world +applications. + +
+
+
+
+
+ + ☆ Knowledge-tuning Large Language Models with Structured Medical Knowledge + Bases for Reliable Response Generation in Chinese + + +
+ Large Language Models (LLMs) have demonstrated remarkable success in diverse +natural language processing (NLP) tasks in general domains. However, LLMs +sometimes generate responses that hallucinate medical facts due to +limited domain knowledge. Such shortcomings pose potential risks in the +utilization of LLMs within medical contexts. To address this challenge, we +propose knowledge-tuning, which leverages structured medical knowledge bases +for the LLMs to grasp domain knowledge efficiently and facilitate reliable +response generation. We also release cMedKnowQA, a Chinese medical knowledge +question-answering dataset constructed from medical knowledge bases to assess +the medical knowledge proficiency of LLMs. Experimental results show that +LLMs that are knowledge-tuned with cMedKnowQA can exhibit higher levels of +accuracy in response generation compared with vanilla instruction-tuning and +offer a new reliable way for the domain adaptation of LLMs. + +
+
+ comment: 11 pages, 5 figures +
+
+
+
+
+ + ☆ Manifold-based Verbalizer Space Re-embedding for Tuning-free + Prompt-based Classification + + +
+ Prompt-based classification adapts tasks to a cloze question format utilizing +the [MASK] token and the filled tokens are then mapped to labels through +pre-defined verbalizers. Recent studies have explored the use of verbalizer +embeddings to reduce labor in this process. However, all existing studies +require a tuning process for either the pre-trained models or additional +trainable embeddings. Meanwhile, the distance between high-dimensional +verbalizer embeddings should not be measured by Euclidean distance due to the +potential for non-linear manifolds in the representation space. In this study, +we propose a tuning-free manifold-based space re-embedding method called +Locally Linear Embedding with Intra-class Neighborhood Constraint (LLE-INC) for +verbalizer embeddings, which preserves local properties within the same class +as guidance for classification. Experimental results indicate that even without +tuning any parameters, our LLE-INC is on par with automated verbalizers with +parameter tuning. And with the parameter updating, our approach further +enhances prompt-based tuning by up to 3.2%. Furthermore, experiments with the +LLaMA-7B&13B indicate that LLE-INC is an efficient tuning-free classification +approach for the hyper-scale language models. + +
+
+ comment: 11 pages, 3 figures +
+
+
+
+
+ + ☆ GLS-CSC: A Simple but Effective Strategy to Mitigate Chinese STM Models' + Over-Reliance on Superficial Clue + + +
+ Pre-trained models have achieved success in Chinese Short Text Matching (STM) +tasks, but they often rely on superficial clues, leading to a lack of robust +predictions. To address this issue, it is crucial to analyze and mitigate the +influence of superficial clues on STM models. Our study aims to investigate +their over-reliance on the edit distance feature, commonly used to measure the +semantic similarity of Chinese text pairs, which can be considered a +superficial clue. To mitigate STM models' over-reliance on superficial clues, +we propose a novel resampling training strategy called Gradually Learn Samples +Containing Superficial Clue (GLS-CSC). Through comprehensive evaluations of +In-Domain (I.D.), Robustness (Rob.), and Out-Of-Domain (O.O.D.) test sets, we +demonstrate that GLS-CSC outperforms existing methods in terms of enhancing the +robustness and generalization of Chinese STM models. Moreover, we conduct a +detailed analysis of existing methods and reveal their commonality. + +
+
+
+
+
+ + ☆ Cross-Utterance Conditioned VAE for Speech Generation + + +
+ Speech synthesis systems powered by neural networks hold promise for +multimedia production, but frequently face issues with producing expressive +speech and seamless editing. In response, we present the Cross-Utterance +Conditioned Variational Autoencoder speech synthesis (CUC-VAE S2) framework to +enhance prosody and ensure natural speech generation. This framework leverages +the powerful representational capabilities of pre-trained language models and +the re-expression abilities of variational autoencoders (VAEs). The core +component of the CUC-VAE S2 framework is the cross-utterance CVAE, which +extracts acoustic, speaker, and textual features from surrounding sentences to +generate context-sensitive prosodic features, more accurately emulating human +prosody generation. We further propose two practical algorithms tailored for +distinct speech synthesis applications: CUC-VAE TTS for text-to-speech and +CUC-VAE SE for speech editing. The CUC-VAE TTS is a direct application of the +framework, designed to generate audio with contextual prosody derived from +surrounding texts. On the other hand, the CUC-VAE SE algorithm leverages real +mel spectrogram sampling conditioned on contextual information, producing audio +that closely mirrors real sound and thereby facilitating flexible speech +editing based on text such as deletion, insertion, and replacement. +Experimental results on the LibriTTS datasets demonstrate that our proposed +models significantly enhance speech synthesis and editing, producing more +natural and expressive speech. + +
+
+ comment: 13 pages; +
+
+
+
+
+ + ☆ NESTLE: a No-Code Tool for Statistical Analysis of Legal Corpus + + +
+ The statistical analysis of a large-scale legal corpus can provide valuable +legal insights. For such analysis one needs to (1) select a subset of the +corpus using document retrieval tools, (2) structure the text using information +extraction (IE) systems, and (3) visualize the data for the statistical +analysis. Each process demands either specialized tools or programming skills, +whereas no comprehensive unified "no-code" tool has been available. +Especially for IE, if the target information is not predefined in the ontology +of the IE system, one needs to build their own system. Here we provide NESTLE, +a no-code tool for large-scale statistical analysis of legal corpora. With +NESTLE, users can search target documents, extract information, and visualize +the structured data, all via the chat interface with an accompanying auxiliary GUI +for fine-level control. NESTLE consists of three main components: a search +engine, an end-to-end IE system, and a Large Language Model (LLM) that glues +all the components together and provides the chat interface. Powered by the LLM +and the end-to-end IE system, NESTLE can extract any type of information that +has not been predefined in the IE system, opening up the possibility of +unlimited customizable statistical analysis of the corpus without writing a +single line of code. The use of the custom end-to-end IE system also enables +faster and low-cost IE on large-scale corpora. We validate our system on 15 +Korean precedent IE tasks and 3 legal text classification tasks from LEXGLUE. +The comprehensive experiments reveal that NESTLE can achieve GPT-4-comparable +performance by training the internal IE module with 4 human-labeled and 192 +LLM-labeled examples. The detailed analysis provides insight into the +trade-off between accuracy, time, and cost in building such a system. + +
+
+
+
+
+ + ☆ RST-style Discourse Parsing Guided by Document-level Content Structures + + +
+ Rhetorical Structure Theory based Discourse Parsing (RST-DP) explores how +clauses, sentences, and large text spans compose a whole discourse and presents +the rhetorical structure as a hierarchical tree. Existing RST parsing pipelines +construct rhetorical structures without the knowledge of document-level content +structures, which causes relatively low performance when predicting the +discourse relations for large text spans. Recognizing the value of high-level +content-related information in facilitating discourse relation recognition, we +propose a novel pipeline for RST-DP that incorporates structure-aware news +content sentence representations derived from the task of News Discourse +Profiling. By incorporating only a few additional layers, this enhanced +pipeline exhibits promising performance across various RST parsing metrics. + +
+
+
+
+
+ + ☆ Meta predictive learning model of natural languages + + +
+ Large language models based on self-attention mechanisms have achieved +astonishing performance not only on natural language itself, but also on a +variety of tasks of a different nature. However, when processing language, +the human brain may not operate on the same principle, which has opened a debate +on the connection between brain computation and the artificial +self-supervision adopted in large language models. One of the most influential +hypotheses in brain computation is the predictive coding framework, which +proposes to minimize the prediction error through local learning. However, the role +of predictive coding and the associated credit assignment in language +processing remains unknown. Here, we propose a mean-field learning model within +the predictive coding framework, assuming that the synaptic weight of each +connection follows a spike-and-slab distribution, and only the distribution is +trained. This meta predictive learning is successfully validated on classifying +handwritten digits, where pixels are input to the network in sequence, and on +toy and real language corpora. Our model reveals that most of the +connections become deterministic after learning, while the output connections +retain a higher level of variability. The performance of the resulting network +ensemble changes continuously with data load, further improving with more +training data, in analogy with the emergent behavior of large language models. +Therefore, our model provides a starting point for investigating the physical and +biological correspondences of language processing and the unexpected emergence of +general intelligence. + +
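To make the "train only the distribution" idea concrete, here is a minimal sketch of a spike-and-slab weight layer in PyTorch, where each weight is w = s * m with s ~ Bernoulli(pi) and m ~ Normal(mu, sigma^2), and only (pi, mu, sigma) are learnable. The layer sizes, the local reparameterization trick, and all names are illustrative assumptions, not the paper's implementation.

```python
import torch
import torch.nn as nn

class SpikeSlabLinear(nn.Module):
    """Linear layer whose weights follow a spike-and-slab distribution:
    w = s * m, with s ~ Bernoulli(pi) (spike) and m ~ Normal(mu, sigma^2) (slab).
    Only the distribution parameters (pi, mu, sigma) are trained, not the weights."""
    def __init__(self, in_features, out_features):
        super().__init__()
        self.logit_pi = nn.Parameter(torch.zeros(out_features, in_features))
        self.mu = nn.Parameter(0.01 * torch.randn(out_features, in_features))
        self.log_sigma = nn.Parameter(torch.full((out_features, in_features), -3.0))

    def forward(self, x):
        pi = torch.sigmoid(self.logit_pi)
        sigma = self.log_sigma.exp()
        # Mean-field moments of w = s * m
        w_mean = pi * self.mu
        w_var = pi * (sigma ** 2 + self.mu ** 2) - w_mean ** 2
        # Local reparameterization: sample the pre-activation instead of the weights
        act_mean = x @ w_mean.t()
        act_std = torch.sqrt(torch.clamp(x ** 2 @ w_var.t(), min=1e-8))
        return act_mean + act_std * torch.randn_like(act_mean)

layer = SpikeSlabLinear(784, 128)
out = layer(torch.randn(32, 784))   # (32, 128)
```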
+
+ comment: 23 pages, 6 figures, codes are available in the main text with the + link +
+
+
+
+
+ + ☆ Unsupervised Multi-document Summarization with Holistic Inference AACL 2023 + + +
+ Multi-document summarization aims to obtain core information from a +collection of documents written on the same topic. This paper proposes a new +holistic framework for unsupervised multi-document extractive summarization. +Our method combines a holistic beam search inference procedure with a holistic +measurement named the Subset Representative Index (SRI). SRI +balances the importance and diversity of a subset of sentences from the source +documents and can be calculated in unsupervised and adaptive manners. To +demonstrate the effectiveness of our method, we conduct extensive experiments +on both small and large-scale multi-document summarization datasets under both +unsupervised and adaptive settings. The proposed method outperforms strong +baselines by a significant margin, as indicated by the resulting ROUGE scores +and diversity measures. Our findings also suggest that diversity is essential +for improving multi-document summarization performance. + +
+
+ comment: Findings of IJCNLP-AACL 2023 +
+
+
+
+
+ + ☆ Can NLP Models 'Identify', 'Distinguish', and 'Justify' Questions that + Don't have a Definitive Answer? ACL 2023 + + +
+ Though state-of-the-art (SOTA) NLP systems have achieved remarkable +performance on a variety of language understanding tasks, they primarily focus +on questions that have a correct and definitive answer. However, in +real-world applications, users often ask questions that don't have a definitive +answer. Incorrectly answering such questions certainly hampers a system's +reliability and trustworthiness. Can SOTA models accurately identify such +questions and provide a reasonable response? + To investigate the above question, we introduce QnotA, a dataset consisting +of five different categories of questions that don't have definitive answers. +Furthermore, for each QnotA instance, we also provide a corresponding QA +instance, i.e. an alternate question that "can be" answered. With this data, +we formulate three evaluation tasks that test a system's ability to 'identify', +'distinguish', and 'justify' QnotA questions. Through comprehensive +experiments, we show that even SOTA models including GPT-3 and Flan T5 do not +fare well on these tasks and lag considerably behind the human performance +baseline. We conduct a thorough analysis which further leads to several +interesting findings. Overall, we believe our work and findings will encourage +and facilitate further research in this important area and help develop more +robust models. + +
+
+ comment: TrustNLP Workshop at ACL 2023 +
+
+
+
+
+ + ☆ Linking Symptom Inventories using Semantic Textual Similarity + + +
+ An extensive library of symptom inventories has been developed over time to +measure clinical symptoms, but this variety has led to several long-standing +issues. Most notably, results drawn from different settings and studies are not +comparable, which limits reproducibility. Here, we present an artificial +intelligence (AI) approach using semantic textual similarity (STS) to link +symptoms and scores across previously incongruous symptom inventories. We +tested the ability of four pre-trained STS models to screen thousands of +symptom description pairs for related content - a challenging task typically +requiring expert panels. Models were tasked to predict symptom severity across +four different inventories for 6,607 participants drawn from 16 international +data sources. The STS approach achieved 74.8% accuracy across five tasks, +outperforming other models tested. This work suggests that incorporating +contextual, semantic information can assist expert decision-making processes, +yielding gains for both general and disease-specific clinical assessment. + +
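As a rough illustration of the linking step, the sketch below scores cross-inventory item pairs with a pre-trained sentence-embedding STS model and pairs each item with its nearest match. The model name, the toy inventories, and the nearest-neighbor linking rule are assumptions for demonstration; the paper evaluates four specific pre-trained STS models.

```python
from sentence_transformers import SentenceTransformer, util

# Illustrative model choice; the paper's four evaluated STS models may differ.
model = SentenceTransformer("all-MiniLM-L6-v2")

inventory_a = ["Difficulty falling asleep", "Feeling nervous or on edge"]
inventory_b = ["Trouble sleeping at night", "Persistent worry", "Loss of appetite"]

emb_a = model.encode(inventory_a, convert_to_tensor=True)
emb_b = model.encode(inventory_b, convert_to_tensor=True)

# Cosine similarity between every cross-inventory pair of item descriptions
scores = util.cos_sim(emb_a, emb_b)   # shape: (2, 3)

# Link each item in inventory A to its most similar item in inventory B
for i, item in enumerate(inventory_a):
    j = scores[i].argmax().item()
    print(f"{item!r} <-> {inventory_b[j]!r} (similarity={scores[i][j]:.2f})")
```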
+
+
+
+
+ + ☆ When Less is More: Investigating Data Pruning for Pretraining LLMs at + Scale + + +
+ Large volumes of text data have contributed significantly to the development +of large language models (LLMs) in recent years. This data is typically +acquired by scraping the internet, leading to pretraining datasets comprised of +noisy web text. To date, efforts to prune these datasets down to a higher +quality subset have relied on hand-crafted heuristics encoded as rule-based +filters. In this work, we take a wider view and explore scalable estimates of +data quality that can be used to systematically measure the quality of +pretraining data. We perform a rigorous comparison at scale of the simple data +quality estimator of perplexity, as well as more sophisticated and +computationally intensive estimates of the Error L2-Norm and memorization. +These metrics are used to rank and prune pretraining corpora, and we +subsequently compare LLMs trained on these pruned datasets. Surprisingly, we +find that the simple technique of perplexity outperforms our more +computationally expensive scoring methods. We improve over our no-pruning +baseline while training on as little as 30% of the original training dataset. +Our work sets the foundation for unexplored strategies in automatically +curating high quality corpora and suggests the majority of pretraining data can +be removed while retaining performance. + +
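A minimal sketch of perplexity-based ranking and pruning is given below, using a small causal LM as the scoring model. The scoring model, the truncation length, and the policy of keeping the lowest-perplexity documents are illustrative assumptions; the abstract only states that perplexity is used to rank and prune, and that training on as little as 30% of the data can beat the no-pruning baseline.

```python
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

# Illustrative reference model for scoring; the paper's choice may differ.
name = "gpt2"
tok = AutoTokenizer.from_pretrained(name)
model = AutoModelForCausalLM.from_pretrained(name).eval()

@torch.no_grad()
def perplexity(text: str) -> float:
    ids = tok(text, return_tensors="pt", truncation=True, max_length=1024).input_ids
    loss = model(ids, labels=ids).loss   # mean token-level cross-entropy
    return float(torch.exp(loss))

docs = [
    "A clean, well-formed paragraph of English text about a historical event.",
    "buy now!!! cl1ck h3re $$$ free free free",
    "The committee approved the proposal after a brief discussion.",
]

# Rank documents by perplexity; keeping the lowest-perplexity fraction is only
# one possible policy, shown here for illustration.
ranked = sorted(docs, key=perplexity)
keep_fraction = 0.3   # e.g. train on ~30% of the corpus, as explored in the paper
kept = ranked[: max(1, int(len(ranked) * keep_fraction))]
print(kept)
```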
+
+ comment: 14 pages, 8 figures +
+
+
+
+
+ + ☆ Three Ways to Improve Verbo-visual Fusion for Dense 3D Visual Grounding ICCV 2023 + + +
+ 3D visual grounding is the task of localizing the object in a 3D scene that +is referred to by a description in natural language. With a wide range of +applications ranging from autonomous indoor robotics to AR/VR, the task has +recently risen in popularity. A common formulation to tackle 3D visual +grounding is grounding-by-detection, where localization is done via bounding +boxes. However, for real-life applications that require physical interactions, +a bounding box insufficiently describes the geometry of an object. We therefore +tackle the problem of dense 3D visual grounding, i.e. referral-based 3D +instance segmentation. We propose a dense 3D grounding network ConcreteNet, +featuring three novel stand-alone modules which aim to improve grounding +performance for challenging repetitive instances, i.e. instances with +distractors of the same semantic class. First, we introduce a bottom-up +attentive fusion module that aims to disambiguate inter-instance relational +cues, next we construct a contrastive training scheme to induce separation in +the latent space, and finally we resolve view-dependent utterances via a +learned global camera token. ConcreteNet ranks 1st on the challenging ScanRefer +online benchmark by a considerable +9.43% accuracy at 50% IoU and has won the +ICCV 3rd Workshop on Language for 3D Scenes "3D Object Localization" challenge. + +
+
+ comment: Winner of the ICCV 2023 ScanRefer Challenge. This work has been + submitted to the IEEE for possible publication. Copyright may be transferred + without notice, after which this version may no longer be accessible +
+
+
+
+
+ + ☆ Retrieving Evidence from EHRs with LLMs: Possibilities and Challenges + + +
+ Unstructured Electronic Health Record (EHR) data often contains critical +information complementary to imaging data that would inform radiologists' +diagnoses. However, time constraints and the large volume of notes frequently +associated with individual patients render manual perusal of such data to +identify relevant evidence infeasible in practice. Modern Large Language Models +(LLMs) provide a flexible means of interacting with unstructured EHR data, and +may provide a mechanism to efficiently retrieve and summarize unstructured +evidence relevant to a given query. In this work, we propose and evaluate an +LLM (Flan-T5 XXL) for this purpose. Specifically, in a zero-shot setting we +task the LLM to infer whether a patient has or is at risk of a particular +condition; if so, we prompt the model to summarize the supporting evidence. +Enlisting radiologists for manual evaluation, we find that this LLM-based +approach provides outputs consistently preferred to a standard information +retrieval baseline, but we also highlight the key outstanding challenge: LLMs +are prone to hallucinating evidence. However, we provide results indicating +that model confidence in outputs might indicate when LLMs are hallucinating, +potentially providing a means to address this. + +
+
+
+
+
+ + ♻ ☆ Entity Tracking in Language Models ACL 2023 + + +
+ Keeping track of how states of entities change as a text or dialog unfolds is +a key prerequisite to discourse understanding. Yet, there have been few +systematic investigations into the ability of large language models (LLMs) to +track discourse entities. In this work, we present a task probing to what +extent a language model can infer the final state of an entity given an English +description of the initial state and a series of state-changing operations. We +use this task to first investigate whether Flan-T5, GPT-3 and GPT-3.5 can track +the state of entities, and find that only GPT-3.5 models, which have been +pretrained on large amounts of code, exhibit this ability. We then investigate +whether smaller models pretrained primarily on text can learn to track +entities, through finetuning T5 on several training/evaluation splits. While +performance degrades for more complex splits, we find that even when evaluated +on a different set of entities from training or longer operation sequences, a +finetuned model can perform non-trivial entity tracking. Taken together, these +results suggest that language models can learn to track entities but +pretraining on text corpora alone does not make this capacity surface. + +
+
+ comment: ACL 2023 Camera-ready +
+
+
+
+
+ + ♻ ☆ A Conditional Generative Chatbot using Transformer Model + + +
+ A Chatbot serves as a communication tool between a human user and a machine +to achieve an appropriate answer based on the human input. In more recent +approaches, a combination of Natural Language Processing and sequential models +are used to build a generative Chatbot. The main challenge of these models is +their sequential nature, which leads to less accurate results. To tackle this +challenge, in this paper, a novel architecture is proposed using conditional +Wasserstein Generative Adversarial Networks and a transformer model for answer +generation in Chatbots. While the generator of the proposed model consists of a +full transformer model to generate an answer, the discriminator includes only +the encoder part of a transformer model followed by a classifier. To the best +of our knowledge, this is the first time that a generative Chatbot is proposed +using the embedded transformer in both generator and discriminator models. +Relying on the parallel computing of the transformer model, the results of the +proposed model on the Cornell Movie-Dialog corpus and the Chit-Chat datasets +confirm the superiority of the proposed model compared to state-of-the-art +alternatives using different evaluation metrics. + +
+
+
+
+
+ + ♻ ☆ Large Content And Behavior Models To Understand, Simulate, And Optimize + Content And Behavior + + +
+ Shannon, in his seminal paper introducing information theory, divided +communication into three levels: technical, semantic, and effectiveness. While +the technical level is concerned with accurate reconstruction of transmitted +symbols, the semantic and effectiveness levels deal with the inferred meaning +and its effect on the receiver. Thanks to telecommunications, the first-level +problem has produced great advances like the internet. Large Language Models +(LLMs) make some progress towards the second goal, but the third level still +remains largely untouched. The third problem deals with predicting and +optimizing communication for desired receiver behavior. LLMs, despite showing +generalization capabilities across a wide range of tasks, are unable to +solve for this. One reason for the underperformance could be a lack of +"behavior tokens" in LLMs' training corpora. Behavior tokens define receiver +behavior over a communication, such as shares, likes, clicks, purchases, +retweets, etc. While preprocessing data for LLM training, behavior tokens are +often removed from the corpora as noise. Therefore, in this paper, we make some +initial progress towards reintroducing behavior tokens in LLM training. The +trained models, other than showing similar performance to LLMs on content +understanding tasks, show generalization capabilities on behavior simulation, +content simulation, behavior understanding, and behavior domain adaptation. +Using a wide range of tasks on two corpora, we show results on all these +capabilities. We call these models Large Content and Behavior Models (LCBMs). +Further, to spur more research on LCBMs, we release our new Content Behavior +Corpus (CBC), a repository containing communicator, message, and corresponding +receiver behavior. + +
+
+
+
+
+ + ♻ ☆ On Large Language Models' Selection Bias in Multi-Choice Questions + + +
+ Multi-choice questions (MCQs) serve as a common yet important task format in +the research of large language models (LLMs). Our work shows that LLMs exhibit +an inherent "selection bias" in MCQs, which refers to LLMs' preference for +selecting options located at specific positions (like "Option C"). This bias is +prevalent across various LLMs, making their performance vulnerable to option +position changes in MCQs. We identify option numbering, i.e., the ID symbols +A/B/C/D associated with the options, as one primary cause of selection bias. To +mitigate selection bias, we propose a new method called PriDe. PriDe first +decomposes the observed model prediction distribution into an intrinsic +prediction over option contents and a prior distribution over option IDs. It +then estimates the prior by permuting option contents on a small number of test +samples, which is used to debias the subsequent test samples. We demonstrate +that, as a label-free, inference-time method, PriDe achieves more effective and +computation-efficient debiasing than strong baselines. We further show that the +priors estimated by PriDe generalize well across different domains, +highlighting its practical potential in broader scenarios. + +
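The sketch below illustrates the debiasing idea described in the abstract: estimate a prior over option IDs by permuting option contents on a few samples, then divide that prior out of the observed prediction. It is one plausible reading of the abstract, not the official PriDe implementation; `prob_fn`, the cyclic-permutation scheme, and the dummy model are all assumptions.

```python
import numpy as np

def cyclic_permutations(n):
    return [tuple((i + k) % n for i in range(n)) for k in range(n)]

def estimate_id_prior(prob_fn, questions, options_list):
    """Average the probability mass the model assigns to each option ID (A/B/C/D)
    over cyclic permutations of the option contents, so content effects cancel."""
    n = len(options_list[0])
    prior, count = np.zeros(n), 0
    for q, opts in zip(questions, options_list):
        for perm in cyclic_permutations(n):
            prior += prob_fn(q, [opts[i] for i in perm])  # P(ID | question, permuted contents)
            count += 1
    return prior / count

def debias(prob_fn, question, options, id_prior):
    """Divide out the ID prior to recover a prediction over option contents."""
    observed = prob_fn(question, options)
    debiased = observed / (id_prior + 1e-12)
    return debiased / debiased.sum()

# prob_fn(question, options) is assumed to return the LLM's probability for each
# option ID; a dummy stands in here for an actual LLM call.
dummy = lambda q, opts: np.array([0.1, 0.2, 0.5, 0.2])
prior = estimate_id_prior(dummy, ["Which is a fruit?"], [["dog", "car", "apple", "rock"]])
print(debias(dummy, "Which is a fruit?", ["dog", "car", "apple", "rock"], prior))
```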
+
+ comment: Work in progress. 21 pages, 13 figures +
+
+
+
+
+ + ♻ ☆ ValiTex -- a unified validation framework for computational text-based + measures of social science constructs + + +
+ Guidance on how to validate computational text-based measures of social +science constructs is fragmented. Although scholars generally acknowledge the +importance of validating their text-based measures, they often lack common +terminology and a unified framework to do so. This paper introduces ValiTex, a +new validation framework designed to assist scholars in validly measuring +social science constructs based on textual data. The framework draws on +long-established validity concepts in psychometrics and extends them +to cover the specific needs of computational text analysis. ValiTex consists of +two components, a conceptual framework and a dynamic checklist. Whereas the +conceptual framework provides a general structure across distinct phases of the +validation process, the dynamic checklist defines specific validation steps +and provides guidance on which steps are considered recommendable (i.e., +providing relevant and necessary validation evidence) or optional (i.e., useful +for providing additional supporting validation evidence). We demonstrate the +utility of the framework by applying it to a use case of detecting sexism in +social media data. + +
+
+
+
+
+ + ♻ ☆ Simple LLM Prompting is State-of-the-Art for Robust and Multilingual + Dialogue Evaluation + + +
+ Despite significant research effort in the development of automatic dialogue +evaluation metrics, little thought is given to evaluating dialogues other than +in English. At the same time, ensuring metrics are invariant to semantically +similar responses is also an overlooked topic. In order to achieve the desired +properties of robustness and multilinguality for dialogue evaluation metrics, +we propose a novel framework that takes advantage of the strengths of current +evaluation models with the newly-established paradigm of prompting Large +Language Models (LLMs). Empirical results show our framework achieves state of +the art results in terms of mean Spearman correlation scores across several +benchmarks and ranks first place on both the Robust and Multilingual tasks of +the DSTC11 Track 4 "Automatic Evaluation Metrics for Open-Domain Dialogue +Systems", proving the evaluation capabilities of prompted LLMs. + +
+
+ comment: DSTC11 best paper for Track 4 +
+
+
+
+
+ + ♻ ☆ Text-to-SQL Empowered by Large Language Models: A Benchmark Evaluation + + +
+ Large language models (LLMs) have emerged as a new paradigm for the +Text-to-SQL task. However, the absence of a systematic benchmark inhibits the +development of effective, efficient, and economical LLM-based Text-to-SQL +solutions. To address this challenge, in this paper, we first conduct a +systematic and extensive comparison of existing prompt engineering methods, +including question representation, example selection, and example organization, +and with these experimental results, we elaborate on their pros and cons. Based +on these findings, we propose a new integrated solution, named DAIL-SQL, which +refreshes the Spider leaderboard with 86.6% execution accuracy and sets a new +bar. To explore the potential of open-source LLMs, we investigate them in +various scenarios, and further enhance their performance with supervised +fine-tuning. Our explorations highlight open-source LLMs' potential in +Text-to-SQL, as well as the advantages and disadvantages of supervised +fine-tuning. Additionally, towards an efficient and economical LLM-based +Text-to-SQL solution, we emphasize token efficiency in prompt engineering and +compare prior studies under this metric. We hope that our work provides a +deeper understanding of Text-to-SQL with LLMs, and inspires further +investigations and broad applications. + +
+
+ comment: We have released code on https://github.com/BeachWang/DAIL-SQL +
+
+
+
+
+ + ♻ ☆ TikTalk: A Video-Based Dialogue Dataset for Multi-Modal Chitchat in Real + World + + +
+ To facilitate the research on intelligent and human-like chatbots with +multi-modal context, we introduce a new video-based multi-modal dialogue +dataset, called TikTalk. We collect 38K videos from a popular video-sharing +platform, along with 367K conversations posted by users beneath them. Users +engage in spontaneous conversations based on their multi-modal experiences from +watching videos, which helps recreate real-world chitchat context. Compared to +previous multi-modal dialogue datasets, the richer context types in TikTalk +lead to more diverse conversations, but also increase the difficulty in +capturing human interests from intricate multi-modal information to generate +personalized responses. Moreover, external knowledge is more frequently evoked +in our dataset. These facts reveal new challenges for multi-modal dialogue +models. We quantitatively demonstrate the characteristics of TikTalk, propose a +video-based multi-modal chitchat task, and evaluate several dialogue baselines. +Experimental results indicate that the models incorporating large language +models (LLM) can generate more diverse responses, while the model utilizing +knowledge graphs to introduce external knowledge performs the best overall. +Furthermore, no existing model can solve all the above challenges well. There +is still a large room for future improvements, even for LLM with visual +extensions. Our dataset is available at +\url{https://ruc-aimind.github.io/projects/TikTalk/}. + +
+
+ comment: Accepted to ACM Multimedia 2023 +
+
+
+
+
+ + ♻ ☆ Detecting Text Formality: A Study of Text Classification Approaches + + +
+ Formality is one of the important characteristics of text documents. The +automatic detection of the formality level of a text is potentially beneficial +for various natural language processing tasks. Previously, two large-scale +datasets featuring formality annotation were introduced for multiple languages +-- GYAFC and X-FORMAL. However, they were primarily used for the training of +style transfer models. At the same time, the detection of text formality on its +own may also be a useful application. This work presents what is, to our +knowledge, the first systematic study of formality detection methods based on +statistical, neural-based, and Transformer-based machine learning methods and +delivers the best-performing models for public usage. We conducted three types +of experiments -- monolingual, multilingual, and cross-lingual. The study shows +that a character-level BiLSTM model outperforms Transformer-based models on the +monolingual and multilingual formality classification tasks, while +Transformer-based classifiers are more robust in cross-lingual knowledge +transfer. + +
+
+ comment: Published at RANLP2023 +
+
+
+
+
+ + ♻ ☆ Less is More: A Lightweight and Robust Neural Architecture for Discourse + Parsing + + +
+ Complex feature extractors are widely employed for text representation +building. However, these complex feature extractors make the NLP systems prone +to overfitting especially when the downstream training datasets are relatively +small, which is the case for several discourse parsing tasks. Thus, we propose +an alternative lightweight neural architecture that removes multiple complex +feature extractors and only utilizes learnable self-attention modules to +indirectly exploit pretrained neural language models, in order to maximally +preserve the generalizability of pre-trained language models. Experiments on +three common discourse parsing tasks show that powered by recent pretrained +language models, the lightweight architecture consisting of only two +self-attention layers obtains much better generalizability and robustness. +Meanwhile, it achieves comparable or even better system performance with fewer +learnable parameters and less processing time. + +
+
+
+
+
+ + ♻ ☆ From Quantity to Quality: Boosting LLM Performance with Self-Guided Data + Selection for Instruction Tuning + + +
+ In the realm of Large Language Models, the balance between instruction data +quality and quantity has become a focal point. Recognizing this, we introduce a +self-guided methodology for LLMs to autonomously discern and select cherry +samples from vast open-source datasets, effectively minimizing manual curation +and potential cost for instruction tuning an LLM. Our key innovation, the +Instruction-Following Difficulty (IFD) metric, emerges as a pivotal tool to +identify discrepancies between a model's expected responses and its autonomous +generation prowess. Through the adept application of IFD, cherry samples are +pinpointed, leading to a marked uptick in model training efficiency. Empirical +validations on renowned datasets like Alpaca and WizardLM underpin our +findings; with a mere 10% of conventional data input, our strategy showcases +improved results. This synthesis of self-guided cherry-picking and the IFD +metric signifies a transformative leap in the optimization of LLMs, promising +both efficiency and resource-conscious advancements. + +
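One plausible reading of the IFD metric, consistent with the abstract's description of comparing a model's expected responses with its autonomous generation ability, is the ratio between the answer loss conditioned on the instruction and the answer loss without it. The sketch below is an assumption-laden illustration of that reading, not the paper's released code; the scoring model and prompt format are stand-ins.

```python
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

name = "gpt2"   # illustrative stand-in for the LLM being scored
tok = AutoTokenizer.from_pretrained(name)
model = AutoModelForCausalLM.from_pretrained(name).eval()

@torch.no_grad()
def answer_loss(prompt: str, answer: str) -> float:
    """Mean cross-entropy of the answer tokens, optionally conditioned on a prompt."""
    prompt_ids = tok(prompt, return_tensors="pt").input_ids if prompt else None
    answer_ids = tok(answer, return_tensors="pt").input_ids
    ids = answer_ids if prompt_ids is None else torch.cat([prompt_ids, answer_ids], dim=1)
    labels = ids.clone()
    if prompt_ids is not None:
        labels[:, : prompt_ids.shape[1]] = -100   # score only the answer tokens
    return float(model(ids, labels=labels).loss)

def ifd(instruction: str, answer: str) -> float:
    # Higher IFD: the instruction helps little, so the sample is "harder to follow".
    return answer_loss(instruction, answer) / answer_loss("", answer)

print(ifd("Translate to French: Good morning.", " Bonjour."))
```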
+
+
+
+
+ + ♻ ☆ Exploring an LM to generate Prolog Predicates from Mathematics Questions + + +
+ Recently, there has been a surge in interest in NLP driven by ChatGPT. +ChatGPT, a transformer-based generative language model of substantial scale, +exhibits versatility in performing various tasks based on natural language. +Nevertheless, large language models often exhibit poor performance in solving +mathematics questions that require reasoning. Prior research has demonstrated +the effectiveness of chain-of-thought prompting in enhancing reasoning +capabilities. Now, we aim to investigate whether fine-tuning a model to +generate Prolog code, a logic programming language, and subsequently passing +this code to a compiler can further improve accuracy. Consequently, we employ +chain-of-thought to fine-tune LLaMA7B as a baseline model and develop other +fine-tuned LLaMA7B models for the generation of Prolog code, Prolog code + +chain-of-thought, and chain-of-thought + Prolog code, respectively. The results +reveal that the Prolog generation model surpasses the baseline in performance, +while the combination generation models do not yield significant improvements. +The Prolog corpus based on GSM8K and the correspondingly fine-tuned Prolog +generation model based on LLaMA7B are released to the research community. + +
+
+ comment: 6 pages, 3 figures +
+
+
+
+
+ + ♻ ☆ When Do Program-of-Thoughts Work for Reasoning? + + +
+ The reasoning capabilities of Large Language Models (LLMs) play a pivotal +role in the realm of embodied artificial intelligence. Although there are +effective methods, like program-of-thought prompting for LLMs, which use +programming languages to tackle complex reasoning tasks, the specific impact of +code data on the improvement of reasoning capabilities remains under-explored. +To address this gap, we propose the complexity-impacted reasoning score (CIRS), +which combines structural and logical attributes, to measure the correlation +between code and reasoning abilities. Specifically, we use the abstract syntax +tree to encode the structural information and calculate logical complexity by +considering the difficulty and the cyclomatic complexity. Through an empirical +analysis, we find that not all code data of every complexity can be learned or +understood by LLMs. An optimal level of complexity is critical to the +improvement of reasoning abilities by program-aided prompting. We then design an +auto-synthesizing and stratifying algorithm, and apply it to instruction +generation for mathematical reasoning and code data filtering for code +generation tasks. Extensive results demonstrate the effectiveness of our +proposed approach. Code will be integrated into the EasyInstruct framework at +https://github.com/zjunlp/EasyInstruct. + +
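As a rough illustration of measuring structural and logical attributes of code, the sketch below uses Python's `ast` module to count nodes and tree depth and approximates cyclomatic complexity by counting branching constructs. The scoring formula and its weighting are illustrative assumptions and are not the paper's CIRS definition.

```python
import ast

def structural_stats(code: str):
    """Node count and tree depth of the abstract syntax tree."""
    tree = ast.parse(code)
    def depth(node, d=1):
        children = list(ast.iter_child_nodes(node))
        return d if not children else max(depth(c, d + 1) for c in children)
    return sum(1 for _ in ast.walk(tree)), depth(tree)

def cyclomatic_complexity(code: str) -> int:
    """McCabe-style approximation: 1 + number of branching constructs."""
    branch_types = (ast.If, ast.For, ast.While, ast.Try,
                    ast.BoolOp, ast.IfExp, ast.comprehension)
    return 1 + sum(isinstance(n, branch_types) for n in ast.walk(ast.parse(code)))

def reasoning_score(code: str, alpha: float = 0.5) -> float:
    """Toy combination of structural and logical attributes (weighting is illustrative)."""
    n_nodes, tree_depth = structural_stats(code)
    return alpha * (n_nodes + tree_depth) + (1 - alpha) * cyclomatic_complexity(code)

sample = (
    "def f(xs):\n"
    "    total = 0\n"
    "    for x in xs:\n"
    "        if x > 0:\n"
    "            total += x\n"
    "    return total\n"
)
print(reasoning_score(sample))
```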
+
+ comment: Work in progress +
+
+
+
+
+ + ♻ ☆ All Labels Together: Low-shot Intent Detection with an Efficient Label + Semantic Encoding Paradigm AACL 2023 + + +
+ In intent detection tasks, leveraging meaningful semantic information from +intent labels can be particularly beneficial for few-shot scenarios. However, +existing few-shot intent detection methods either ignore the intent labels +(e.g. treating intents as indices) or do not fully utilize this information +(e.g. only using part of the intent labels). In this work, we present an +end-to-end One-to-All system that enables the comparison of an input utterance +with all label candidates, allowing label semantics to be fully utilized. +Experiments on three few-shot intent detection tasks demonstrate that +One-to-All is especially effective when the training resource is extremely +scarce, achieving state-of-the-art performance in 1-, 3- and 5-shot settings. +Moreover, we present a novel pretraining strategy for our model that utilizes +indirect supervision from paraphrasing, enabling zero-shot cross-domain +generalization on intent detection tasks. Our code is at +https://github.com/jiangshdd/AllLablesTogether. + +
+
+ comment: Accepted by IJCNLP-AACL 2023 +
+
+
+
+
+ + ♻ ☆ Annotation Imputation to Individualize Predictions: Initial Studies on + Distribution Dynamics and Model Predictions + + +
+ Annotating data via crowdsourcing is time-consuming and expensive. Due to +these costs, dataset creators often have each annotator label only a small +subset of the data. This leads to sparse datasets with examples that are marked +by few annotators. The downside of this process is that if an annotator doesn't +get to label a particular example, their perspective on it is missed. This is +especially concerning for subjective NLP datasets where there is no single +correct label: people may have different valid opinions. Thus, we propose using +imputation methods to generate the opinions of all annotators for all examples, +creating a dataset that does not leave out any annotator's view. We then train +and prompt models, using data from the imputed dataset, to make predictions +about the distribution of responses and individual annotations. + In our analysis of the results, we found that the choice of imputation method +significantly impacts soft label changes and distribution. While the imputation +introduces noise in the prediction of the original dataset, it has shown +potential in enhancing shots for prompts, particularly for low-response-rate +annotators. We have made all of our code and data publicly available. + +
+
+ comment: NLPerspectives 2023 Conference, 39 pages, 13 figures, 13 tables +
+
+
+
+
+ + ♻ ☆ What Are People Asking About COVID-19? A Question Classification Dataset ACL 2020 + + +
+ We present COVID-Q, a set of 1,690 questions about COVID-19 from 13 sources, +which we annotate into 15 question categories and 207 question clusters. The +most common questions in our dataset asked about transmission, prevention, and +societal effects of COVID, and we found that many questions that appeared in +multiple sources were not answered by any FAQ websites of reputable +organizations such as the CDC and FDA. We post our dataset publicly at +https://github.com/JerryWeiAI/COVID-Q. For classifying questions into 15 +categories, a BERT baseline scored 58.1% accuracy when trained on 20 examples +per category, and for a question clustering task, a BERT + triplet loss +baseline achieved 49.5% accuracy. We hope COVID-Q can help either for direct +use in developing applied systems or as a domain-specific resource for model +evaluation. + +
+
+ comment: Published in Proceedings of the 1st Workshop on NLP for COVID-19 at + ACL 2020 +
+
+
+
+
+ + ♻ ☆ Language as a Latent Sequence: deep latent variable models for + semi-supervised paraphrase generation + + +
+ This paper explores deep latent variable models for semi-supervised +paraphrase generation, where the missing target pair for unlabelled data is +modelled as a latent paraphrase sequence. We present a novel unsupervised model +named variational sequence auto-encoding reconstruction (VSAR), which performs +latent sequence inference given an observed text. To leverage information from +text pairs, we additionally introduce a novel supervised model we call dual +directional learning (DDL), which is designed to integrate with our proposed +VSAR model. Combining VSAR with DDL (DDL+VSAR) enables us to conduct +semi-supervised learning. Still, the combined model suffers from a cold-start +problem. To further combat this issue, we propose an improved weight +initialisation solution, leading to a novel two-stage training scheme we call +knowledge-reinforced-learning (KRL). Our empirical evaluations suggest that the +combined model yields competitive performance against the state-of-the-art +supervised baselines on complete data. Furthermore, in scenarios where only a +fraction of the labelled pairs are available, our combined model consistently +outperforms the strong supervised model baseline (DDL) by a significant margin +(p <.05; Wilcoxon test). Our code is publicly available at +"https://github.com/jialin-yu/latent-sequence-paraphrase". + +
+
+
+
+
+ + ♻ ☆ Right to be Forgotten in the Era of Large Language Models: Implications, + Challenges, and Solutions + + +
+ The Right to be Forgotten (RTBF) was first established as the result of the +ruling of Google Spain SL, Google Inc. v AEPD, Mario Costeja González, and +was later included as the Right to Erasure under the General Data Protection +Regulation (GDPR) of the European Union, allowing individuals the right to +request that personal data be deleted by organizations. Specifically for search +engines, individuals can send requests to organizations to exclude their +information from query results. It was a significant emergent right resulting +from the evolution of technology. With the recent development of Large Language +Models (LLMs) and their use in chatbots, LLM-enabled software systems have +become popular, but they are not excluded from the RTBF. Compared with the +indexing approach used by search engines, LLMs store and process information +in a completely different way. This poses new challenges for compliance with +the RTBF. In this paper, we explore these challenges and provide our insights +on how to implement technical solutions for the RTBF, including the use of +differential privacy, machine unlearning, model editing, and prompt +engineering. With the rapid advancement of AI and the increasing need to +regulate this powerful technology, learning from the case of the RTBF can provide +valuable lessons for technical practitioners, legal experts, organizations, and +authorities. + +
+
+ comment: The new version made the following changes: 1. added an "on-going + discussion" section and relevant references 2. added a stream of solutions + (privacy-preserving machine learning) to technical solutions section 3. made + minor changes on descriptions of certain technical terms 4. added references + to some recent law proposals and court rulings +
+
+
+
+
+
+
+
+ + Computer Vision and Pattern Recognition 66 + +
+
+
+ + ☆ Generalized Cross-domain Multi-label Few-shot Learning for Chest X-rays + + +
+ Real-world application of chest X-ray abnormality classification requires +dealing with several challenges: (i) limited training data; (ii) training and +evaluation sets that are derived from different domains; and (iii) classes that +appear during training may have partial overlap with classes of interest during +evaluation. To address these challenges, we present an integrated framework +called Generalized Cross-Domain Multi-Label Few-Shot Learning (GenCDML-FSL). +The framework supports overlap in classes during training and evaluation, +cross-domain transfer, adopts meta-learning to learn using few training +samples, and assumes each chest X-ray image is either normal or associated with +one or more abnormalities. Furthermore, we propose Generalized Episodic +Training (GenET), a training strategy that equips models to operate with +multiple challenges observed in the GenCDML-FSL scenario. Comparisons with +well-established methods such as transfer learning, hybrid transfer learning, +and multi-label meta-learning on multiple datasets show the superiority of our +approach. + +
+
+ comment: 17 pages +
+
+
+
+
+ + ☆ Measuring and Improving Chain-of-Thought Reasoning in Vision-Language + Models + + +
+ Vision-language models (VLMs) have recently demonstrated strong efficacy as +visual assistants that can parse natural queries about the visual content and +generate human-like outputs. In this work, we explore the ability of these +models to demonstrate human-like reasoning based on the perceived information. +To address a crucial concern regarding the extent to which their reasoning +capabilities are fully consistent and grounded, we also measure the reasoning +consistency of these models. We achieve this by proposing a chain-of-thought +(CoT) based consistency measure. However, such an evaluation requires a +benchmark that encompasses both high-level inference and detailed reasoning +chains, which is costly. We tackle this challenge by proposing a +LLM-Human-in-the-Loop pipeline, which notably reduces cost while simultaneously +ensuring the generation of a high-quality dataset. Based on this pipeline and +the existing coarse-grained annotated dataset, we build the CURE benchmark to +measure both the zero-shot reasoning performance and consistency of VLMs. We +evaluate existing state-of-the-art VLMs, and find that even the best-performing +model is unable to demonstrate strong visual reasoning capabilities and +consistency, indicating that substantial efforts are required to enable VLMs to +perform visual reasoning as systematically and consistently as humans. As an +early step, we propose a two-stage training framework aimed at improving both +the reasoning performance and consistency of VLMs. The first stage involves +employing supervised fine-tuning of VLMs using step-by-step reasoning samples +automatically generated by LLMs. In the second stage, we further augment the +training process by incorporating feedback provided by LLMs to produce +reasoning chains that are highly consistent and grounded. We empirically +highlight the effectiveness of our framework in both reasoning performance and +consistency. + +
+
+ comment: The data is released at + \url{https://github.com/Yangyi-Chen/CoTConsistency} +
+
+
+
+
+ + ☆ WiSARD: A Labeled Visual and Thermal Image Dataset for Wilderness Search + and Rescue + + +
+ Sensor-equipped unoccupied aerial vehicles (UAVs) have the potential to help +reduce search times and alleviate safety risks for first responders carrying +out Wilderness Search and Rescue (WiSAR) operations, the process of finding and +rescuing person(s) lost in wilderness areas. Unfortunately, visual sensors +alone do not address the need for robustness across all the possible terrains, +weather, and lighting conditions that WiSAR operations can be conducted in. The +use of multi-modal sensors, specifically visual-thermal cameras, is critical in +enabling WiSAR UAVs to perform in diverse operating conditions. However, due to +the unique challenges posed by the wilderness context, existing dataset +benchmarks are inadequate for developing vision-based algorithms for autonomous +WiSAR UAVs. To this end, we present WiSARD, a dataset with roughly 56,000 +labeled visual and thermal images collected from UAV flights in various +terrains, seasons, weather, and lighting conditions. To the best of our +knowledge, WiSARD is the first large-scale dataset collected with multi-modal +sensors for autonomous WiSAR operations. We envision that our dataset will +provide researchers with a diverse and challenging benchmark that can test the +robustness of their algorithms when applied to real-world (life-saving) +applications. + +
+
+
+
+
+ + ☆ Demographic Disparities in 1-to-Many Facial Identification + + +
+ Most studies to date that have examined demographic variations in face +recognition accuracy have analyzed 1-to-1 matching accuracy, using images that +could be described as "government ID quality". This paper analyzes the accuracy +of 1-to-many facial identification across demographic groups, and in the +presence of blur and reduced resolution in the probe image as might occur in +"surveillance camera quality" images. Cumulative match characteristic (CMC) +curves are not appropriate for comparing propensity for rank-one +recognition errors across demographics, and so we introduce three metrics for +this: (1) d' metric between mated and non-mated score distributions, (2) +absolute score difference between thresholds in the high-similarity tail of the +non-mated and the low-similarity tail of the mated distribution, and (3) +distribution of (mated - non-mated rank one scores) across the set of probe +images. We find that demographic variation in 1-to-many accuracy does not +entirely follow what has been observed in 1-to-1 matching accuracy. Also, +different from 1-to-1 accuracy, demographic comparison of 1-to-many accuracy +can be affected by different numbers of identities and images across +demographics. Finally, we show that increased blur in the probe image, or +reduced resolution of the face in the probe image, can significantly increase +the false positive identification rate. And we show that the demographic +variation in these high blur or low resolution conditions is much larger for +male/female than for African-American/Caucasian. The point that 1-to-many +accuracy can potentially collapse in the context of processing "surveillance +camera quality" probe images against a "government ID quality" gallery is an +important one. + +
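For the first metric, a minimal NumPy sketch of d' between mated and non-mated similarity-score distributions is shown below, using the standard pooled-variance formula. The score values are synthetic and purely illustrative of how the metric would be computed.

```python
import numpy as np

def d_prime(mated_scores, non_mated_scores):
    """d' between mated and non-mated score distributions:
    (mu_mated - mu_nonmated) / sqrt((var_mated + var_nonmated) / 2)."""
    m, n = np.asarray(mated_scores), np.asarray(non_mated_scores)
    return (m.mean() - n.mean()) / np.sqrt((m.var(ddof=1) + n.var(ddof=1)) / 2)

rng = np.random.default_rng(0)
mated = rng.normal(0.72, 0.08, 5000)        # similarity scores for same-identity pairs
non_mated = rng.normal(0.31, 0.10, 50000)   # similarity scores for different-identity pairs
print(f"d' = {d_prime(mated, non_mated):.2f}")
```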
+
+ comment: 9 pages, 8 figures, Conference submission +
+
+
+
+
+ + ☆ Comparative Study of Visual SLAM-Based Mobile Robot Localization Using + Fiducial Markers IROS + + +
+ This paper presents a comparative study of three modes for mobile robot +localization based on visual SLAM using fiducial markers (i.e., square-shaped +artificial landmarks with a black-and-white grid pattern): SLAM, SLAM with a +prior map, and localization with a prior map. We compare SLAM-based approaches +leveraging fiducial markers because previous work has shown their superior +performance over feature-only methods, with less computational burden than +methods that use both feature and marker detection, without compromising the +localization performance. The evaluation is conducted using indoor image +sequences captured with a hand-held camera containing multiple fiducial markers +in the environment. The performance metrics include absolute trajectory error +and runtime for the optimization process per frame. In particular, for the last +two modes (SLAM with a prior map and localization with a prior map), we +evaluate their performance by perturbing the quality of the prior map to study +the extent to which each mode is tolerant to such perturbations. Hardware +experiments show consistent trajectory error levels across the three modes, +with the localization mode exhibiting the shortest runtime among them. Yet, +with map perturbations, SLAM with a prior map maintains performance, while the +localization mode degrades in both aspects. + +
+
+ comment: IEEE 2023 IROS Workshop "Closing the Loop on Localization". For more + information, see https://oravus.github.io/vpr-workshop/index +
+
+
+
+
+ + ☆ Single View Refractive Index Tomography with Neural Fields + + +
+ Refractive Index Tomography is an inverse problem in which we seek to +reconstruct a scene's 3D refractive field from 2D projected image measurements. +The refractive field is not visible itself, but instead affects how the path of +a light ray is continuously curved as it travels through space. Refractive +fields appear across a wide variety of scientific applications, from +translucent cell samples in microscopy to fields of dark matter bending light +from faraway galaxies. This problem poses a unique challenge because the +refractive field directly affects the path that light takes, making its +recovery a non-linear problem. In addition, in contrast with traditional +tomography, we seek to recover the refractive field using a projected image +from only a single viewpoint by leveraging knowledge of light sources scattered +throughout the medium. In this work, we introduce a method that uses a +coordinate-based neural network to model the underlying continuous refractive +field in a scene. We then use explicit modeling of rays' 3D spatial curvature +to optimize the parameters of this network, reconstructing refractive fields +with an analysis-by-synthesis approach. The efficacy of our approach is +demonstrated by recovering refractive fields in simulation, and analyzing how +recovery is affected by the light source distribution. We then test our method +on a simulated dark matter mapping problem, where we recover the refractive +field underlying a realistic simulated dark matter distribution. + +
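The sketch below shows what a coordinate-based network for a continuous refractive field might look like: a small MLP with a sinusoidal positional encoding that maps a 3D point to a refractive index. The architecture sizes, the encoding, and the perturbation-around-vacuum parameterization are assumptions for illustration, not the paper's model.

```python
import torch
import torch.nn as nn

class RefractiveField(nn.Module):
    """Coordinate-based MLP mapping a 3D point to a refractive index value."""
    def __init__(self, hidden=128, n_freqs=6):
        super().__init__()
        self.n_freqs = n_freqs
        in_dim = 3 + 3 * 2 * n_freqs   # raw coords + sin/cos positional encoding
        self.net = nn.Sequential(
            nn.Linear(in_dim, hidden), nn.ReLU(),
            nn.Linear(hidden, hidden), nn.ReLU(),
            nn.Linear(hidden, 1),
        )

    def encode(self, x):
        feats = [x]
        for k in range(self.n_freqs):
            feats += [torch.sin((2 ** k) * x), torch.cos((2 ** k) * x)]
        return torch.cat(feats, dim=-1)

    def forward(self, x):
        # Predict a small perturbation around vacuum (n = 1)
        return 1.0 + self.net(self.encode(x))

field = RefractiveField()
points = torch.rand(1024, 3)   # sample points along (curved) rays
n_values = field(points)       # (1024, 1) refractive indices
# In an analysis-by-synthesis loop, rays would be traced through this field,
# rendered into an image, and the network parameters updated to match the observation.
print(n_values.shape)
```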
+
+
+
+
+ + ☆ Create Your World: Lifelong Text-to-Image Diffusion + + +
+ Text-to-image generative models can produce diverse high-quality images of +concepts with a text prompt, which have demonstrated excellent ability in image +generation, image translation, etc. We in this work study the problem of +synthesizing instantiations of a user's own concepts in a never-ending manner, +i.e., create your world, where new concepts from the user are quickly learned +with a few examples. To achieve this goal, we propose a Lifelong text-to-image +Diffusion Model (L2DM), which intends to overcome knowledge "catastrophic +forgetting" for the past encountered concepts, and semantic "catastrophic +neglecting" for one or more concepts in the text prompt. In respect of +knowledge "catastrophic forgetting", our L2DM framework devises a task-aware +memory enhancement module and an elastic-concept distillation module, which +respectively safeguard the knowledge of prior concepts and of each past +personalized concept. When generating images with a user text prompt, the +solution to semantic "catastrophic neglecting" is that a concept attention +artist module alleviates the semantic neglecting from the concept aspect, and an +orthogonal attention module reduces the semantic binding from the attribute +aspect. In the end, our model can generate more faithful images across a range +of continual text prompts in terms of both qualitative and quantitative +metrics, when compared with related state-of-the-art models. The code will +be released at https://wenqiliang.github.io/. + +
+
+ comment: 15 pages,10 figures +
+
+
+
+
+ + ☆ Video Task Decathlon: Unifying Image and Video Tasks in Autonomous + Driving ICCV 2023 + + +
+ Performing multiple heterogeneous visual tasks in dynamic scenes is a +hallmark of human perception capability. Despite remarkable progress in image +and video recognition via representation learning, current research still +focuses on designing specialized networks for singular, homogeneous, or simple +combination of tasks. We instead explore the construction of a unified model +for major image and video recognition tasks in autonomous driving with diverse +input and output structures. To enable such an investigation, we design a new +challenge, Video Task Decathlon (VTD), which includes ten representative image +and video tasks spanning classification, segmentation, localization, and +association of objects and pixels. On VTD, we develop our unified network, +VTDNet, that uses a single structure and a single set of weights for all ten +tasks. VTDNet groups similar tasks and employs task interaction stages to +exchange information within and between task groups. Given the impracticality +of labeling all tasks on all frames, and the performance degradation associated +with joint training of many tasks, we design a Curriculum training, +Pseudo-labeling, and Fine-tuning (CPF) scheme to successfully train VTDNet on +all tasks and mitigate performance loss. Armed with CPF, VTDNet significantly +outperforms its single-task counterparts on most tasks with only 20% overall +computations. VTD is a promising new direction for exploring the unification of +perception tasks in autonomous driving. + +
+
+ comment: ICCV 2023, project page at https://www.vis.xyz/pub/vtd +
+
+
+
+
+ + ☆ SynthoGestures: A Novel Framework for Synthetic Dynamic Hand Gesture + Generation for Driving Scenarios + + +
+ Creating a diverse and comprehensive dataset of hand gestures for dynamic +human-machine interfaces in the automotive domain can be challenging and +time-consuming. To overcome this challenge, we propose using synthetic gesture +datasets generated by virtual 3D models. Our framework utilizes Unreal Engine +to synthesize realistic hand gestures, offering customization options and +reducing the risk of overfitting. Multiple variants, including gesture speed, +performance, and hand shape, are generated to improve generalizability. In +addition, we simulate different camera locations and types, such as RGB, +infrared, and depth cameras, without incurring additional time and cost to +obtain these cameras. Experimental results demonstrate that our proposed +framework, +SynthoGestures\footnote{\url{https://github.com/amrgomaaelhady/SynthoGestures}}, +improves gesture recognition accuracy and can replace or augment real-hand +datasets. By saving time and effort in the creation of the data set, our tool +accelerates the development of gesture recognition systems for automotive +applications. + +
+
+ comment: Shorter versions are accepted as AutomotiveUI2023 Work in Progress + and UIST2023 Poster Papers +
+
+
+
+
+ + ☆ DeformToon3D: Deformable 3D Toonification from Neural Radiance Fields ICCV 2023 + + +
+ In this paper, we address the challenging problem of 3D toonification, which +involves transferring the style of an artistic domain onto a target 3D face +with stylized geometry and texture. Although fine-tuning a pre-trained 3D GAN +on the artistic domain can produce reasonable performance, this strategy has +limitations in the 3D domain. In particular, fine-tuning can deteriorate the +original GAN latent space, which affects subsequent semantic editing, and +requires independent optimization and storage for each new style, limiting +flexibility and efficient deployment. To overcome these challenges, we propose +DeformToon3D, an effective toonification framework tailored for hierarchical 3D +GAN. Our approach decomposes 3D toonification into subproblems of geometry and +texture stylization to better preserve the original latent space. Specifically, +we devise a novel StyleField that predicts conditional 3D deformation to align +a real-space NeRF to the style space for geometry stylization. Thanks to the +StyleField formulation, which already handles geometry stylization well, +texture stylization can be achieved conveniently via adaptive style mixing that +injects information of the artistic domain into the decoder of the pre-trained +3D GAN. Due to the unique design, our method enables flexible style degree +control and shape-texture-specific style swap. Furthermore, we achieve +efficient training without any real-world 2D-3D training pairs but proxy +samples synthesized from off-the-shelf 2D toonification models. + +
+
+ comment: ICCV 2023. Code: https://github.com/junzhezhang/DeformToon3D Project + page: https://www.mmlab-ntu.com/project/deformtoon3d/ +
+
+
+
+
+ + ☆ MaskDiffusion: Boosting Text-to-Image Consistency with Conditional Mask + + +
+ Recent advancements in diffusion models have showcased their impressive +capacity to generate visually striking images. Nevertheless, ensuring a close +match between the generated image and the given prompt remains a persistent +challenge. In this work, we identify that a crucial factor leading to the +text-image mismatch issue is the inadequate cross-modality relation learning +between the prompt and the output image. To better align the prompt and image +content, we advance the cross-attention with an adaptive mask, which is +conditioned on the attention maps and the prompt embeddings, to dynamically +adjust the contribution of each text token to the image features. This +mechanism explicitly diminishes the ambiguity in semantic information embedding +from the text encoder, leading to a boost of text-to-image consistency in the +synthesized images. Our method, termed MaskDiffusion, is training-free and +hot-pluggable for popular pre-trained diffusion models. When applied to the +latent diffusion models, our MaskDiffusion can significantly improve the +text-to-image consistency with negligible computation overhead compared to the +original diffusion models. + +
+
+
+
+
+ + ☆ Language Prompt for Autonomous Driving + + +
+ A new trend in the computer vision community is to capture objects of +interest following flexible human commands represented by a natural language +prompt. However, the progress of using language prompts in driving scenarios is +stuck in a bottleneck due to the scarcity of paired prompt-instance data. To +address this challenge, we propose the first object-centric language prompt set +for driving scenes within 3D, multi-view, and multi-frame space, named +NuPrompt. It expands the nuScenes dataset by constructing a total of 35,367 +language descriptions, each referring to an average of 5.3 object tracks. Based +on the object-text pairs from the new benchmark, we formulate a new +prompt-based driving task, i.e., employing a language prompt to predict the +described object trajectory across views and frames. Furthermore, we provide a +simple end-to-end baseline model based on Transformer, named PromptTrack. +Experiments show that our PromptTrack achieves impressive performance on +NuPrompt. We hope this work can provide more new insights for the autonomous +driving community. The dataset and code will be made public at +https://github.com/wudongming97/Prompt4Driving. + +
+
+
+
+
+ + ☆ MoEController: Instruction-based Arbitrary Image Manipulation with + Mixture-of-Expert Controllers + + +
+ Diffusion-model-based text-guided image generation has recently made +astounding progress, producing fascinating results in open-domain image +manipulation tasks. Few models, however, currently have complete zero-shot +capabilities for both global and local image editing due to the complexity and +diversity of image manipulation tasks. In this work, we propose a method with +mixture-of-expert (MOE) controllers to align the text-guided capacity of +diffusion models with different kinds of human instructions, enabling our model +to handle various open-domain image manipulation tasks with natural language +instructions. First, we use large language models (ChatGPT) and conditional +image synthesis models (ControlNet) to generate a large-scale global image +transfer dataset in addition to an instruction-based local image editing +dataset. Then, using an MOE technique and task-specific adaptation training on +a large-scale dataset, our conditional diffusion model can edit images globally +and locally. Extensive experiments demonstrate that our approach performs +surprisingly well on various image manipulation tasks when dealing with +open-domain images and arbitrary human instructions. Please refer to our +project page: https://oppo-mente-lab.github.io/moe_controller/ + +
+
+ comment: 5 pages,6 figures +
+
+
+
+
+ + ☆ CNN Injected Transformer for Image Exposure Correction + + +
+ Capturing images with incorrect exposure settings fails to deliver a +satisfactory visual experience. Only when the exposure is properly set, can the +color and details of the images be appropriately preserved. Previous exposure +correction methods based on convolutions often produce exposure deviation in +images as a consequence of the restricted receptive field of convolutional +kernels. This issue arises because convolutions are not capable of capturing +long-range dependencies in images accurately. To overcome this challenge, we +can apply the Transformer to address the exposure correction problem, +leveraging its capability in modeling long-range dependencies to capture global +representation. However, solely relying on the window-based Transformer leads +to visually disturbing blocking artifacts due to the application of +self-attention in small patches. In this paper, we propose a CNN Injected +Transformer (CIT) to harness the individual strengths of CNN and Transformer +simultaneously. Specifically, we construct the CIT by utilizing a window-based +Transformer to exploit the long-range interactions among different regions in +the entire image. Within each CIT block, we incorporate a channel attention +block (CAB) and a half-instance normalization block (HINB) to assist the +window-based self-attention to acquire the global statistics and refine local +features. In addition to the hybrid architecture design for exposure +correction, we apply a set of carefully formulated loss functions to improve +the spatial coherence and rectify potential color deviations. Extensive +experiments demonstrate that our image exposure correction method outperforms +state-of-the-art approaches in terms of both quantitative and qualitative +metrics. + +
+
+
+
+
+ + ☆ SSIG: A Visually-Guided Graph Edit Distance for Floor Plan Similarity ICCV + + +
+ We propose a simple yet effective metric that measures structural similarity +between visual instances of architectural floor plans, without the need for +learning. Qualitatively, our experiments show that the retrieval results are +similar to deeply learned methods. Effectively comparing instances of floor +plan data is paramount to the success of machine understanding of floor plan +data, including the assessment of floor plan generative models and floor plan +recommendation systems. Comparing visual floor plan images goes beyond a sole +pixel-wise visual examination and is crucially about similarities and +differences in the shapes and relations between subdivisions that compose the +layout. Currently, deep metric learning approaches are used to learn a +pair-wise vector representation space that closely mimics the structural +similarity, in which the models are trained on similarity labels that are +obtained by Intersection-over-Union (IoU). To compensate for the lack of +structural awareness in IoU, graph-based approaches such as Graph Matching +Networks (GMNs) are used, which require pairwise inference for comparing data +instances, making GMNs less practical for retrieval applications. In this +paper, an effective evaluation metric for judging the structural similarity of +floor plans, coined SSIG (Structural Similarity by IoU and GED), is proposed +based on both image and graph distances. In addition, an efficient algorithm is +developed that uses SSIG to rank a large-scale floor plan database. Code will +be openly available. + +
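+<p>
+ To make the combination of the two distance families concrete, the sketch below
+mixes a pixel-level IoU between floor plan masks with a normalized graph edit
+distance between room-adjacency graphs. The equal weighting and the GED
+normalization are assumptions for illustration, not the exact SSIG formula.
+</p>
+<pre><code>
+import numpy as np
+import networkx as nx
+
+def mask_iou(mask_a, mask_b):
+    inter = np.logical_and(mask_a, mask_b).sum()
+    union = np.logical_or(mask_a, mask_b).sum()
+    return inter / union if union else 1.0
+
+def structural_similarity(mask_a, mask_b, graph_a, graph_b, alpha=0.5):
+    # graph edit distance between room-adjacency graphs, normalized by graph size
+    ged = nx.graph_edit_distance(graph_a, graph_b)
+    size = max(graph_a.number_of_nodes() + graph_a.number_of_edges(),
+               graph_b.number_of_nodes() + graph_b.number_of_edges(), 1)
+    graph_sim = 1.0 - min(ged / size, 1.0)
+    return alpha * mask_iou(mask_a, mask_b) + (1.0 - alpha) * graph_sim
+</code></pre>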
+
+ comment: To be published in ICCVW 2023, 10 pages +
+
+
+
+
+ + ☆ Mobile V-MoEs: Scaling Down Vision Transformers via Sparse + Mixture-of-Experts + + +
+ Sparse Mixture-of-Experts models (MoEs) have recently gained popularity due +to their ability to decouple model size from inference efficiency by only +activating a small subset of the model parameters for any given input token. As +such, sparse MoEs have enabled unprecedented scalability, resulting in +tremendous successes across domains such as natural language processing and +computer vision. In this work, we instead explore the use of sparse MoEs to +scale-down Vision Transformers (ViTs) to make them more attractive for +resource-constrained vision applications. To this end, we propose a simplified +and mobile-friendly MoE design where entire images rather than individual +patches are routed to the experts. We also propose a stable MoE training +procedure that uses super-class information to guide the router. We empirically +show that our sparse Mobile Vision MoEs (V-MoEs) can achieve a better trade-off +between performance and efficiency than the corresponding dense ViTs. For +example, for the ViT-Tiny model, our Mobile V-MoE outperforms its dense +counterpart by 3.39% on ImageNet-1k. For an even smaller ViT variant with only +54M FLOPs inference cost, our MoE achieves an improvement of 4.66%. + +
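+<p>
+ A rough sketch of the per-image routing described above: the router scores the
+pooled image representation, a single expert MLP processes all tokens of that
+image, and an auxiliary loss asks the router to predict the image's super-class.
+Shapes, module names, and the one-expert-per-super-class simplification are
+assumptions, not the paper's exact design.
+</p>
+<pre><code>
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+
+class ImageLevelMoE(nn.Module):
+    def __init__(self, dim=192, num_experts=8, hidden=384):
+        super().__init__()
+        self.router = nn.Linear(dim, num_experts)   # routes the whole image
+        self.experts = nn.ModuleList(
+            [nn.Sequential(nn.Linear(dim, hidden), nn.GELU(), nn.Linear(hidden, dim))
+             for _ in range(num_experts)]
+        )
+
+    def forward(self, tokens, superclass=None):
+        # tokens: (B, N, D); route on the mean-pooled image representation
+        pooled = tokens.mean(dim=1)                  # (B, D)
+        logits = self.router(pooled)                 # (B, E)
+        expert_idx = logits.argmax(dim=-1)           # one expert per whole image
+        out = torch.stack([self.experts[i.item()](tokens[b])
+                           for b, i in enumerate(expert_idx)])
+        # auxiliary router loss against super-class labels (assumes one expert
+        # per super-class, a simplification of the guidance described above)
+        aux = (F.cross_entropy(logits, superclass)
+               if superclass is not None else logits.sum() * 0.0)
+        return out, aux
+</code></pre>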
+
+
+
+
+ + ☆ Revealing the preference for correcting separated aberrations in joint + optic-image design + + +
+ The joint design of the optical system and the downstream algorithm is a
+challenging and promising task. Due to the demand for balancing the global
+optimum of imaging systems and the computational cost of physical simulation,
+existing methods cannot achieve efficient joint design of complex systems such
+as smartphones and drones. In this work, starting from the perspective of
+optical design, we characterize the optics with separated aberrations.
+Additionally, to bridge the hardware and software without gradients, an image
+simulation system is presented to reproduce the genuine imaging procedure of
+lenses with large fields of view. As for aberration correction, we propose a
+network to perceive and correct the spatially varying aberrations and validate
+its superiority over state-of-the-art methods. Comprehensive experiments reveal
+that the preference for correcting separated aberrations in joint design is as
+follows: longitudinal chromatic aberration, lateral chromatic aberration,
+spherical aberration, field curvature, and coma, with astigmatism coming last.
+Drawing on this preference, a 10% reduction in the total track length of the
+consumer-level mobile phone lens module is accomplished. Moreover, this
+procedure spares more space for manufacturing deviations, realizing
+extreme-quality enhancement of computational photography. The optimization
+paradigm provides innovative insight into the practical joint design of
+sophisticated optical systems and post-processing algorithms.
+
+</p>
+
+
+
+
+ + ☆ Leveraging Model Fusion for Improved License Plate Recognition + + +
+ License Plate Recognition (LPR) plays a critical role in various +applications, such as toll collection, parking management, and traffic law +enforcement. Although LPR has witnessed significant advancements through the +development of deep learning, there has been a noticeable lack of studies +exploring the potential improvements in results by fusing the outputs from +multiple recognition models. This research aims to fill this gap by +investigating the combination of up to 12 different models using +straightforward approaches, such as selecting the most confident prediction or +employing majority vote-based strategies. Our experiments encompass a wide +range of datasets, revealing substantial benefits of fusion approaches in both +intra- and cross-dataset setups. Essentially, fusing multiple models reduces +considerably the likelihood of obtaining subpar performance on a particular +dataset/scenario. We also found that combining models based on their speed is +an appealing approach. Specifically, for applications where the recognition +task can tolerate some additional time, though not excessively, an effective +strategy is to combine 4-6 models. These models may not be the most accurate +individually, but their fusion strikes an optimal balance between accuracy and +speed. + +
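+<p>
+ The two fusion strategies named above are simple enough to sketch directly; the
+snippet below assumes each model returns a (plate_string, confidence) pair and
+that candidate strings share a common length, which is an illustrative
+simplification.
+</p>
+<pre><code>
+from collections import Counter
+
+def fuse_most_confident(predictions):
+    # predictions: list of (plate_string, confidence) from different recognizers
+    return max(predictions, key=lambda p: p[1])[0]
+
+def fuse_majority_vote(predictions):
+    # character-wise majority vote across the candidate strings
+    strings = [p[0] for p in predictions]
+    length = min(len(s) for s in strings)
+    return "".join(Counter(s[i] for s in strings).most_common(1)[0][0]
+                   for i in range(length))
+
+preds = [("ABC1234", 0.91), ("ABC1Z34", 0.88), ("ABC1234", 0.85)]
+print(fuse_most_confident(preds), fuse_majority_vote(preds))
+</code></pre>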
+
+ comment: Accepted for presentation at the Iberoamerican Congress on Pattern + Recognition (CIARP) 2023 +
+
+
+
+
+ + ☆ AMLP:Adaptive Masking Lesion Patches for Self-supervised Medical Image + Segmentation + + +
+ Self-supervised masked image modeling has shown promising results on natural +images. However, directly applying such methods to medical images remains +challenging. This difficulty stems from the complexity and distinct +characteristics of lesions compared to natural images, which impedes effective +representation learning. Additionally, conventional high fixed masking ratios +restrict reconstructing fine lesion details, limiting the scope of learnable +information. To tackle these limitations, we propose a novel self-supervised +medical image segmentation framework, Adaptive Masking Lesion Patches (AMLP). +Specifically, we design a Masked Patch Selection (MPS) strategy to identify and +focus learning on patches containing lesions. Lesion regions are scarce yet +critical, making their precise reconstruction vital. To reduce +misclassification of lesion and background patches caused by unsupervised +clustering in MPS, we introduce an Attention Reconstruction Loss (ARL) to focus +on hard-to-reconstruct patches likely depicting lesions. We further propose a +Category Consistency Loss (CCL) to refine patch categorization based on +reconstruction difficulty, strengthening distinction between lesions and +background. Moreover, we develop an Adaptive Masking Ratio (AMR) strategy that +gradually increases the masking ratio to expand reconstructible information and +improve learning. Extensive experiments on two medical segmentation datasets +demonstrate AMLP's superior performance compared to existing self-supervised +approaches. The proposed strategies effectively address limitations in applying +masked modeling to medical images, tailored to capturing fine lesion details +vital for segmentation tasks. + +
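+<p>
+ The Adaptive Masking Ratio idea can be illustrated with a minimal schedule that
+grows the ratio over training, so early epochs keep more lesion detail
+reconstructible. The linear shape and the 0.3-0.75 bounds are assumptions for
+illustration, not values reported by the paper.
+</p>
+<pre><code>
+def adaptive_masking_ratio(epoch, total_epochs, start=0.3, end=0.75):
+    # linearly increase the masking ratio from `start` to `end` over training
+    t = min(max(epoch / max(total_epochs - 1, 1), 0.0), 1.0)
+    return start + t * (end - start)
+
+print([round(adaptive_masking_ratio(e, 10), 2) for e in range(10)])
+</code></pre>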
+
+
+
+
+ + ☆ Have We Ever Encountered This Before? Retrieving Out-of-Distribution + Road Obstacles from Driving Scenes + + +
+ In the life cycle of highly automated systems operating in an open and +dynamic environment, the ability to adjust to emerging challenges is crucial. +For systems integrating data-driven AI-based components, rapid responses to +deployment issues require fast access to related data for testing and +reconfiguration. In the context of automated driving, this especially applies +to road obstacles that were not included in the training data, commonly +referred to as out-of-distribution (OoD) road obstacles. Given the availability +of large uncurated recordings of driving scenes, a pragmatic approach is to +query a database to retrieve similar scenarios featuring the same safety +concerns due to OoD road obstacles. In this work, we extend beyond identifying +OoD road obstacles in video streams and offer a comprehensive approach to +extract sequences of OoD road obstacles using text queries, thereby proposing a +way of curating a collection of OoD data for subsequent analysis. Our proposed +method leverages the recent advances in OoD segmentation and multi-modal +foundation models to identify and efficiently extract safety-relevant scenes +from unlabeled videos. We present a first approach for the novel task of +text-based OoD object retrieval, which addresses the question ''Have we ever +encountered this before?''. + +
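+<p>
+ At its core, the text-based retrieval step can be sketched as nearest-neighbor
+search in a CLIP-style joint embedding space. The encoders are placeholders
+here; in the described pipeline, an OoD segmenter first restricts the search to
+frames that actually contain out-of-distribution road obstacles.
+</p>
+<pre><code>
+import numpy as np
+
+def cosine_sim(a, b):
+    a = a / (np.linalg.norm(a, axis=-1, keepdims=True) + 1e-12)
+    b = b / (np.linalg.norm(b, axis=-1, keepdims=True) + 1e-12)
+    return a @ b.T
+
+def retrieve(text_embedding, frame_embeddings, top_k=10):
+    # text_embedding: (D,), frame_embeddings: (N, D); returns indices of the
+    # frames whose embeddings best match the text query
+    sims = cosine_sim(text_embedding[None, :], frame_embeddings)[0]
+    return np.argsort(-sims)[:top_k]
+</code></pre>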
+
+ comment: 11 pages, 7 figures, and 3 tables +
+
+
+
+
+ + ☆ How Can We Tame the Long-Tail of Chest X-ray Datasets? + + +
+ Chest X-rays (CXRs) are a medical imaging modality that is used to infer a
+large number of abnormalities. While it is hard to define an exhaustive list of
+these abnormalities, which may co-occur on a chest X-ray, a few of them are
+quite commonly observed and are abundantly represented in CXR datasets used to
+train deep learning models for automated inference. However, it is challenging
+for current models to learn independent discriminatory features for labels that
+are rare but may be of high significance. Prior works focus on the combination
+of multi-label and long-tail problems by introducing novel loss functions or
+some mechanism of re-sampling or re-weighting the data. Instead, we propose
+that it is possible to achieve significant performance gains merely by choosing
+an initialization for a model that is closer to the domain of the target
+dataset. This method can complement the techniques proposed in the existing
+literature, and can easily be scaled to new labels. Finally, we also examine
+the veracity of synthetically generated data to augment the tail labels and
+analyse its contribution to improving model performance.
+
+</p>
+
+ comment: Extended Abstract presented at Computer Vision for Automated Medical + Diagnosis Workshop at the International Conference on Computer Vision 2023, + October 2nd 2023, Paris, France, & Virtual, https://cvamd2023.github.io, 7 + pages +
+
+
+
+
+ + ☆ Towards Practical Capture of High-Fidelity Relightable Avatars SIGGRAPH + + +
+ In this paper, we propose a novel framework, Tracking-free Relightable Avatar
+(TRAvatar), for capturing and reconstructing high-fidelity 3D avatars. Compared
+to previous methods, TRAvatar works in a more practical and efficient setting.
+Specifically, TRAvatar is trained with dynamic image sequences captured in a
+Light Stage under varying lighting conditions, enabling realistic relighting
+and real-time animation for avatars in diverse scenes. Additionally, TRAvatar
+allows for tracking-free avatar capture and obviates the need for accurate
+surface tracking under varying illumination conditions. Our contributions are
+two-fold: First, we propose a novel network architecture that explicitly builds
+in the linear nature of lighting and ensures that it is satisfied. Trained on
+simple group light captures, TRAvatar can predict the appearance in real time
+with a single forward pass, achieving high-quality relighting effects under
+illuminations of arbitrary environment maps. Second, we jointly optimize the
+facial geometry and relightable appearance from scratch based on image
+sequences, where the tracking is implicitly learned. This tracking-free
+approach brings robustness for establishing temporal correspondences between
+frames under different lighting conditions. Extensive qualitative and
+quantitative experiments demonstrate that our framework achieves superior
+performance for photorealistic avatar animation and relighting.
+
+</p>
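+<p>
+ The linear nature of lighting mentioned above can be stated compactly: once the
+appearance under each group light is known (or predicted), the appearance under
+an arbitrary environment map is a linear combination of those bases. The array
+shapes and the projection of the environment map onto light groups below are
+assumptions for illustration.
+</p>
+<pre><code>
+import numpy as np
+
+def relight(light_basis_images, env_coeffs):
+    # light_basis_images: (L, H, W, 3) appearance under each group light
+    # env_coeffs: (L,) intensity of each light group in the target environment
+    return np.tensordot(env_coeffs, light_basis_images, axes=1)   # (H, W, 3)
+</code></pre>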
+
+ comment: Accepted to SIGGRAPH Asia 2023 (Conference); Project page: + https://travatar-paper.github.io/ +
+
+
+
+
+ + ☆ FIVA: Facial Image and Video Anonymization and Anonymization Defense ICCV + + +
+ In this paper, we present a new approach for facial anonymization in images +and videos, abbreviated as FIVA. Our proposed method is able to maintain the +same face anonymization consistently over frames with our suggested +identity-tracking and guarantees a strong difference from the original face. +FIVA allows for 0 true positives for a false acceptance rate of 0.001. Our work +considers the important security issue of reconstruction attacks and +investigates adversarial noise, uniform noise, and parameter noise to disrupt +reconstruction attacks. In this regard, we apply different defense and +protection methods against these privacy threats to demonstrate the scalability +of FIVA. On top of this, we also show that reconstruction attack models can be +used for detection of deep fakes. Last but not least, we provide experimental +results showing how FIVA can even enable face swapping, which is purely trained +on a single target image. + +
+
+ comment: Accepted to ICCVW 2023 - DFAD 2023 +
+
+
+
+
+ + ☆ Long-Range Correlation Supervision for Land-Cover Classification from + Remote Sensing Images + + +
+ Long-range dependency modeling has been widely considered in modern deep +learning based semantic segmentation methods, especially those designed for +large-size remote sensing images, to compensate the intrinsic locality of +standard convolutions. However, in previous studies, the long-range dependency, +modeled with an attention mechanism or transformer model, has been based on +unsupervised learning, instead of explicit supervision from the objective +ground truth. In this paper, we propose a novel supervised long-range +correlation method for land-cover classification, called the supervised +long-range correlation network (SLCNet), which is shown to be superior to the +currently used unsupervised strategies. In SLCNet, pixels sharing the same +category are considered highly correlated and those having different categories +are less relevant, which can be easily supervised by the category consistency +information available in the ground truth semantic segmentation map. Under such +supervision, the recalibrated features are more consistent for pixels of the +same category and more discriminative for pixels of other categories, +regardless of their proximity. To complement the detailed information lacking +in the global long-range correlation, we introduce an auxiliary adaptive +receptive field feature extraction module, parallel to the long-range +correlation module in the encoder, to capture finely detailed feature +representations for multi-size objects in multi-scale remote sensing images. In +addition, we apply multi-scale side-output supervision and a hybrid loss +function as local and global constraints to further boost the segmentation +accuracy. Experiments were conducted on three remote sensing datasets. Compared +with the advanced segmentation methods from the computer vision, medicine, and +remote sensing communities, the SLCNet achieved a state-of-the-art performance +on all the datasets. + +
+
+ comment: 14 pages, 11 figures +
+
+
+
+
+ + ☆ Score-PA: Score-based 3D Part Assembly BMVC 2023 + + +
+ Autonomous 3D part assembly is a challenging task in the areas of robotics
+and 3D computer vision. This task aims to assemble individual components into a
+complete shape without relying on predefined instructions. In this paper, we
+formulate this task from a novel generative perspective, introducing the
+Score-based 3D Part Assembly framework (Score-PA) for 3D part assembly.
+However, score-based methods are typically time-consuming during the inference
+stage. To address this issue, we introduce a novel algorithm called the Fast
+Predictor-Corrector Sampler (FPC) that accelerates the sampling process within
+the framework. We employ various metrics to assess assembly quality and
+diversity, and our evaluation results demonstrate that our algorithm
+outperforms existing state-of-the-art approaches. We release our code at
+https://github.com/J-F-Cheng/Score-PA_Score-based-3D-Part-Assembly.
+
+</p>
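+<p>
+ For context, a generic predictor-corrector sampler for score-based models (in
+the style of Song et al.) is sketched below; this is the kind of sampling loop
+that the paper's Fast Predictor-Corrector sampler accelerates, not the FPC
+algorithm itself. The score function, noise schedule, and VE-SDE form are
+assumptions.
+</p>
+<pre><code>
+import torch
+
+@torch.no_grad()
+def pc_sample(score_fn, shape, sigmas, snr=0.16, n_corrector=1):
+    # sigmas: decreasing list of noise levels, e.g. [10.0, ..., 0.01]
+    x = torch.randn(shape) * sigmas[0]
+    for sigma, sigma_next in zip(sigmas[:-1], sigmas[1:]):
+        # predictor: one reverse-diffusion (Euler-Maruyama) step of a VE-SDE
+        dt2 = sigma ** 2 - sigma_next ** 2
+        x = x + dt2 * score_fn(x, sigma) + (dt2 ** 0.5) * torch.randn_like(x)
+        # corrector: Langevin MCMC step(s) at the new noise level
+        for _ in range(n_corrector):
+            score = score_fn(x, sigma_next)
+            noise = torch.randn_like(x)
+            step = 2 * (snr * noise.norm() / (score.norm() + 1e-12)) ** 2
+            x = x + step * score + torch.sqrt(2 * step) * noise
+    return x
+</code></pre>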
+
+ comment: BMVC 2023 +
+
+
+
+
+ + ☆ SegmentAnything helps microscopy images based automatic and quantitative + organoid detection and analysis SP + + +
+ Organoids are self-organized 3D cell clusters that closely mimic the
+architecture and function of in vivo tissues and organs. Quantification of
+organoid morphology helps in studying organ development, drug discovery, and
+toxicity assessment. Recent microscopy techniques provide a potent tool to
+acquire organoid morphology features, but manual image analysis remains a
+labor- and time-intensive process. Thus, this paper proposes a comprehensive
+pipeline for microscopy analysis that leverages SegmentAnything to precisely
+demarcate individual organoids. Additionally, we introduce a set of
+morphological properties, including perimeter, area, radius, non-smoothness,
+and non-circularity, allowing researchers to analyze the organoid structures
+quantitatively and automatically. To validate the effectiveness of our
+approach, we conducted tests on bright-field images of human induced
+pluripotent stem cell (iPSC)-derived neural-epithelial (NE) organoids. The
+results obtained from our automatic pipeline closely align with manual organoid
+detection and measurement, showcasing the capability of our proposed method in
+accelerating organoid morphology analysis.
+
+</p>
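+<p>
+ The quantification step can be sketched directly from the listed properties:
+given a binary mask per organoid (e.g., produced by SegmentAnything), standard
+region properties yield the descriptors. The exact definitions of
+"non-smoothness" and "non-circularity" used by the paper are not spelled out in
+the abstract; the formulas below are common, illustrative choices.
+</p>
+<pre><code>
+import numpy as np
+from skimage import measure
+
+def organoid_properties(mask):
+    # mask: binary array with a single organoid marked as 1
+    props = measure.regionprops(mask.astype(np.uint8))[0]
+    area, perimeter = props.area, props.perimeter
+    radius = (area / np.pi) ** 0.5                        # radius of the equal-area circle
+    non_circularity = 1.0 - 4.0 * np.pi * area / (perimeter ** 2 + 1e-12)
+    non_smoothness = 1.0 - props.solidity                 # 1 - area / convex-hull area
+    return dict(area=area, perimeter=perimeter, radius=radius,
+                non_circularity=non_circularity, non_smoothness=non_smoothness)
+</code></pre>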
+
+ comment: submitted to SPIE: Medical Imaging 2024 +
+
+
+
+
+ + ☆ Stereo Matching in Time: 100+ FPS Video Stereo Matching for Extended + Reality + + +
+ Real-time stereo matching is a cornerstone algorithm for many Extended
+Reality (XR) applications, such as indoor 3D understanding, video pass-through,
+and mixed-reality games. Despite significant advancements in deep stereo
+methods, achieving real-time depth inference with high accuracy on a low-power
+device remains a major challenge. One of the major difficulties is the lack of
+high-quality indoor video stereo training datasets captured by head-mounted
+VR/AR glasses. To address this issue, we introduce a novel video stereo
+synthetic dataset that comprises photorealistic renderings of various indoor
+scenes and realistic camera motion captured by a 6-DoF moving VR/AR
+head-mounted display (HMD). This facilitates the evaluation of existing
+approaches and promotes further research on indoor augmented reality scenarios.
+Our newly proposed dataset enables us to develop a novel framework for
+continuous video-rate stereo matching.
+ As another contribution, our dataset enables us to propose a new video-based
+stereo matching approach tailored for XR applications, which achieves real-time
+inference at an impressive 134 fps on a standard desktop computer, or 30 fps on
+a battery-powered HMD. Our key insight is that disparity and contextual
+information are highly correlated and redundant between consecutive stereo
+frames. By unrolling an iterative cost aggregation in time (i.e., in the
+temporal dimension), we are able to distribute and reuse the aggregated
+features over time. This approach leads to a substantial reduction in
+computation without sacrificing accuracy. We conducted extensive evaluations
+and comparisons and demonstrated that our method achieves superior performance
+compared to the current state-of-the-art, making it a strong contender for
+real-time stereo matching in VR/AR applications.
+
+</p>
+
+
+
+
+ + ☆ Unsupervised Object Localization with Representer Point Selection ICCV 2023 + + +
+ We propose a novel unsupervised object localization method that allows us to +explain the predictions of the model by utilizing self-supervised pre-trained +models without additional finetuning. Existing unsupervised and self-supervised +object localization methods often utilize class-agnostic activation maps or +self-similarity maps of a pre-trained model. Although these maps can offer +valuable information for localization, their limited ability to explain how the +model makes predictions remains challenging. In this paper, we propose a simple +yet effective unsupervised object localization method based on representer +point selection, where the predictions of the model can be represented as a +linear combination of representer values of training points. By selecting +representer points, which are the most important examples for the model +predictions, our model can provide insights into how the model predicts the +foreground object by providing relevant examples as well as their importance. +Our method outperforms the state-of-the-art unsupervised and self-supervised +object localization methods on various datasets with significant margins and +even outperforms recent weakly supervised and few-shot methods. + +
+
+ comment: Accepted by ICCV 2023 +
+
+
+
+
+ + ☆ PRISTA-Net: Deep Iterative Shrinkage Thresholding Network for Coded + Diffraction Patterns Phase Retrieval + + +
+ The problem of phase retrieval (PR) involves recovering an unknown image from
+limited amplitude measurement data and is a challenging nonlinear inverse
+problem in computational imaging and image processing. However, many PR methods
+are based either on black-box network models that lack interpretability or on
+plug-and-play (PnP) frameworks that are computationally complex and require
+careful parameter tuning. To address this, we have developed PRISTA-Net, a deep
+unfolding network (DUN) based on the first-order iterative shrinkage
+thresholding algorithm (ISTA). This network utilizes a learnable nonlinear
+transformation to address the proximal-point mapping sub-problem associated
+with the sparse priors, and an attention mechanism to focus on phase
+information containing image edges, textures, and structures. Additionally, the
+fast Fourier transform (FFT) is used to learn global features to enhance local
+information, and the designed logarithmic-based loss function leads to
+significant improvements when the noise level is low. All parameters in the
+proposed PRISTA-Net framework, including the nonlinear transformation,
+threshold parameters, and step size, are learned end-to-end instead of being
+manually set. This method combines the interpretability of traditional methods
+with the fast inference ability of deep learning and is able to handle noise at
+each iteration during the unfolding stage, thus improving recovery quality.
+Experiments on coded diffraction pattern (CDP) measurements demonstrate that
+our approach outperforms the existing state-of-the-art methods in terms of
+qualitative and quantitative evaluations. Our source codes are available at
+https://github.com/liuaxou/PRISTA-Net.
+
+</p>
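+<p>
+ As background for the unfolding, the classical ISTA update that such a deep
+unfolding network unrolls into stages is sketched below for a generic linear
+operator A; in PRISTA-Net the step size, threshold, and sparsifying transform
+are learned, and the coded-diffraction forward model is nonlinear, so this is
+only the structural idea.
+</p>
+<pre><code>
+import numpy as np
+
+def soft_threshold(x, theta):
+    return np.sign(x) * np.maximum(np.abs(x) - theta, 0.0)
+
+def ista(A, y, step, theta, n_iters=100):
+    x = np.zeros(A.shape[1])
+    for _ in range(n_iters):
+        grad = A.T @ (A @ x - y)                      # gradient of the data-fidelity term
+        x = soft_threshold(x - step * grad, theta)    # proximal (sparsity) step
+    return x
+</code></pre>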
+
+ comment: 12 pages +
+
+
+
+
+ + ☆ Grouping Boundary Proposals for Fast Interactive Image Segmentation + + +
+ Geodesic models are known as an efficient tool for solving various image +segmentation problems. Most of existing approaches only exploit local pointwise +image features to track geodesic paths for delineating the objective +boundaries. However, such a segmentation strategy cannot take into account the +connectivity of the image edge features, increasing the risk of shortcut +problem, especially in the case of complicated scenario. In this work, we +introduce a new image segmentation model based on the minimal geodesic +framework in conjunction with an adaptive cut-based circular optimal path +computation scheme and a graph-based boundary proposals grouping scheme. +Specifically, the adaptive cut can disconnect the image domain such that the +target contours are imposed to pass through this cut only once. The boundary +proposals are comprised of precomputed image edge segments, providing the +connectivity information for our segmentation model. These boundary proposals +are then incorporated into the proposed image segmentation model, such that the +target segmentation contours are made up of a set of selected boundary +proposals and the corresponding geodesic paths linking them. Experimental +results show that the proposed model indeed outperforms state-of-the-art +minimal paths-based image segmentation approaches. + +
+
+
+
+
+ + ☆ Context-Aware Prompt Tuning for Vision-Language Model with + Dual-Alignment + + +
+ Large-scale vision-language models (VLMs), e.g., CLIP, learn broad visual
+concepts from massive training data, showing superb generalization ability. A
+number of prompt learning methods have been proposed to efficiently adapt the
+VLMs to downstream tasks with only a few training samples. We introduce a novel
+method to improve the prompt learning of vision-language models by
+incorporating pre-trained large language models (LLMs), called Dual-Aligned
+Prompt Tuning (DuAl-PT). Learnable prompts, like those in CoOp, implicitly
+model the context through end-to-end training and are difficult to control and
+interpret. While explicit context descriptions generated by LLMs, like GPT-3,
+can be directly used for zero-shot classification, such prompts rely heavily on
+LLMs and remain underexplored in few-shot domains. With DuAl-PT, we propose to
+learn more context-aware prompts, benefiting from both explicit and implicit
+context modeling. To achieve this, we introduce a pre-trained LLM to generate
+context descriptions, and we encourage the prompts to learn from the LLM's
+knowledge by alignment, as well as the alignment between prompts and local
+image features. Empirically, DuAl-PT achieves superior performance on 11
+downstream datasets on few-shot recognition and base-to-new generalization.
+Hopefully, DuAl-PT can serve as a strong baseline. Code will be available.
+
+</p>
+
+
+
+
+ + ☆ Mapping EEG Signals to Visual Stimuli: A Deep Learning Approach to Match + vs. Mismatch Classification + + +
+ Existing approaches to modeling associations between visual stimuli and brain +responses are facing difficulties in handling between-subject variance and +model generalization. Inspired by the recent progress in modeling speech-brain +response, we propose in this work a ``match-vs-mismatch'' deep learning model +to classify whether a video clip induces excitatory responses in recorded EEG +signals and learn associations between the visual content and corresponding +neural recordings. Using an exclusive experimental dataset, we demonstrate that +the proposed model is able to achieve the highest accuracy on unseen subjects +as compared to other baseline models. Furthermore, we analyze the inter-subject +noise using a subject-level silhouette score in the embedding space and show +that the developed model is able to mitigate inter-subject noise and +significantly reduce the silhouette score. Moreover, we examine the Grad-CAM +activation score and show that the brain regions associated with language +processing contribute most to the model predictions, followed by regions +associated with visual processing. These results have the potential to +facilitate the development of neural recording-based video reconstruction and +its related applications. + +
+
+
+
+
+ + ☆ Representation Synthesis by Probabilistic Many-Valued Logic Operation in + Self-Supervised Learning + + +
+ Self-supervised learning (SSL) using mixed images has been studied to learn +various image representations. Existing methods using mixed images learn a +representation by maximizing the similarity between the representation of the +mixed image and the synthesized representation of the original images. However, +few methods consider the synthesis of representations from the perspective of +mathematical logic. In this study, we focused on a synthesis method of +representations. We proposed a new SSL with mixed images and a new +representation format based on many-valued logic. This format can indicate the +feature-possession degree, that is, how much of each image feature is possessed +by a representation. This representation format and representation synthesis by +logic operation realize that the synthesized representation preserves the +remarkable characteristics of the original representations. Our method +performed competitively with previous representation synthesis methods for +image classification tasks. We also examined the relationship between the +feature-possession degree and the number of classes of images in the multilabel +image classification dataset to verify that the intended learning was achieved. +In addition, we discussed image retrieval, which is an application of our +proposed representation format using many-valued logic. + +
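+<p>
+ One way to picture the representation synthesis by many-valued logic is with
+element-wise probabilistic logic operations on "feature-possession degree"
+vectors in [0, 1]; the product / probabilistic-sum pair below is a standard
+choice used purely for illustration, not necessarily the operation adopted in
+the paper.
+</p>
+<pre><code>
+import numpy as np
+
+def logic_and(a, b):      # degree to which both images possess a feature
+    return a * b
+
+def logic_or(a, b):       # degree to which at least one image possesses it
+    return a + b - a * b
+
+a = np.array([0.9, 0.2, 0.6])
+b = np.array([0.8, 0.7, 0.1])
+print(logic_and(a, b), logic_or(a, b))
+</code></pre>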
+
+ comment: This work has been submitted to the IEEE for possible publication. + Copyright may be transferred without notice, after which this version may no + longer be accessible +
+
+
+
+
+ + ☆ Robot Localization and Mapping Final Report -- Sequential Adversarial + Learning for Self-Supervised Deep Visual Odometry + + +
+ Visual odometry (VO) and SLAM have been using multi-view geometry via local
+structure from motion for decades. These methods have a slight disadvantage in
+challenging scenarios such as low-texture images, dynamic scenarios, etc.
+Meanwhile, the use of deep neural networks to extract high-level features is
+ubiquitous in computer vision. For VO, we can use these deep networks to
+extract depth and pose estimates from these high-level features. The visual
+odometry task can then be modeled as an image generation task where the pose
+estimation is the by-product. This can also be achieved in a self-supervised
+manner, thereby eliminating the data-intensive (supervised) nature of training
+deep neural networks. Although some works have tried a similar approach [1],
+the depth and pose estimates in previous works are sometimes vague, resulting
+in accumulation of error (drift) along the trajectory. The goal of this work is
+to tackle these limitations of past approaches and to develop a method that can
+provide better depth and pose estimates. To address this, a couple of
+approaches are explored: 1) Modeling: using optical flow and recurrent neural
+networks (RNNs) in order to exploit spatio-temporal correlations, which can
+provide more information to estimate depth. 2) Loss function: a generative
+adversarial network (GAN) [2] is deployed to improve the depth estimation (and
+thereby the pose too), as shown in Figure 1. This additional loss term improves
+the realism in generated images and reduces artifacts.
+
+</p>
+
+
+
+
+ + ☆ Depth Completion with Multiple Balanced Bases and Confidence for Dense + Monocular SLAM + + +
+ Dense SLAM based on monocular cameras does indeed have immense application +value in the field of AR/VR, especially when it is performed on a mobile +device. In this paper, we propose a novel method that integrates a light-weight +depth completion network into a sparse SLAM system using a multi-basis depth +representation, so that dense mapping can be performed online even on a mobile +phone. Specifically, we present a specifically optimized multi-basis depth +completion network, called BBC-Net, tailored to the characteristics of +traditional sparse SLAM systems. BBC-Net can predict multiple balanced bases +and a confidence map from a monocular image with sparse points generated by +off-the-shelf keypoint-based SLAM systems. The final depth is a linear +combination of predicted depth bases that can be optimized by tuning the +corresponding weights. To seamlessly incorporate the weights into traditional +SLAM optimization and ensure efficiency and robustness, we design a set of +depth weight factors, which makes our network a versatile plug-in module, +facilitating easy integration into various existing sparse SLAM systems and +significantly enhancing global depth consistency through bundle adjustment. To +verify the portability of our method, we integrate BBC-Net into two +representative SLAM systems. The experimental results on various datasets show +that the proposed method achieves better performance in monocular dense mapping +than the state-of-the-art methods. We provide an online demo running on a +mobile phone, which verifies the efficiency and mapping quality of the proposed +method in real-world scenarios. + +
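+<p>
+ The multi-basis idea above reduces, per image, to a small linear problem: the
+dense depth is a weighted combination of predicted bases, and the weights can be
+fitted against sparse SLAM depths. The confidence-weighted least-squares solver
+and the array shapes below are illustrative assumptions, not the paper's bundle
+adjustment formulation.
+</p>
+<pre><code>
+import numpy as np
+
+def fuse_depth(bases, weights):
+    # bases: (K, H, W), weights: (K,)  ->  dense depth map (H, W)
+    return np.tensordot(weights, bases, axes=1)
+
+def fit_weights(bases, sparse_uv, sparse_depth, confidence):
+    # bases: (K, H, W); sparse_uv: (N, 2) integer pixel coordinates;
+    # sparse_depth, confidence: (N,) from the sparse SLAM points
+    K = bases.shape[0]
+    B = np.stack([bases[k][sparse_uv[:, 1], sparse_uv[:, 0]] for k in range(K)], axis=1)
+    w = np.sqrt(confidence)[:, None]
+    sol, *_ = np.linalg.lstsq(B * w, sparse_depth * w[:, 0], rcond=None)
+    return sol
+</code></pre>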
+
+
+
+
+ + ☆ From Text to Mask: Localizing Entities Using the Attention of + Text-to-Image Diffusion Models + + +
+ Diffusion models have recently revolutionized the field of text-to-image
+generation. The unique way of fusing text and image information contributes to
+their remarkable capability of generating highly text-related images. From
+another perspective, these generative models imply clues about the precise
+correlation between words and pixels. In this work, a simple but effective
+method is proposed to utilize the attention mechanism in the denoising network
+of text-to-image diffusion models. Without re-training or inference-time
+optimization, the semantic grounding of phrases can be attained directly. We
+evaluate our method on Pascal VOC 2012 and Microsoft COCO 2014 under the
+weakly-supervised semantic segmentation setting, and our method achieves
+performance superior to prior methods. In addition, the acquired word-pixel
+correlation is found to be generalizable for the learned text embedding of
+customized generation methods, requiring only a few modifications. To validate
+our discovery, we introduce a new practical task called "personalized referring
+image segmentation" with a new dataset. Experiments in various situations
+demonstrate the advantages of our method compared to strong baselines on this
+task. In summary, our work reveals a novel way to extract the rich multi-modal
+knowledge hidden in diffusion models for segmentation.
+
+</p>
+
+
+
+
+ + ☆ Weakly Supervised Point Clouds Transformer for 3D Object Detection SC + + +
+ The annotation of 3D datasets is required for semantic segmentation and
+object detection in scene understanding. In this paper, we present a framework
+for the weak supervision of a point cloud transformer that is used for 3D
+object detection. The aim is to decrease the required amount of supervision
+needed for training, as a result of the high cost of annotating 3D datasets. We
+propose an Unsupervised Voting Proposal Module, which learns randomly preset
+anchor points and uses a voting network to select prepared anchor points of
+high quality. It then distills information into the student and teacher
+networks. For the student network, we apply a ResNet to efficiently extract
+local characteristics. However, it can also lose much global information. To
+provide the student network with an input that incorporates both global and
+local information, we adopt the self-attention mechanism of the transformer to
+extract global features, and the ResNet layers to extract region proposals. The
+teacher network supervises the classification and regression of the student
+network using the model pre-trained on ImageNet. On the challenging KITTI
+datasets, the experimental results achieve the highest average precision
+compared with the most recent weakly supervised 3D object detectors.
+
+</p>
+
+ comment: International Conference on Intelligent Transportation Systems + (ITSC), 2022 +
+
+
+
+
+ + ☆ Toward Sufficient Spatial-Frequency Interaction for Gradient-aware + Underwater Image Enhancement + + +
+ Underwater images suffer from complex and diverse degradation, which
+inevitably affects the performance of underwater visual tasks. However, most
+existing learning-based underwater image enhancement (UIE) methods mainly
+restore such degradations in the spatial domain, and rarely pay attention to
+the Fourier frequency information. In this paper, we develop a novel UIE
+framework based on spatial-frequency interaction and gradient maps, namely
+SFGNet, which consists of two stages. Specifically, in the first stage, we
+propose a dense spatial-frequency fusion network (DSFFNet), mainly including
+our designed dense Fourier fusion block and dense spatial fusion block,
+achieving sufficient spatial-frequency interaction by cross connections between
+these two blocks. In the second stage, we propose a gradient-aware corrector
+(GAC) to further enhance perceptual details and geometric structures of images
+via the gradient map. Experimental results on two real-world underwater image
+datasets show that our approach can successfully enhance underwater images, and
+achieves competitive performance in visual quality improvement.
+
+</p>
+
+
+
+
+ + ☆ Towards Efficient SDRTV-to-HDRTV by Learning from Image Formation + + +
+ Modern displays are capable of rendering video content with high dynamic
+range (HDR) and wide color gamut (WCG). However, the majority of available
+resources are still in standard dynamic range (SDR). As a result, there is
+significant value in transforming existing SDR content into the HDRTV standard.
+In this paper, we define and analyze the SDRTV-to-HDRTV task by modeling the
+formation of SDRTV/HDRTV content. Our analysis and observations indicate that a
+naive end-to-end supervised training pipeline suffers from severe gamut
+transition errors. To address this issue, we propose a novel three-step
+solution pipeline called HDRTVNet++, which includes adaptive global color
+mapping, local enhancement, and highlight refinement. The adaptive global color
+mapping step uses global statistics as guidance to perform image-adaptive color
+mapping. A local enhancement network is then deployed to enhance local details.
+Finally, we combine the two sub-networks above as a generator and achieve
+highlight consistency through GAN-based joint training. Our method is primarily
+designed for ultra-high-definition TV content and is therefore effective and
+lightweight for processing 4K resolution images. We also construct a dataset
+using HDR videos in the HDR10 standard, named HDRTV1K, which contains 1235
+training images and 117 testing images, all in 4K resolution. In addition, we
+select five metrics to evaluate the results of SDRTV-to-HDRTV algorithms. Our
+final results demonstrate state-of-the-art performance both quantitatively and
+visually. The code, model and dataset are available at
+https://github.com/xiaom233/HDRTVNet-plus.
+
+</p>
+
+ comment: Extended version of HDRTVNet +
+
+
+
+
+ + ☆ UER: A Heuristic Bias Addressing Approach for Online Continual Learning ACM MM2023 + + +
+ Online continual learning aims to continuously train neural networks from a
+continuous data stream with a single pass through the data. As the most
+effective approach, rehearsal-based methods replay part of the previous data.
+Commonly used predictors in existing methods tend to generate biased
+dot-product logits that prefer the classes of the current data, which is known
+as a bias issue and a phenomenon of forgetting. Many approaches have been
+proposed to overcome the forgetting problem by correcting the bias; however,
+they still need to be improved in the online fashion. In this paper, we address
+the bias issue with a more straightforward and more efficient method. By
+decomposing the dot-product logits into an angle factor and a norm factor, we
+empirically find that the bias problem mainly occurs in the angle factor, which
+can be used to learn novel knowledge as cosine logits. In contrast, the norm
+factor, abandoned by existing methods, helps retain historical knowledge. Based
+on this observation, we intuitively propose to leverage the norm factor to
+balance the new and old knowledge for addressing the bias. To this end, we
+develop a heuristic approach called unbias experience replay (UER). UER learns
+current samples only by the angle factor and further replays previous samples
+by both the norm and angle factors. Extensive experiments on three datasets
+show that UER achieves superior performance over various state-of-the-art
+methods. The code is available at https://github.com/FelixHuiweiLin/UER.
+
+</p>
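+<p>
+ The decomposition at the heart of UER is easy to verify numerically: a
+dot-product logit is the product of a norm factor and a cosine (angle) factor.
+The snippet below only demonstrates this decomposition; how the two factors are
+used for learning and replay follows the description above.
+</p>
+<pre><code>
+import torch
+import torch.nn.functional as F
+
+def decompose_logits(features, weights):
+    # features: (B, D) sample embeddings, weights: (C, D) classifier weights
+    dot = features @ weights.t()                                      # dot-product logits
+    norm = features.norm(dim=1, keepdim=True) * weights.norm(dim=1)   # norm factor
+    cosine = F.normalize(features, dim=1) @ F.normalize(weights, dim=1).t()  # angle factor
+    assert torch.allclose(dot, norm * cosine, atol=1e-4)
+    return cosine, norm
+
+cos, nrm = decompose_logits(torch.randn(4, 128), torch.randn(10, 128))
+</code></pre>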
+
+ comment: 9 pages, 12 figures, ACM MM2023 +
+
+
+
+
+ + ☆ Enhancing Hierarchical Transformers for Whole Brain Segmentation with + Intracranial Measurements Integration + + +
+ Whole brain segmentation with magnetic resonance imaging (MRI) enables the
+non-invasive measurement of brain regions, including total intracranial volume
+(TICV) and posterior fossa volume (PFV). Enhancing the existing whole brain
+segmentation methodology to incorporate intracranial measurements offers a
+heightened level of comprehensiveness in the analysis of brain structures.
+Despite its potential, the task of generalizing deep learning techniques for
+intracranial measurements faces data availability constraints due to limited
+manually annotated atlases encompassing whole brain and TICV/PFV labels. In
+this paper, we enhance the hierarchical transformer UNesT for whole brain
+segmentation so that it simultaneously segments the whole brain into 133
+classes and estimates TICV/PFV. To address the problem of data scarcity, the
+model is first pretrained on 4859 T1-weighted (T1w) 3D volumes sourced from 8
+different sites. These volumes are processed through a multi-atlas segmentation
+pipeline for label generation, while TICV/PFV labels are unavailable.
+Subsequently, the model is finetuned with 45 T1w 3D volumes from the Open
+Access Series of Imaging Studies (OASIS) where both 133 whole brain classes and
+TICV/PFV labels are available. We evaluate our method with Dice similarity
+coefficients (DSC). We show that our model is able to conduct precise TICV/PFV
+estimation while maintaining performance on the 132 brain regions at a
+comparable level. Code and trained model are available at:
+https://github.com/MASILab/UNesT/wholebrainSeg.
+
+</p>
+
+
+
+
+ + ☆ INSURE: An Information Theory Inspired Disentanglement and Purification + Model for Domain Generalization + + +
+ Domain Generalization (DG) aims to learn a generalizable model on the unseen +target domain by only training on the multiple observed source domains. +Although a variety of DG methods have focused on extracting domain-invariant +features, the domain-specific class-relevant features have attracted attention +and been argued to benefit generalization to the unseen target domain. To take +into account the class-relevant domain-specific information, in this paper we +propose an Information theory iNspired diSentanglement and pURification modEl +(INSURE) to explicitly disentangle the latent features to obtain sufficient and +compact (necessary) class-relevant feature for generalization to the unseen +domain. Specifically, we first propose an information theory inspired loss +function to ensure the disentangled class-relevant features contain sufficient +class label information and the other disentangled auxiliary feature has +sufficient domain information. We further propose a paired purification loss +function to let the auxiliary feature discard all the class-relevant +information and thus the class-relevant feature will contain sufficient and +compact (necessary) class-relevant information. Moreover, instead of using +multiple encoders, we propose to use a learnable binary mask as our +disentangler to make the disentanglement more efficient and make the +disentangled features complementary to each other. We conduct extensive +experiments on four widely used DG benchmark datasets including PACS, +OfficeHome, TerraIncognita, and DomainNet. The proposed INSURE outperforms the +state-of-art methods. We also empirically show that domain-specific +class-relevant features are beneficial for domain generalization. + +
+
+ comment: 10 pages, 4 figures +
+
+
+
+
+ + ♻ ☆ A Conditional Generative Chatbot using Transformer Model + + +
+ A Chatbot serves as a communication tool between a human user and a machine +to achieve an appropriate answer based on the human input. In more recent +approaches, a combination of Natural Language Processing and sequential models +are used to build a generative Chatbot. The main challenge of these models is +their sequential nature, which leads to less accurate results. To tackle this +challenge, in this paper, a novel architecture is proposed using conditional +Wasserstein Generative Adversarial Networks and a transformer model for answer +generation in Chatbots. While the generator of the proposed model consists of a +full transformer model to generate an answer, the discriminator includes only +the encoder part of a transformer model followed by a classifier. To the best +of our knowledge, this is the first time that a generative Chatbot is proposed +using the embedded transformer in both generator and discriminator models. +Relying on the parallel computing of the transformer model, the results of the +proposed model on the Cornell Movie-Dialog corpus and the Chit-Chat datasets +confirm the superiority of the proposed model compared to state-of-the-art +alternatives using different evaluation metrics. + +
+
+
+
+
+ + ♻ ☆ Adaptive Reordering Sampler with Neurally Guided MAGSAC + + +
+ We propose a new sampler for robust estimators that always selects the sample +with the highest probability of consisting only of inliers. After every +unsuccessful iteration, the inlier probabilities are updated in a principled +way via a Bayesian approach. The probabilities obtained by the deep network are +used as prior (so-called neural guidance) inside the sampler. Moreover, we +introduce a new loss that exploits, in a geometrically justifiable manner, the +orientation and scale that can be estimated for any type of feature, e.g., SIFT +or SuperPoint, to estimate two-view geometry. The new loss helps to learn +higher-order information about the underlying scene geometry. Benefiting from +the new sampler and the proposed loss, we combine the neural guidance with the +state-of-the-art MAGSAC++. Adaptive Reordering Sampler with Neurally Guided +MAGSAC (ARS-MAGSAC) is superior to the state-of-the-art in terms of accuracy +and run-time on the PhotoTourism and KITTI datasets for essential and +fundamental matrix estimation. The code and trained models are available at +https://github.com/weitong8591/ars_magsac. + +
+
+
+
+
+ + ♻ ☆ Large Content And Behavior Models To Understand, Simulate, And Optimize + Content And Behavior + + +
+ Shannon, in his seminal paper introducing information theory, divided the +communication into three levels: technical, semantic, and effectivenss. While +the technical level is concerned with accurate reconstruction of transmitted +symbols, the semantic and effectiveness levels deal with the inferred meaning +and its effect on the receiver. Thanks to telecommunications, the first level +problem has produced great advances like the internet. Large Language Models +(LLMs) make some progress towards the second goal, but the third level still +remains largely untouched. The third problem deals with predicting and +optimizing communication for desired receiver behavior. LLMs, while showing +wide generalization capabilities across a wide range of tasks, are unable to +solve for this. One reason for the underperformance could be a lack of +"behavior tokens" in LLMs' training corpora. Behavior tokens define receiver +behavior over a communication, such as shares, likes, clicks, purchases, +retweets, etc. While preprocessing data for LLM training, behavior tokens are +often removed from the corpora as noise. Therefore, in this paper, we make some +initial progress towards reintroducing behavior tokens in LLM training. The +trained models, other than showing similar performance to LLMs on content +understanding tasks, show generalization capabilities on behavior simulation, +content simulation, behavior understanding, and behavior domain adaptation. +Using a wide range of tasks on two corpora, we show results on all these +capabilities. We call these models Large Content and Behavior Models (LCBMs). +Further, to spur more research on LCBMs, we release our new Content Behavior +Corpus (CBC), a repository containing communicator, message, and corresponding +receiver behavior. + +
+
+
+
+
+ + ♻ ☆ Evaluating Deep Learning-based Melanoma Classification using + Immunohistochemistry and Routine Histology: A Three Center Study + + +
+ Pathologists routinely use immunohistochemical (IHC)-stained tissue slides +against MelanA in addition to hematoxylin and eosin (H&E)-stained slides to +improve their accuracy in diagnosing melanomas. The use of diagnostic Deep +Learning (DL)-based support systems for automated examination of tissue +morphology and cellular composition has been well studied in standard +H&E-stained tissue slides. In contrast, there are few studies that analyze IHC +slides using DL. Therefore, we investigated the separate and joint performance +of ResNets trained on MelanA and corresponding H&E-stained slides. The MelanA +classifier achieved an area under receiver operating characteristics curve +(AUROC) of 0.82 and 0.74 on out of distribution (OOD)-datasets, similar to the +H&E-based benchmark classification of 0.81 and 0.75, respectively. A combined +classifier using MelanA and H&E achieved AUROCs of 0.85 and 0.81 on the OOD +datasets. DL MelanA-based assistance systems show the same performance as the +benchmark H&E classification and may be improved by multi stain classification +to assist pathologists in their clinical routine. + +
+
+
+
+
+ + ♻ ☆ Generalized Differentiable RANSAC + + +
+ We propose $\nabla$-RANSAC, a generalized differentiable RANSAC that allows +learning the entire randomized robust estimation pipeline. The proposed +approach enables the use of relaxation techniques for estimating the gradients +in the sampling distribution, which are then propagated through a +differentiable solver. The trainable quality function marginalizes over the +scores from all the models estimated within $\nabla$-RANSAC to guide the +network learning accurate and useful inlier probabilities or to train feature +detection and matching networks. Our method directly maximizes the probability +of drawing a good hypothesis, allowing us to learn better sampling +distributions. We test $\nabla$-RANSAC on various real-world scenarios on +fundamental and essential matrix estimation, and 3D point cloud registration, +outdoors and indoors, with handcrafted and learning-based features. It is +superior to the state-of-the-art in terms of accuracy while running at a +similar speed to its less accurate alternatives. The code and trained models +are available at https://github.com/weitong8591/differentiable_ransac. + +
+
+
+
+
+ + ♻ ☆ MI-SegNet: Mutual Information-Based US Segmentation for Unseen Domain + Generalization MICCAI 2023 + + +
+ Generalization capabilities of learning-based medical image segmentation +across domains are currently limited by the performance degradation caused by +the domain shift, particularly for ultrasound (US) imaging. The quality of US +images heavily relies on carefully tuned acoustic parameters, which vary across +sonographers, machines, and settings. To improve the generalizability on US +images across domains, we propose MI-SegNet, a novel mutual information (MI) +based framework to explicitly disentangle the anatomical and domain feature +representations; therefore, robust domain-independent segmentation can be +expected. Two encoders are employed to extract the relevant features for the +disentanglement. The segmentation only uses the anatomical feature map for its +prediction. In order to force the encoders to learn meaningful feature +representations a cross-reconstruction method is used during training. +Transformations, specific to either domain or anatomy are applied to guide the +encoders in their respective feature extraction task. Additionally, any MI +present in both feature maps is punished to further promote separate feature +spaces. We validate the generalizability of the proposed domain-independent +segmentation approach on several datasets with varying parameters and machines. +Furthermore, we demonstrate the effectiveness of the proposed MI-SegNet serving +as a pre-trained model by comparing it with state-of-the-art networks. + +
+
+ comment: Accepted by MICCAI 2023 +
+
+
+
+
+ + ♻ ☆ SoDaCam: Software-defined Cameras via Single-Photon Imaging ICCV 2023 + + +
+ Reinterpretable cameras are defined by their post-processing capabilities +that exceed traditional imaging. We present "SoDaCam" that provides +reinterpretable cameras at the granularity of photons, from photon-cubes +acquired by single-photon devices. Photon-cubes represent the spatio-temporal +detections of photons as a sequence of binary frames, at frame-rates as high as +100 kHz. We show that simple transformations of the photon-cube, or photon-cube +projections, provide the functionality of numerous imaging systems including: +exposure bracketing, flutter shutter cameras, video compressive systems, event +cameras, and even cameras that move during exposure. Our photon-cube +projections offer the flexibility of being software-defined constructs that are +only limited by what is computable, and shot-noise. We exploit this flexibility +to provide new capabilities for the emulated cameras. As an added benefit, our +projections provide camera-dependent compression of photon-cubes, which we +demonstrate using an implementation of our projections on a novel compute +architecture that is designed for single-photon imaging. + +
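+<p>
+ Two of the photon-cube projections mentioned above are simple temporal sums and
+can be sketched in a few lines: summing binary frames over an exposure window
+emulates a conventional exposure, and a coded temporal weighting emulates a
+flutter-shutter camera. The toy photon-cube and array shapes are assumptions for
+illustration.
+</p>
+<pre><code>
+import numpy as np
+
+def exposure_bracket(photon_cube, windows):
+    # photon_cube: (T, H, W) binary frames; windows: list of (start, end) indices
+    return [photon_cube[s:e].sum(axis=0) for s, e in windows]
+
+def coded_exposure(photon_cube, code):
+    # code: (T,) binary on/off shutter sequence
+    return np.tensordot(code, photon_cube, axes=1)
+
+cube = np.random.binomial(1, 0.02, size=(2000, 64, 64)).astype(np.uint8)  # toy photon-cube
+short, long_ = exposure_bracket(cube, [(0, 200), (0, 2000)])
+flutter = coded_exposure(cube, np.random.randint(0, 2, size=2000))
+</code></pre>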
+
+ comment: Accepted at ICCV 2023 (oral). Project webpage can be found at + https://wisionlab.com/project/sodacam/ +
+
+
+
+
+ + ♻ ☆ LadleNet: Translating Thermal Infrared Images to Visible Light Images + Using A Scalable Two-stage U-Net + + +
+ The translation of thermal infrared (TIR) images to visible light (VI) images +presents a challenging task with potential applications spanning various +domains such as TIR-VI image registration and fusion. Leveraging supplementary +information derived from TIR image conversions can significantly enhance model +performance and generalization across these applications. However, prevailing +issues within this field include suboptimal image fidelity and limited model +scalability. In this paper, we introduce an algorithm, LadleNet, based on the +U-Net architecture. LadleNet employs a two-stage U-Net concatenation structure, +augmented with skip connections and refined feature aggregation techniques, +resulting in a substantial enhancement in model performance. Comprising +'Handle' and 'Bowl' modules, LadleNet's Handle module facilitates the +construction of an abstract semantic space, while the Bowl module decodes this +semantic space to yield mapped VI images. The Handle module exhibits +extensibility by allowing the substitution of its network architecture with +semantic segmentation networks, thereby establishing more abstract semantic +spaces to bolster model performance. Consequently, we propose LadleNet+, which +replaces LadleNet's Handle module with the pre-trained DeepLabv3+ network, +thereby endowing the model with enhanced semantic space construction +capabilities. The proposed method is evaluated and tested on the KAIST dataset, +accompanied by quantitative and qualitative analyses. Compared to existing +methodologies, our approach achieves state-of-the-art performance in terms of +image clarity and perceptual quality. The source code will be made available at +https://github.com/Ach-1914/LadleNet/tree/main/. + +
+
+
+
+
+ + ♻ ☆ Locomotion-Action-Manipulation: Synthesizing Human-Scene Interactions in + Complex 3D Environments ICCV 2023 + + +
+ Synthesizing interaction-involved human motions has been challenging due to +the high complexity of 3D environments and the diversity of possible human +behaviors within. We present LAMA, Locomotion-Action-MAnipulation, to +synthesize natural and plausible long-term human movements in complex indoor +environments. The key motivation of LAMA is to build a unified framework to +encompass a series of everyday motions including locomotion, scene interaction, +and object manipulation. Unlike existing methods that require motion data +"paired" with scanned 3D scenes for supervision, we formulate the problem as a +test-time optimization by using human motion capture data only for synthesis. +LAMA leverages a reinforcement learning framework coupled with a motion +matching algorithm for optimization, and further exploits a motion editing +framework via manifold learning to cover possible variations in interaction and +manipulation. Throughout extensive experiments, we demonstrate that LAMA +outperforms previous approaches in synthesizing realistic motions in various +challenging scenarios. Project page: https://jiyewise.github.io/projects/LAMA/ . + +
+
+ comment: Accepted to ICCV 2023 +
+
+
+
+
+ + ♻ ☆ Learning a Consensus Sub-Network with Polarization Regularization and + One Pass Training + + +
+ The subject of green AI has been gaining attention within the deep learning
+community given the recent trend of ever larger and more complex neural network
+models. Existing solutions for reducing the computational load at training and
+inference time usually involve pruning the network parameters. Pruning schemes
+often create extra overhead either by iterative training and fine-tuning for
+static pruning or repeated computation of a dynamic pruning graph. We propose a
+new parameter pruning strategy for learning a lighter-weight sub-network that
+minimizes the energy cost while maintaining comparable performance to the fully
+parameterised network on given downstream tasks. Our proposed pruning scheme is
+green-oriented, as it requires only a one-off training pass to discover the
+optimal static sub-networks by dynamic pruning methods. The pruning scheme
+consists of a binary gating module and a novel loss function to uncover
+sub-networks with user-defined sparsity. Our method enables pruning and
+training simultaneously, which saves energy in both the training and inference
+phases and avoids extra computational overhead from gating modules at inference
+time. Our results on CIFAR-10 and CIFAR-100 suggest that our scheme can remove
+50% of connections in deep networks with less than 1% reduction in
+classification accuracy. Compared to other related pruning methods, our method
+demonstrates a lower drop in accuracy for equivalent reductions in
+computational cost.
+
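+ As a rough illustration of how a binary gating module can be trained in a
+single pass, the PyTorch sketch below attaches sigmoid gates to a linear layer
+and adds a simple polarization-style penalty that pushes gates toward 0 or 1
+while targeting a connection budget; the exact loss and architecture in the
+paper may differ, so treat this only as the general pattern.
+
+import torch
+import torch.nn as nn
+
+class GatedLinear(nn.Module):
+    """Linear layer whose output channels are masked by learnable gates."""
+    def __init__(self, d_in, d_out):
+        super().__init__()
+        self.fc = nn.Linear(d_in, d_out)
+        self.gate_logits = nn.Parameter(torch.zeros(d_out))
+
+    def gates(self):
+        return torch.sigmoid(self.gate_logits)      # soft gates in (0, 1)
+
+    def forward(self, x):
+        return self.fc(x) * self.gates()
+
+def polarization_penalty(g, budget=0.5):
+    # g*(1-g) is largest at 0.5, so minimizing it drives gates to 0 or 1;
+    # the second term keeps the fraction of open gates near the budget.
+    return (g * (1 - g)).mean() + (g.mean() - budget) ** 2
+
+layer = GatedLinear(64, 128)
+opt = torch.optim.Adam(layer.parameters(), lr=1e-2)
+x, y = torch.randn(256, 64), torch.randn(256, 128)  # synthetic data for illustration
+
+for _ in range(200):                                # one-off training pass
+    loss = nn.functional.mse_loss(layer(x), y)
+    loss = loss + 0.1 * polarization_penalty(layer.gates())
+    opt.zero_grad(); loss.backward(); opt.step()
+
+keep = layer.gates() > 0.5                          # static sub-network kept at inference
+print("fraction of channels kept:", keep.float().mean().item())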
+
+
+
+
+ + ♻ ☆ Revisiting the Encoding of Satellite Image Time Series + + +
+ Satellite Image Time Series (SITS) representation learning is complex due to
+high spatiotemporal resolutions, irregular acquisition times, and intricate
+spatiotemporal interactions. These challenges result in specialized neural
+network architectures tailored for SITS analysis. The field has witnessed
+promising results achieved by pioneering researchers, but transferring the
+latest advances or established paradigms from Computer Vision (CV) to SITS is
+still highly challenging due to the existing suboptimal representation learning
+framework. In this paper, we develop a novel perspective of SITS processing as
+a direct set prediction problem, inspired by the recent trend in adopting
+query-based transformer decoders to streamline the object detection or image
+segmentation pipeline. We further propose to decompose the representation
+learning process of SITS into three explicit steps: collect-update-distribute,
+which is computationally efficient and suited to irregularly sampled and
+asynchronous temporal satellite observations. Facilitated by the unique
+reformulation, our proposed temporal learning backbone of SITS, initially
+pre-trained on the resource-efficient pixel-set format and then fine-tuned on
+the downstream dense prediction tasks, has attained new state-of-the-art (SOTA)
+results on the PASTIS benchmark dataset. Specifically, the clear separation
+between temporal and spatial components in the semantic/panoptic segmentation
+pipeline of SITS lets us leverage the latest advances in CV, such as the
+universal image segmentation architecture, resulting in noticeable increases of
+2.5 points in mIoU and 8.8 points in PQ over the best scores reported so far.
+
+
+
+
+
+ + ♻ ☆ Automotive Object Detection via Learning Sparse Events by Spiking + Neurons + + +
+ Event-based sensors, distinguished by their high temporal resolution of +1$\mathrm{\mu s}$ and a dynamic range of 120$\mathrm{dB}$, stand out as ideal +tools for deployment in fast-paced settings like vehicles and drones. +Traditional object detection techniques that utilize Artificial Neural Networks +(ANNs) face challenges due to the sparse and asynchronous nature of the events +these sensors capture. In contrast, Spiking Neural Networks (SNNs) offer a +promising alternative, providing a temporal representation that is inherently +aligned with event-based data. This paper explores the unique membrane +potential dynamics of SNNs and their ability to modulate sparse events. We +introduce an innovative spike-triggered adaptive threshold mechanism designed +for stable training. Building on these insights, we present a specialized +spiking feature pyramid network (SpikeFPN) optimized for automotive event-based +object detection. Comprehensive evaluations demonstrate that SpikeFPN surpasses +both traditional SNNs and advanced ANNs enhanced with attention mechanisms. +Evidently, SpikeFPN achieves a mean Average Precision (mAP) of 0.477 on the +{GEN1 Automotive Detection (GAD)} benchmark dataset, marking a significant +increase of 9.7\% over the previous best SNN. Moreover, the efficient design of +SpikeFPN ensures robust performance while optimizing computational resources, +attributed to its innate sparse computation capabilities. + +
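+ The spike-triggered adaptive threshold can be pictured with a toy
+leaky-integrate-and-fire neuron whose firing threshold jumps after every spike
+and then decays back; the constants and the synthetic event stream below are
+made up for illustration and are not the SpikeFPN mechanism itself.
+
+import numpy as np
+
+def lif_adaptive(inputs, tau_mem=0.9, tau_thr=0.95, v_thr0=1.0, thr_jump=0.5):
+    """Leaky integrate-and-fire neuron with a spike-triggered adaptive threshold."""
+    v, thr, spikes = 0.0, v_thr0, []
+    for x in inputs:
+        v = tau_mem * v + x                              # leaky membrane integration
+        s = float(v >= thr)                              # spike when threshold is crossed
+        v = v * (1.0 - s)                                # hard reset on spike
+        thr = v_thr0 + tau_thr * (thr - v_thr0) + thr_jump * s   # threshold adaptation
+        spikes.append(s)
+    return np.array(spikes)
+
+rng = np.random.default_rng(1)
+# Sparse, asynchronous synthetic "events": mostly zeros with occasional bursts.
+events = (rng.random(200) < 0.1) * rng.uniform(0.5, 1.5, 200)
+print("firing rate:", lif_adaptive(events).mean())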
+
+
+
+
+ + ♻ ☆ MAELi: Masked Autoencoder for Large-Scale LiDAR Point Clouds + + +
+ The sensing process of large-scale LiDAR point clouds inevitably causes large +blind spots, i.e. regions not visible to the sensor. We demonstrate how these +inherent sampling properties can be effectively utilized for self-supervised +representation learning by designing a highly effective pre-training framework +that considerably reduces the need for tedious 3D annotations to train +state-of-the-art object detectors. Our Masked AutoEncoder for LiDAR point +clouds (MAELi) intuitively leverages the sparsity of LiDAR point clouds in both +the encoder and decoder during reconstruction. This results in more expressive +and useful initialization, which can be directly applied to downstream +perception tasks, such as 3D object detection or semantic segmentation for +autonomous driving. In a novel reconstruction approach, MAELi distinguishes +between empty and occluded space and employs a new masking strategy that +targets the LiDAR's inherent spherical projection. Thereby, without any ground +truth whatsoever and trained on single frames only, MAELi obtains an +understanding of the underlying 3D scene geometry and semantics. To demonstrate +the potential of MAELi, we pre-train backbones in an end-to-end manner and show +the effectiveness of our unsupervised pre-trained weights on the tasks of 3D +object detection and semantic segmentation. + +
+
+ comment: 16 pages +
+
+
+
+
+ + ♻ ☆ FedBEVT: Federated Learning Bird's Eye View Perception Transformer in + Road Traffic Systems + + +
+ Bird's eye view (BEV) perception is becoming increasingly important in the
+field of autonomous driving. It uses multi-view camera data to learn a
+transformer model that directly projects the perception of the road environment
+onto the BEV perspective. However, training a transformer model often requires
+a large amount of data, and as camera data for road traffic are often private,
+they are typically not shared. Federated learning offers a solution that
+enables clients to collaboratively train models by exchanging only model
+parameters rather than raw data. In this paper, we introduce FedBEVT, a
+federated transformer learning approach for BEV perception. In order to address
+two common data heterogeneity issues in FedBEVT: (i) diverse sensor poses, and
+(ii) varying sensor numbers in perception systems, we propose two approaches --
+Federated Learning with Camera-Attentive Personalization (FedCaP) and Adaptive
+Multi-Camera Masking (AMCM), respectively. To evaluate our method in real-world
+settings, we create a dataset consisting of four typical federated use cases.
+Our findings suggest that FedBEVT outperforms the baseline approaches in all
+four use cases, demonstrating the potential of our approach for improving BEV
+perception in autonomous driving.
+
+
+ comment: Accepted by IEEE T-IV. Code: https://github.com/rruisong/FedBEVT +
+
+
+
+
+ + ♻ ☆ NeTO:Neural Reconstruction of Transparent Objects with Self-Occlusion + Aware Refraction-Tracing + + +
+ We present a novel method, called NeTO, for capturing 3D geometry of solid
+transparent objects from 2D images via volume rendering. Reconstructing
+transparent objects is a very challenging task, which is ill-suited for
+general-purpose reconstruction techniques due to the specular light transport
+phenomena. Although existing refraction-tracing based methods, designed
+specially for this task, achieve impressive results, they still suffer from
+unstable optimization and loss of fine details, since the explicit surface
+representation they adopt is difficult to optimize, and the self-occlusion
+problem is ignored during refraction tracing. In this paper, we propose to
+leverage implicit Signed Distance Function (SDF) as surface representation, and
+optimize the SDF field via volume rendering with a self-occlusion aware
+refractive ray tracing. The implicit representation enables our method to
+reconstruct high-quality geometry even from a limited set of images, and the
+self-occlusion aware strategy makes it possible for our method to accurately
+reconstruct the self-occluded regions. Experiments show that our method
+achieves faithful reconstruction results and outperforms prior works by a large
+margin. Visit our project page at https://www.xxlong.site/NeTO/
+
+
+
+
+
+ + ♻ ☆ A Robust Negative Learning Approach to Partial Domain Adaptation Using + Source Prototypes + + +
+ This work proposes a robust Partial Domain Adaptation (PDA) framework that +mitigates the negative transfer problem by incorporating a robust +target-supervision strategy. It leverages ensemble learning and includes +diverse, complementary label feedback, alleviating the effect of incorrect +feedback and promoting pseudo-label refinement. Rather than relying exclusively +on first-order moments for distribution alignment, our approach offers explicit +objectives to optimize intra-class compactness and inter-class separation with +the inferred source prototypes and highly-confident target samples in a +domain-invariant fashion. Notably, we ensure source data privacy by eliminating +the need to access the source data during the adaptation phase through a priori +inference of source prototypes. We conducted a series of comprehensive +experiments, including an ablation analysis, covering a range of partial domain +adaptation tasks. Comprehensive evaluations on benchmark datasets corroborate +our framework's enhanced robustness and generalization, demonstrating its +superiority over existing state-of-the-art PDA approaches. + +
+
+
+
+
+ + ♻ ☆ Heterogeneous Federated Learning: State-of-the-art and Research + Challenges + + +
+ Federated learning (FL) has drawn increasing attention owing to its potential +use in large-scale industrial applications. Existing federated learning works +mainly focus on model homogeneous settings. However, practical federated +learning typically faces the heterogeneity of data distributions, model +architectures, network environments, and hardware devices among participant +clients. Heterogeneous Federated Learning (HFL) is much more challenging, and +corresponding solutions are diverse and complex. Therefore, a systematic survey +on this topic about the research challenges and state-of-the-art is essential. +In this survey, we firstly summarize the various research challenges in HFL +from five aspects: statistical heterogeneity, model heterogeneity, +communication heterogeneity, device heterogeneity, and additional challenges. +In addition, recent advances in HFL are reviewed and a new taxonomy of existing +HFL methods is proposed with an in-depth analysis of their pros and cons. We +classify existing methods from three different levels according to the HFL +procedure: data-level, model-level, and server-level. Finally, several critical +and promising future research directions in HFL are discussed, which may +facilitate further developments in this field. A periodically updated +collection on HFL is available at https://github.com/marswhu/HFL_Survey. + +
+
+ comment: 42 pages, 11 figures, and 4 tables +
+
+
+
+
+ + ♻ ☆ High Frequency, High Accuracy Pointing onboard Nanosats using + Neuromorphic Event Sensing and Piezoelectric Actuation + + +
+ As satellites become smaller, the ability to maintain stable pointing +decreases as external forces acting on the satellite come into play. At the +same time, reaction wheels used in the attitude determination and control +system (ADCS) introduce high frequency jitter which can disrupt pointing +stability. For space domain awareness (SDA) tasks that track objects tens of +thousands of kilometres away, the pointing accuracy offered by current +nanosats, typically in the range of 10 to 100 arcseconds, is not sufficient. In +this work, we develop a novel payload that utilises a neuromorphic event sensor +(for high frequency and highly accurate relative attitude estimation) paired in +a closed loop with a piezoelectric stage (for active attitude corrections) to +provide highly stable sensor-specific pointing. Event sensors are especially +suited for space applications due to their desirable characteristics of low +power consumption, asynchronous operation, and high dynamic range. We use the +event sensor to first estimate a reference background star field from which +instantaneous relative attitude is estimated at high frequency. The +piezoelectric stage works in a closed control loop with the event sensor to +perform attitude corrections based on the discrepancy between the current and +desired attitude. Results in a controlled setting show that we can achieve a +pointing accuracy in the range of 1-5 arcseconds using our novel payload at an +operating frequency of up to 50Hz using a prototype built from +commercial-off-the-shelf components. Further details can be found at +https://ylatif.github.io/ultrafinestabilisation + +
+
+
+
+
+ + ♻ ☆ Representation Uncertainty in Self-Supervised Learning as Variational + Inference ICCV 2023 + + +
+ In this study, a novel self-supervised learning (SSL) method is proposed,
+which considers SSL in terms of variational inference to learn not only
+representation but also representation uncertainties. SSL is a method of
+learning representations without labels by maximizing the similarity between
+image representations of different augmented views of an image. Meanwhile,
+variational autoencoder (VAE) is an unsupervised representation learning method
+that trains a probabilistic generative model with variational inference. Both
+VAE and SSL can learn representations without labels, but their relationship
+has not been investigated in the past. Herein, the theoretical relationship
+between SSL and variational inference has been clarified. Furthermore, a novel
+method, namely variational inference SimSiam (VI-SimSiam), has been proposed.
+VI-SimSiam can predict the representation uncertainty by interpreting SimSiam
+with variational inference and defining the latent space distribution. The
+present experiments qualitatively show that VI-SimSiam can learn uncertainty,
+as illustrated by comparing input images and predicted uncertainties.
+Additionally, we describe a relationship between the estimated uncertainty and
+classification accuracy.
+
+
+ comment: Accepted to ICCV 2023 +
+
+
+
+
+ + ♻ ☆ COLA: A Benchmark for Compositional Text-to-image Retrieval + + +
+ Compositional reasoning is a hallmark of human visual intelligence; yet +despite the size of large vision-language models, they struggle to represent +simple compositions by combining objects with their attributes. To measure this +lack of compositional capability, we design Cola, a text-to-image retrieval +benchmark to Compose Objects Localized with Attributes. To solve Cola, a model +must retrieve images with the correct configuration of attributes and objects, +and avoid choosing a distractor image with the same objects and attributes but +in the wrong configuration. Cola contains about 1.2k composed queries of 168 +objects and 197 attributes on around 30K images. Our human evaluation finds +that Cola is 83.33% accurate, similar to contemporary compositionality +benchmarks. Using Cola as a testbed, we explore empirical modeling designs to +adapt pre-trained vision-language models to reason compositionally. We explore +6 adaptation strategies on 2 seminal vision-language models, using +compositionality-centric test benchmarks - Cola and CREPE. We find the optimal +adaptation strategy is to train a multimodal attention layer that jointly +attends over the frozen pre-trained image and language features. Surprisingly, +training multimodal layers on CLIP performs better than tuning a larger FLAVA +model with already pre-trained multimodal layers. Furthermore, our adaptation +strategy improves CLIP and FLAVA to comparable levels, suggesting that training +multimodal layers using contrastive attribute-object data is key, as opposed to +using them pre-trained. Lastly, we show that Cola is harder than a closely +related contemporary benchmark, CREPE, since simpler fine-tuning strategies +without multimodal layers suffice on CREPE, but not on Cola. However, we still +see a significant gap between our best adaptation and human accuracy, +suggesting considerable room for further research. + +
+
+ comment: Under review. Webpage: https://github.com/arijitray1993/COLA +
+
+
+
+
+ + ♻ ☆ UNesT: Local Spatial Representation Learning with Hierarchical + Transformer for Efficient Medical Segmentation + + +
+ Transformer-based models, capable of learning better global dependencies,
+have recently demonstrated exceptional representation learning capabilities in
+computer vision and medical image analysis. Transformer reformats the image
+into separate patches and realizes global communication via the self-attention
+mechanism. However, positional information between patches is hard to preserve
+in such 1D sequences, and loss of it can lead to sub-optimal performance when
+dealing with large amounts of heterogeneous tissues of various sizes in 3D
+medical image segmentation. Additionally, current methods are not robust and
+efficient for heavy-duty medical segmentation tasks such as predicting a large
+number of tissue classes or modeling globally inter-connected tissue
+structures. To address such challenges and inspired by the nested hierarchical
+structures in vision transformer, we proposed a novel 3D medical image
+segmentation method (UNesT), employing a simplified and faster-converging
+transformer encoder design that achieves local communication among spatially
+adjacent patch sequences by aggregating them hierarchically. We extensively
+validate our method on multiple challenging datasets, consisting of multiple
+modalities, anatomies, and a wide range of tissue classes, including 133
+structures in the brain, 14 organs in the abdomen, 4 hierarchical components in
+the kidneys, inter-connected kidney tumors and brain tumors. We show that UNesT
+consistently achieves state-of-the-art performance and evaluate its
+generalizability and data efficiency. In particular, the model performs the
+whole-brain segmentation task, covering the complete ROI of 133 tissue classes,
+in a single network, outperforming the prior state-of-the-art method SLANT27,
+which ensembles 27 networks.
+
+
+ comment: 19 pages, 17 figures. arXiv admin note: text overlap with + arXiv:2203.02430 +
+
+
+
+
+ + ♻ ☆ Label-efficient Contrastive Learning-based model for nuclei detection + and classification in 3D Cardiovascular Immunofluorescent Images MICCAI + + +
+ Recently, deep learning-based methods achieved promising performance in
+nuclei detection and classification applications. However, training deep
+learning-based methods requires a large amount of pixel-wise annotated data,
+which is time-consuming and labor-intensive, especially in 3D images. An
+alternative approach is to adapt weak-annotation methods, such as labeling each
+nucleus with a point, but this method does not extend from 2D histopathology
+images (for which it was originally developed) to 3D immunofluorescent images.
+The reason is that 3D images contain multiple channels (z-axis) for nuclei and
+different markers separately, which makes training using point annotations
+difficult. To address this challenge, we propose the Label-efficient
+Contrastive learning-based (LECL) model to detect and classify various types of
+nuclei in 3D immunofluorescent images. Previous methods use Maximum Intensity
+Projection (MIP) to convert immunofluorescent images with multiple slices to 2D
+images, which can cause signals from different z-stacks to falsely appear
+associated with each other. To overcome this, we devised an Extended Maximum
+Intensity Projection (EMIP) approach that addresses the issues of using MIP.
+Furthermore, we performed a Supervised Contrastive Learning (SCL) approach for
+weakly supervised settings. We conducted experiments on cardiovascular datasets
+and found that our proposed framework is effective and efficient in detecting
+and classifying various types of nuclei in 3D immunofluorescent images.
+
+
+ comment: 11 pages, 5 figures, MICCAI Workshop Conference 2023 +
+
+
+
+
+ + ♻ ☆ BEVTrack: A Simple Baseline for 3D Single Object Tracking in + Birds's-Eye-View + + +
+ 3D single object tracking (SOT) in point clouds is still a challenging +problem due to appearance variation, distractors, and high sparsity of point +clouds. Notably, in autonomous driving scenarios, the target object typically +maintains spatial adjacency across consecutive frames, predominantly moving +horizontally. This spatial continuity offers valuable prior knowledge for +target localization. However, existing trackers, which often employ point-wise +representations, struggle to efficiently utilize this knowledge owing to the +irregular format of such representations. Consequently, they require elaborate +designs and solving multiple subtasks to establish spatial correspondence. In +this paper, we introduce BEVTrack, a simple yet strong baseline framework for +3D SOT. After converting consecutive point clouds into the common +Bird's-Eye-View representation, BEVTrack inherently encodes spatial proximity +and adeptly captures motion cues for tracking via a simple element-wise +operation and convolutional layers. Additionally, to better deal with objects +having diverse sizes and moving patterns, BEVTrack directly learns the +underlying motion distribution rather than making a fixed Laplacian or Gaussian +assumption as in previous works. Without bells and whistles, BEVTrack achieves +state-of-the-art performance on KITTI and NuScenes datasets while maintaining a +high inference speed of 122 FPS. The code will be released at +https://github.com/xmm-prio/BEVTrack. + +
+
+ comment: Technical report. Work in progress. The code will be released at + https://github.com/xmm-prio/BEVTrack +
+
+
+
+
+ + ♻ ☆ Aerial Diffusion: Text Guided Ground-to-Aerial View Translation from a + Single Image using Diffusion Models + + +
+ We present a novel method, Aerial Diffusion, for generating aerial views from
+a single ground-view image using text guidance. Aerial Diffusion leverages a
+pretrained text-image diffusion model for prior knowledge. We address two main
+challenges: the domain gap between the ground view and the aerial view, and the
+fact that the two views lie far apart in the text-image embedding manifold. Our
+approach uses a homography inspired by inverse perspective mapping prior to
+finetuning the pretrained diffusion model. Additionally, using the text
+corresponding to the ground view to finetune the model helps us capture the
+details in the ground-view image with a relatively low bias towards the
+ground-view image. Aerial Diffusion uses an alternating sampling strategy to
+compute the optimal solution on the complex high-dimensional manifold and
+generate a high-fidelity (w.r.t. ground view) aerial image. We demonstrate the
+quality and versatility of Aerial Diffusion on a plethora of images from
+various domains including nature, human actions, indoor scenes, etc. We
+qualitatively demonstrate the effectiveness of our method with extensive
+ablations and comparisons. To the best of our knowledge, Aerial Diffusion is
+the first approach that performs ground-to-aerial translation in an
+unsupervised manner.
+
+
+ comment: Code: https://github.com/divyakraman/AerialDiffusion +
+
+
+
+
+
+
+
+ + Information Retrieval 9 + +
+
+
+ + ☆ Provider Fairness and Beyond-Accuracy Trade-offs in Recommender Systems RecSys 2023 + + +
+ Recommender systems, while transformative in online user experiences, have +raised concerns over potential provider-side fairness issues. These systems may +inadvertently favor popular items, thereby marginalizing less popular ones and +compromising provider fairness. While previous research has recognized +provider-side fairness issues, the investigation into how these biases affect +beyond-accuracy aspects of recommendation systems - such as diversity, novelty, +coverage, and serendipity - has been less emphasized. In this paper, we address +this gap by introducing a simple yet effective post-processing re-ranking model +that prioritizes provider fairness, while simultaneously maintaining user +relevance and recommendation quality. We then conduct an in-depth evaluation of +the model's impact on various aspects of recommendation quality across multiple +datasets. Specifically, we apply the post-processing algorithm to four distinct +recommendation models across four varied domain datasets, assessing the +improvement in each metric, encompassing both accuracy and beyond-accuracy +aspects. This comprehensive analysis allows us to gauge the effectiveness of +our approach in mitigating provider biases. Our findings underscore the +effectiveness of the adopted method in improving provider fairness and +recommendation quality. They also provide valuable insights into the trade-offs +involved in achieving fairness in recommender systems, contributing to a more +nuanced understanding of this complex issue. + +
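+ The abstract does not spell out the re-ranking model, but a minimal greedy
+post-processing re-ranker of this kind can be sketched as follows: item scores
+are discounted by how much exposure their provider has already received in the
+list. The scoring rule, the penalty weight alpha, and the synthetic data are
+assumptions for illustration only.
+
+import numpy as np
+
+def fair_rerank(scores, providers, k=10, alpha=0.3):
+    """Greedily build a top-k list, trading relevance against provider exposure."""
+    exposure = {}                                   # provider -> slots already used
+    chosen, remaining = [], list(np.argsort(scores)[::-1])
+    for _ in range(k):
+        best, best_val = None, -np.inf
+        for i in remaining:
+            val = scores[i] - alpha * exposure.get(providers[i], 0)
+            if val > best_val:
+                best, best_val = i, val
+        chosen.append(best)
+        remaining.remove(best)
+        exposure[providers[best]] = exposure.get(providers[best], 0) + 1
+    return chosen
+
+rng = np.random.default_rng(0)
+scores = rng.random(50)                             # synthetic relevance scores
+providers = rng.integers(0, 5, 50)                  # five providers of varying popularity
+print(fair_rerank(scores, providers))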
+
+ comment: FAccTRec at RecSys 2023 +
+
+
+
+
+ + ☆ Offline Recommender System Evaluation under Unobserved Confounding RecSys '23 + + +
+ Off-Policy Estimation (OPE) methods allow us to learn and evaluate
+decision-making policies from logged data. This makes them an attractive choice
+for the offline evaluation of recommender systems, and several recent works
+have reported successful adoption of OPE methods to this end. An important
+assumption required for these methods to work is the absence of unobserved
+confounders: random variables that influence both actions and rewards at data
+collection time. Because the data collection policy is typically under the
+practitioner's control, the unconfoundedness assumption is often left implicit,
+and its violations are rarely dealt with in the existing literature.
+ This work aims to highlight the problems that arise when performing
+off-policy estimation in the presence of unobserved confounders, specifically
+focusing on a recommendation use-case. We focus on policy-based estimators,
+where the logging propensities are learned from logged data. We characterise
+the statistical bias that arises due to confounding, and show how existing
+diagnostics are unable to uncover such cases. Because the bias depends directly
+on the true and unobserved logging propensities, it is non-identifiable. As the
+unconfoundedness assumption is famously untestable, this becomes especially
+problematic. This paper emphasises this common, yet often overlooked issue.
+Through synthetic data, we empirically show how naïve propensity estimation
+under confounding can lead to severely biased metric estimates that are allowed
+to fly under the radar. We aim to cultivate an awareness among researchers and
+practitioners of this important problem, and touch upon potential research
+directions towards mitigating its effects.
+
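+ The core failure mode can be reproduced in a few lines: when the logging
+policy and the reward both depend on a hidden variable, inverse propensity
+scoring with propensities learned from the observed context alone converges to
+the wrong value. The synthetic probabilities below are arbitrary choices for
+illustration, not the paper's experimental setup.
+
+import numpy as np
+
+rng = np.random.default_rng(0)
+n = 200_000
+
+u = rng.binomial(1, 0.5, n)                      # unobserved confounder
+x = rng.binomial(1, 0.5, n)                      # observed context
+p_log = 0.2 + 0.3 * x + 0.4 * u                  # logging policy depends on x and u
+a = rng.binomial(1, p_log)                       # logged action (1 = recommend)
+r = rng.binomial(1, 0.1 + 0.2 * a + 0.5 * u)     # reward also depends on u
+
+# Target policy: always recommend (a = 1); its true expected reward:
+true_value = (0.1 + 0.2 + 0.5 * u).mean()
+
+# Propensities "learned" from logged data using only the observed context x:
+p_hat = np.array([a[x == v].mean() for v in (0, 1)])[x]
+
+ips = np.mean((a == 1) / p_hat * r)
+print(f"true value ~ {true_value:.3f}, confounded IPS estimate ~ {ips:.3f}")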
+
+ comment: Accepted at the CONSEQUENCES'23 workshop at RecSys '23 +
+
+
+
+
+ + ☆ Receiving an algorithmic recommendation based on documentary filmmaking + techniques + + +
+ This article analyzes the reception of a novel algorithmic recommendation of
+documentary films by a panel of moviegoers of the Tënk platform. In order to
+propose an alternative to recommendations based on a thematic classification,
+the director or the production period, a set of metadata has been elaborated
+within the framework of this experimentation in order to characterize the great
+variety of "documentary filmmaking dispositifs". The goal is to investigate the
+different ways in which the platform's film lovers appropriate a personalized
+recommendation of 4 documentaries with identical or similar filmmaking
+dispositifs. To conclude, the contributions and limits of this proof of concept
+are discussed in order to sketch out avenues of reflection for improving the
+instrumented mediation of documentary films.
+
+
+ comment: in French language +
+
+
+
+
+ + ☆ A Long-Tail Friendly Representation Framework for Artist and Music + Similarity + + +
+ The investigation of the similarity between artists and music is crucial in +music retrieval and recommendation, and addressing the challenge of the +long-tail phenomenon is increasingly important. This paper proposes a Long-Tail +Friendly Representation Framework (LTFRF) that utilizes neural networks to +model the similarity relationship. Our approach integrates music, user, +metadata, and relationship data into a unified metric learning framework, and +employs a meta-consistency relationship as a regular term to introduce the +Multi-Relationship Loss. Compared to the Graph Neural Network (GNN), our +proposed framework improves the representation performance in long-tail +scenarios, which are characterized by sparse relationships between artists and +music. We conduct experiments and analysis on the AllMusic dataset, and the +results demonstrate that our framework provides a favorable generalization of +artist and music representation. Specifically, on similar artist/music +recommendation tasks, the LTFRF outperforms the baseline by 9.69%/19.42% in Hit +Ratio@10, and in long-tail cases, the framework achieves 11.05%/14.14% higher +than the baseline in Consistent@10. + +
+
+
+
+
+ + ☆ PRISTA-Net: Deep Iterative Shrinkage Thresholding Network for Coded + Diffraction Patterns Phase Retrieval + + +
+ The problem of phase retrieval (PR) involves recovering an unknown image from
+limited amplitude measurement data and is a challenging nonlinear inverse
+problem in computational imaging and image processing. However, many PR methods
+are based either on black-box network models that lack interpretability or on
+plug-and-play (PnP) frameworks that are computationally complex and require
+careful parameter tuning. To address this, we have developed PRISTA-Net, a deep
+unfolding network (DUN) based on the first-order iterative shrinkage
+thresholding algorithm (ISTA). This network utilizes a learnable nonlinear
+transformation to address the proximal-point mapping sub-problem associated
+with the sparse priors, and an attention mechanism to focus on phase
+information containing image edges, textures, and structures. Additionally, the
+fast Fourier transform (FFT) is used to learn global features to enhance local
+information, and the designed logarithmic-based loss function leads to
+significant improvements when the noise level is low. All parameters in the
+proposed PRISTA-Net framework, including the nonlinear transformation,
+threshold parameters, and step size, are learned end-to-end instead of being
+manually set. This method combines the interpretability of traditional methods
+with the fast inference ability of deep learning and is able to handle noise at
+each iteration during the unfolding stage, thus improving recovery quality.
+Experiments on Coded Diffraction Patterns (CDPs) measurements demonstrate that
+our approach outperforms the existing state-of-the-art methods in terms of
+qualitative and quantitative evaluations. Our source codes are available at
+https://github.com/liuaxou/PRISTA-Net.
+
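+ The iteration that PRISTA-Net unfolds is the classical ISTA update, shown
+below for a generic sparse linear inverse problem; the learnable transforms,
+attention, and coded-diffraction measurement operators described above are
+omitted, and the problem sizes are arbitrary.
+
+import numpy as np
+
+def soft_threshold(x, lam):
+    return np.sign(x) * np.maximum(np.abs(x) - lam, 0.0)
+
+def ista(A, y, lam=0.1, n_iter=200):
+    """Classical ISTA for min_x 0.5*||Ax - y||^2 + lam*||x||_1."""
+    L = np.linalg.norm(A, 2) ** 2                 # Lipschitz constant of the gradient
+    x = np.zeros(A.shape[1])
+    for _ in range(n_iter):
+        grad = A.T @ (A @ x - y)                  # gradient of the data-fidelity term
+        x = soft_threshold(x - grad / L, lam / L) # proximal step for the L1 prior
+    return x
+
+# Synthetic sparse-recovery problem for illustration.
+rng = np.random.default_rng(0)
+A = rng.standard_normal((80, 200))
+x_true = np.zeros(200)
+x_true[rng.choice(200, 10, replace=False)] = rng.standard_normal(10)
+y = A @ x_true
+print("nonzeros recovered:", int(np.sum(np.abs(ista(A, y)) > 0.1)))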
+
+ comment: 12 pages +
+
+
+
+
+ + ☆ tSPM+; a high-performance algorithm for mining transitive sequential + patterns from clinical data + + +
+ The increasing availability of large clinical datasets collected from
+patients can enable new avenues for computational characterization of complex
+diseases using different analytic algorithms. One of the promising new methods
+for extracting knowledge from large clinical datasets involves temporal pattern
+mining integrated with machine learning workflows. However, mining these
+temporal patterns is a computationally intensive task and has memory
+repercussions. Current algorithms, such as the temporal sequence pattern mining
+(tSPM) algorithm, are already providing promising outcomes, but still leave
+room for optimization. In this paper, we present the tSPM+ algorithm, a
+high-performance implementation of the tSPM algorithm, which adds a new
+dimension by adding the duration to the temporal patterns. We show that the
+tSPM+ algorithm provides a speed-up of up to a factor of 980 and up to a
+48-fold improvement in memory consumption. Moreover, we present a Docker
+container with an R package, along with vignettes for easy integration into
+existing machine learning workflows, and use the mined temporal sequences to
+identify post-COVID-19 patients and their symptoms according to the WHO
+definition.
+
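+ The flavour of the mined patterns can be conveyed with a tiny Python sketch
+that enumerates ordered concept pairs from one patient's timeline and keeps the
+duration between the two events (the extra dimension mentioned above); the
+concept codes and the rule of keeping the first occurrence of each pair are
+illustrative assumptions, and the real tSPM+ implementation is a far more
+efficient compiled library.
+
+from datetime import date
+
+# One made-up patient timeline: (date, clinical concept code).
+timeline = [
+    (date(2022, 1, 3), "COVID19"),
+    (date(2022, 2, 1), "FATIGUE"),
+    (date(2022, 3, 15), "DYSPNEA"),
+    (date(2022, 4, 2), "FATIGUE"),
+]
+
+# Mine every ordered (earlier -> later) concept pair, keeping the duration in
+# days of its first occurrence.
+patterns = {}
+events = sorted(timeline)
+for i, (t1, c1) in enumerate(events):
+    for t2, c2 in events[i + 1:]:
+        if c1 != c2 and (c1, c2) not in patterns:
+            patterns[(c1, c2)] = (t2 - t1).days
+
+for (c1, c2), dur in patterns.items():
+    print(f"{c1} -> {c2}: {dur} days")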
+
+ comment: Supplementary data: https://doi.org/10.5281/zenodo.8329519 +
+
+
+
+
+ + ☆ Modeling Recommender Ecosystems: Research Challenges at the Intersection + of Mechanism Design, Reinforcement Learning and Generative Models + + +
+ Modern recommender systems lie at the heart of complex ecosystems that couple
+the behavior of users, content providers, advertisers, and other actors.
+Despite this, the focus of the majority of recommender research -- and most
+practical recommenders of any import -- is on the local, myopic optimization of
+the recommendations made to individual users. This comes at a significant cost
+to the long-term utility that recommenders could generate for their users. We
+argue that explicitly modeling the incentives and behaviors of all actors in
+the system -- and the interactions among them induced by the recommender's
+policy -- is strictly necessary if one is to maximize the value the system
+brings to these actors and improve overall ecosystem "health". Doing so
+requires: optimization over long horizons using techniques such as
+reinforcement learning; making inevitable tradeoffs in the utility that can be
+generated for different actors using the methods of social choice; reducing
+information asymmetry, while accounting for incentives and strategic behavior,
+using the tools of mechanism design; better modeling of both user and
+item-provider behaviors by incorporating notions from behavioral economics and
+psychology; and exploiting recent advances in generative and foundation models
+to make these mechanisms interpretable and actionable. We propose a conceptual
+framework that encompasses these elements, and articulate a number of research
+challenges that emerge at the intersection of these different disciplines.
+
+
+
+
+
+ + ♻ ☆ STIXnet: A Novel and Modular Solution for Extracting All STIX Objects in + CTI Reports + + +
+ The automatic extraction of information from Cyber Threat Intelligence (CTI)
+reports is crucial in risk management. The increased frequency of the
+publications of these reports has led researchers to develop new systems for
+automatically recovering different types of entities and relations from textual
+data. Most state-of-the-art models leverage Natural Language Processing (NLP)
+techniques, which perform well at extracting a few types of entities at a time
+but cannot detect heterogeneous data or their relations. Furthermore, several
+paradigms, such as STIX, have become de facto standards in the CTI community
+and dictate a formal categorization of different entities and relations to
+enable organizations to share data consistently. This paper presents STIXnet,
+the first solution for the automated extraction of all STIX entities and
+relationships in CTI reports. Through the use of NLP techniques and an
+interactive Knowledge Base (KB) of entities, our approach obtains F1 scores
+comparable to state-of-the-art models for entity extraction (0.916) and
+relation extraction (0.724) while considering significantly more types of
+entities and relations. Moreover, STIXnet constitutes a modular and extensible
+framework that manages and coordinates different modules to merge their
+contributions uniquely and exhaustively. With our approach, researchers and
+organizations can extend their Information Extraction (IE) capabilities by
+integrating the efforts of several techniques without needing to develop new
+tools from scratch.
+
+
+ comment: 11 pages, 3 figures +
+
+
+
+
+ + ♻ ☆ Learning Compact Compositional Embeddings via Regularized Pruning for + Recommendation ICDM'23 + + +
+ Latent factor models are the dominant backbones of contemporary recommender +systems (RSs) given their performance advantages, where a unique vector +embedding with a fixed dimensionality (e.g., 128) is required to represent each +entity (commonly a user/item). Due to the large number of users and items on +e-commerce sites, the embedding table is arguably the least memory-efficient +component of RSs. For any lightweight recommender that aims to efficiently +scale with the growing size of users/items or to remain applicable in +resource-constrained settings, existing solutions either reduce the number of +embeddings needed via hashing, or sparsify the full embedding table to switch +off selected embedding dimensions. However, as hash collision arises or +embeddings become overly sparse, especially when adapting to a tighter memory +budget, those lightweight recommenders inevitably have to compromise their +accuracy. To this end, we propose a novel compact embedding framework for RSs, +namely Compositional Embedding with Regularized Pruning (CERP). Specifically, +CERP represents each entity by combining a pair of embeddings from two +independent, substantially smaller meta-embedding tables, which are then +jointly pruned via a learnable element-wise threshold. In addition, we +innovatively design a regularized pruning mechanism in CERP, such that the two +sparsified meta-embedding tables are encouraged to encode information that is +mutually complementary. Given the compatibility with agnostic latent factor +models, we pair CERP with two popular recommendation models for extensive +experiments, where results on two real-world datasets under different memory +budgets demonstrate its superiority against state-of-the-art baselines. The +codebase of CERP is available in https://github.com/xurong-liang/CERP. + +
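+ A minimal NumPy sketch of the compositional part is given below: each entity
+id is split into a quotient and a remainder that index two small meta-embedding
+tables, and the composed vector is pruned element-wise by a threshold. The sum
+composition, table sizes, and fixed threshold are assumptions for illustration;
+the paper's learnable thresholds and the regularizer that makes the two tables
+complementary are not reproduced here.
+
+import numpy as np
+
+rng = np.random.default_rng(0)
+n_entities, d = 100_000, 32
+b = 400                                   # rows per meta-table (b * b >= n_entities)
+
+Q = rng.normal(0.0, 0.1, (b, d))          # first meta-embedding table
+R = rng.normal(0.0, 0.1, (b, d))          # second meta-embedding table
+thr = np.full(d, 0.05)                    # element-wise pruning threshold
+
+def embed(entity_ids):
+    q = Q[entity_ids // b]                # quotient lookup
+    r = R[entity_ids % b]                 # remainder lookup
+    e = q + r                             # compose the two halves
+    return np.where(np.abs(e) > thr, e, 0.0)
+
+ids = np.array([3, 42, 99_999])
+emb = embed(ids)
+# Two 400-row tables replace a 100,000-row table; the threshold sparsifies them.
+print(emb.shape, float((emb == 0).mean()))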
+
+ comment: Accepted by ICDM'23 +
+
+
+
+
+
+
+
+ + Machine Learning 75 + +
+
+
+ + ☆ On the Actionability of Outcome Prediction + + +
+ Predicting future outcomes is a prevalent application of machine learning in
+social impact domains. Examples range from predicting student success in
+education to predicting disease risk in healthcare. Practitioners recognize
+that the ultimate goal is not just to predict but to act effectively.
+Increasing evidence suggests that relying on outcome predictions for downstream
+interventions may not have the desired results.
+ In most domains there exists a multitude of possible interventions for each
+individual, making the challenge of taking effective action more acute. Even
+when the causal mechanisms connecting the individual's latent states to
+outcomes are well understood, in any given instance (a specific student or
+patient), practitioners still need to infer -- from budgeted measurements of
+latent states -- which of many possible interventions will be most effective
+for this individual. With this in mind, we ask: when are accurate predictors of
+outcomes helpful for identifying the most suitable intervention?
+ Through a simple model encompassing actions, latent states, and measurements,
+we demonstrate that pure outcome prediction rarely results in the most
+effective policy for taking actions, even when combined with other
+measurements. We find that except in cases where there is a single decisive
+action for improving the outcome, outcome prediction never maximizes "action
+value", the utility of taking actions. Making measurements of actionable latent
+states, where specific actions lead to desired outcomes, considerably enhances
+the action value compared to outcome prediction, and the degree of improvement
+depends on action costs and the outcome model. This analysis emphasizes the
+need to go beyond generic outcome prediction in interventional settings by
+incorporating knowledge of plausible actions and latent states.
+
+
+ comment: 14 pages, 3 figures +
+
+
+
+
+ + ☆ Measuring and Improving Chain-of-Thought Reasoning in Vision-Language + Models + + +
+ Vision-language models (VLMs) have recently demonstrated strong efficacy as +visual assistants that can parse natural queries about the visual content and +generate human-like outputs. In this work, we explore the ability of these +models to demonstrate human-like reasoning based on the perceived information. +To address a crucial concern regarding the extent to which their reasoning +capabilities are fully consistent and grounded, we also measure the reasoning +consistency of these models. We achieve this by proposing a chain-of-thought +(CoT) based consistency measure. However, such an evaluation requires a +benchmark that encompasses both high-level inference and detailed reasoning +chains, which is costly. We tackle this challenge by proposing a +LLM-Human-in-the-Loop pipeline, which notably reduces cost while simultaneously +ensuring the generation of a high-quality dataset. Based on this pipeline and +the existing coarse-grained annotated dataset, we build the CURE benchmark to +measure both the zero-shot reasoning performance and consistency of VLMs. We +evaluate existing state-of-the-art VLMs, and find that even the best-performing +model is unable to demonstrate strong visual reasoning capabilities and +consistency, indicating that substantial efforts are required to enable VLMs to +perform visual reasoning as systematically and consistently as humans. As an +early step, we propose a two-stage training framework aimed at improving both +the reasoning performance and consistency of VLMs. The first stage involves +employing supervised fine-tuning of VLMs using step-by-step reasoning samples +automatically generated by LLMs. In the second stage, we further augment the +training process by incorporating feedback provided by LLMs to produce +reasoning chains that are highly consistent and grounded. We empirically +highlight the effectiveness of our framework in both reasoning performance and +consistency. + +
+
+ comment: The data is released at + \url{https://github.com/Yangyi-Chen/CoTConsistency} +
+
+
+
+
+ + ☆ Subwords as Skills: Tokenization for Sparse-Reward Reinforcement + Learning + + +
+ Exploration in sparse-reward reinforcement learning is difficult due to the +requirement of long, coordinated sequences of actions in order to achieve any +reward. Moreover, in continuous action spaces there are an infinite number of +possible actions, which only increases the difficulty of exploration. One class +of methods designed to address these issues forms temporally extended actions, +often called skills, from interaction data collected in the same domain, and +optimizes a policy on top of this new action space. Typically such methods +require a lengthy pretraining phase, especially in continuous action spaces, in +order to form the skills before reinforcement learning can begin. Given prior +evidence that the full range of the continuous action space is not required in +such tasks, we propose a novel approach to skill-generation with two +components. First we discretize the action space through clustering, and second +we leverage a tokenization technique borrowed from natural language processing +to generate temporally extended actions. Such a method outperforms baselines +for skill-generation in several challenging sparse-reward domains, and requires +orders-of-magnitude less computation in skill-generation and online rollouts. + +
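+ A toy version of the two components reads as follows: continuous actions are
+first discretized into tokens by clustering, and frequent adjacent token pairs
+are then merged, BPE-style, into new tokens that act as temporally extended
+skills. The cluster count, number of merges, and random action data are
+assumptions for illustration only.
+
+import numpy as np
+from collections import Counter
+from sklearn.cluster import KMeans
+
+rng = np.random.default_rng(0)
+actions = rng.standard_normal((5000, 2))          # synthetic continuous 2-D actions
+
+# Step 1: discretize the action space by clustering.
+km = KMeans(n_clusters=8, n_init=10, random_state=0).fit(actions)
+tokens = list(km.predict(actions))                # one discrete token per timestep
+
+# Step 2: BPE-style merges, repeatedly fusing the most frequent adjacent pair
+# into a new token (a temporally extended action, i.e. a skill).
+merges = []
+for new_token in range(8, 18):                    # learn 10 skills
+    pairs = Counter(zip(tokens, tokens[1:]))
+    (a, b), _ = pairs.most_common(1)[0]
+    merges.append(((a, b), new_token))
+    merged, i = [], 0
+    while i < len(tokens):
+        if i + 1 < len(tokens) and (tokens[i], tokens[i + 1]) == (a, b):
+            merged.append(new_token); i += 2
+        else:
+            merged.append(tokens[i]); i += 1
+    tokens = merged
+
+print("first merge rules (skills):", merges[:3])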
+
+
+
+
+ + ☆ Postprocessing of Ensemble Weather Forecasts Using Permutation-invariant + Neural Networks + + +
+ Statistical postprocessing is used to translate ensembles of raw numerical +weather forecasts into reliable probabilistic forecast distributions. In this +study, we examine the use of permutation-invariant neural networks for this +task. In contrast to previous approaches, which often operate on ensemble +summary statistics and dismiss details of the ensemble distribution, we propose +networks which treat forecast ensembles as a set of unordered member forecasts +and learn link functions that are by design invariant to permutations of the +member ordering. We evaluate the quality of the obtained forecast distributions +in terms of calibration and sharpness, and compare the models against classical +and neural network-based benchmark methods. In case studies addressing the +postprocessing of surface temperature and wind gust forecasts, we demonstrate +state-of-the-art prediction quality. To deepen the understanding of the learned +inference process, we further propose a permutation-based importance analysis +for ensemble-valued predictors, which highlights specific aspects of the +ensemble forecast that are considered important by the trained postprocessing +models. Our results suggest that most of the relevant information is contained +in few ensemble-internal degrees of freedom, which may impact the design of +future ensemble forecasting and postprocessing systems. + +
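+ The key structural idea, a link function that embeds each member with shared
+weights and pools by averaging so the output cannot depend on member order, can
+be shown in a few lines; the random weights and the Gaussian output head are
+placeholders, not the trained networks from the study.
+
+import numpy as np
+
+rng = np.random.default_rng(0)
+W1 = rng.standard_normal((1, 16))     # shared per-member embedding (random placeholder)
+W2 = rng.standard_normal((16, 2))     # pooled embedding -> (mu, log_sigma)
+
+def postprocess(ensemble):
+    """ensemble: array of shape (m,) holding m raw member forecasts."""
+    h = np.tanh(ensemble[:, None] @ W1)   # embed every member identically
+    pooled = h.mean(axis=0)               # permutation-invariant pooling
+    mu, log_sigma = pooled @ W2
+    return mu, np.exp(log_sigma)
+
+members = rng.normal(15.0, 2.0, size=20)       # 20 raw temperature forecasts
+print(postprocess(members))
+print(postprocess(rng.permutation(members)))   # identical: member order is irrelevant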
+
+ comment: Submitted to Artificial Intelligence for the Earth Systems +
+
+
+
+
+ + ☆ Variations and Relaxations of Normalizing Flows + + +
+ Normalizing Flows (NFs) describe a class of models that express a complex +target distribution as the composition of a series of bijective transformations +over a simpler base distribution. By limiting the space of candidate +transformations to diffeomorphisms, NFs enjoy efficient, exact sampling and +density evaluation, enabling NFs to flexibly behave as both discriminative and +generative models. Their restriction to diffeomorphisms, however, enforces that +input, output and all intermediary spaces share the same dimension, limiting +their ability to effectively represent target distributions with complex +topologies. Additionally, in cases where the prior and target distributions are +not homeomorphic, Normalizing Flows can leak mass outside of the support of the +target. This survey covers a selection of recent works that combine aspects of +other generative model classes, such as VAEs and score-based diffusion, and in +doing so loosen the strict bijectivity constraints of NFs to achieve a balance +of expressivity, training speed, sample efficiency and likelihood tractability. + +
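+ The strict construction that the relaxations above start from can be
+summarized in one affine coupling layer: the map is bijective, and its
+triangular Jacobian makes the exact density of a pushed-forward sample cheap to
+evaluate. The toy conditioner and dimensions below are arbitrary.
+
+import numpy as np
+
+rng = np.random.default_rng(0)
+
+def coupling_forward(z, W):
+    """One affine coupling layer acting on the second half of z."""
+    d = z.shape[-1] // 2
+    z1, z2 = z[..., :d], z[..., d:]
+    log_s, t = np.split(np.tanh(z1 @ W), 2, axis=-1)   # toy conditioner network
+    x2 = z2 * np.exp(log_s) + t
+    return np.concatenate([z1, x2], axis=-1), log_s.sum(axis=-1)
+
+W = rng.standard_normal((2, 4)) * 0.5
+z = rng.standard_normal((5, 4))                        # base samples, z ~ N(0, I)
+x, log_det = coupling_forward(z, W)
+
+# Exact log-density of x under the flow (change of variables):
+log_p_z = -0.5 * (z ** 2).sum(axis=-1) - 2.0 * np.log(2.0 * np.pi)
+log_p_x = log_p_z - log_det
+print(log_p_x)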
+
+
+
+
+ + ☆ Soft Quantization using Entropic Regularization + + +
+ The quantization problem aims to find the best possible approximation of +probability measures on ${\mathbb{R}}^d$ using finite, discrete measures. The +Wasserstein distance is a typical choice to measure the quality of the +approximation. This contribution investigates the properties and robustness of +the entropy-regularized quantization problem, which relaxes the standard +quantization problem. The proposed approximation technique naturally adopts the +softmin function, which is well known for its robustness in terms of +theoretical and practicability standpoints. Moreover, we use the +entropy-regularized Wasserstein distance to evaluate the quality of the soft +quantization problem's approximation, and we implement a stochastic gradient +approach to achieve the optimal solutions. The control parameter in our +proposed method allows for the adjustment of the optimization problem's +difficulty level, providing significant advantages when dealing with +exceptionally challenging problems of interest. As well, this contribution +empirically illustrates the performance of the method in various expositions. + +
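+ The softmin relaxation can be written down directly: instead of assigning
+each sample to its nearest support point, assignment weights follow a softmin
+over squared distances, and the support points are updated by stochastic
+gradient steps; hard quantization is recovered as the regularization parameter
+goes to zero. The target measure, number of support points, and step sizes
+below are illustrative choices, not the paper's setup.
+
+import numpy as np
+
+rng = np.random.default_rng(0)
+samples = rng.normal(0.0, 1.0, size=(10_000, 2))    # measure to approximate (synthetic)
+centers = rng.normal(0.0, 1.0, size=(8, 2))         # finite, discrete support
+eps, lr = 0.1, 0.05                                  # entropic regularization, step size
+
+for _ in range(300):
+    batch = samples[rng.choice(len(samples), 256, replace=False)]
+    d2 = ((batch[:, None, :] - centers[None, :, :]) ** 2).sum(-1)
+    # Softmin weights (shifted by the row minimum for numerical stability);
+    # as eps -> 0 these collapse to the hard nearest-center assignment.
+    w = np.exp(-(d2 - d2.min(axis=1, keepdims=True)) / eps)
+    w /= w.sum(axis=1, keepdims=True)
+    # Stochastic gradient step: pull each center toward its softly assigned samples.
+    grad = (w[:, :, None] * (centers[None, :, :] - batch[:, None, :])).mean(axis=0)
+    centers -= lr * grad
+
+print(centers.round(2))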
+
+
+
+
+ + ☆ Robust Representation Learning for Privacy-Preserving Machine Learning: + A Multi-Objective Autoencoder Approach + + +
+ Several domains increasingly rely on machine learning in their applications.
+The resulting heavy dependence on data has led to the emergence of various laws
+and regulations around data ethics and privacy and growing awareness of the
+need for privacy-preserving machine learning (ppML). Current ppML techniques
+utilize methods that are either purely based on cryptography, such as
+homomorphic encryption, or that introduce noise into the input, such as
+differential privacy. The main criticism given to those techniques is the fact
+that they either are too slow or they trade off a model's performance for
+improved confidentiality. To address this performance reduction, we aim to
+leverage robust representation learning as a way of encoding our data while
+optimizing the privacy-utility trade-off. Our method centers on training
+autoencoders in a multi-objective manner and then concatenating the latent and
+learned features from the encoding part as the encoded form of our data. Such a
+deep learning-powered encoding can then safely be sent to a third party for
+intensive training and hyperparameter tuning. With our proposed framework, we
+can share our data and use third party tools without being under the threat of
+revealing its original form. We empirically validate our results on unimodal
+and multimodal settings, the latter following a vertical splitting system and
+show improved performance over state-of-the-art.
+
+
+
+
+
+ + ☆ Parallel and Limited Data Voice Conversion Using Stochastic Variational + Deep Kernel Learning + + +
+ Typically, voice conversion is regarded as an engineering problem with +limited training data. The reliance on massive amounts of data hinders the +practical applicability of deep learning approaches, which have been +extensively researched in recent years. On the other hand, statistical methods +are effective with limited data but have difficulties in modelling complex +mapping functions. This paper proposes a voice conversion method that works +with limited data and is based on stochastic variational deep kernel learning +(SVDKL). At the same time, SVDKL enables the use of deep neural networks' +expressive capability as well as the high flexibility of the Gaussian process +as a Bayesian and non-parametric method. When the conventional kernel is +combined with the deep neural network, it is possible to estimate non-smooth +and more complex functions. Furthermore, the model's sparse variational +Gaussian process solves the scalability problem and, unlike the exact Gaussian +process, allows for the learning of a global mapping function for the entire +acoustic space. One of the most important aspects of the proposed scheme is +that the model parameters are trained using marginal likelihood optimization, +which considers both data fitting and model complexity. Considering the +complexity of the model reduces the amount of training data by increasing the +resistance to overfitting. To evaluate the proposed scheme, we examined the +model's performance with approximately 80 seconds of training data. The results +indicated that our method obtained a higher mean opinion score, smaller +spectral distortion, and better preference tests than the compared methods. + +
+
+
+
+
+ + ☆ Emergent learning in physical systems as feedback-based aging in a + glassy landscape + + +
+ By training linear physical networks to learn linear transformations, we +discern how their physical properties evolve due to weight update rules. Our +findings highlight a striking similarity between the learning behaviors of such +networks and the processes of aging and memory formation in disordered and +glassy systems. We show that the learning dynamics resembles an aging process, +where the system relaxes in response to repeated application of the feedback +boundary forces in presence of an input force, thus encoding a memory of the +input-output relationship. With this relaxation comes an increase in the +correlation length, which is indicated by the two-point correlation function +for the components of the network. We also observe that the square root of the +mean-squared error as a function of epoch takes on a non-exponential form, +which is a typical feature of glassy systems. This physical interpretation +suggests that by encoding more detailed information into input and feedback +boundary forces, the process of emergent learning can be rather ubiquitous and, +thus, serve as a very early physical mechanism, from an evolutionary +standpoint, for learning in biological systems. + +
+
+ comment: 11 pages, 7 figures +
+
+
+
+
+ + ☆ Generalization Bounds: Perspectives from Information Theory and + PAC-Bayes + + +
+ A fundamental question in theoretical machine learning is generalization. +Over the past decades, the PAC-Bayesian approach has been established as a +flexible framework to address the generalization capabilities of machine +learning algorithms, and design new ones. Recently, it has garnered increased +interest due to its potential applicability for a variety of learning +algorithms, including deep neural networks. In parallel, an +information-theoretic view of generalization has developed, wherein the +relation between generalization and various information measures has been +established. This framework is intimately connected to the PAC-Bayesian +approach, and a number of results have been independently discovered in both +strands. In this monograph, we highlight this strong connection and present a +unified treatment of generalization. We present techniques and results that the +two perspectives have in common, and discuss the approaches and interpretations +that differ. In particular, we demonstrate how many proofs in the area share a +modular structure, through which the underlying ideas can be intuited. We pay +special attention to the conditional mutual information (CMI) framework; +analytical studies of the information complexity of learning algorithms; and +the application of the proposed methods to deep learning. This monograph is +intended to provide a comprehensive introduction to information-theoretic +generalization bounds and their connection to PAC-Bayes, serving as a +foundation from which the most recent developments are accessible. It is aimed +broadly towards researchers with an interest in generalization and theoretical +machine learning. + +
+
+ comment: 222 pages +
+
+
+
+
+ + ☆ Seeing-Eye Quadruped Navigation with Force Responsive Locomotion Control + + +
+ Seeing-eye robots are very useful tools for guiding visually impaired people, +potentially producing a huge societal impact given the low availability and +high cost of real guide dogs. Although a few seeing-eye robot systems have +already been demonstrated, none considered external tugs from humans, which +frequently occur in a real guide dog setting. In this paper, we simultaneously +train a locomotion controller that is robust to external tugging forces via +Reinforcement Learning (RL), and an external force estimator via supervised +learning. The controller ensures stable walking, and the force estimator +enables the robot to respond to the external forces from the human. These +forces are used to guide the robot to the global goal, which is unknown to the +robot, while the robot guides the human around nearby obstacles via a local +planner. Experimental results in simulation and on hardware show that our +controller is robust to external forces, and our seeing-eye system can +accurately detect force direction. We demonstrate our full seeing-eye robot +system on a real quadruped robot with a blindfolded human. The video can be +seen at our project page: https://bu-air-lab.github.io/guide_dog/ + +
+
+ comment: Accepted to CoRL 2023 +
+
+
+
+
+ + ☆ Active Learning for Classifying 2D Grid-Based Level Completability + + +
+ Determining the completability of levels generated by procedural generators +such as machine learning models can be challenging, as it can involve the use +of solver agents that often require a significant amount of time to analyze and +solve levels. Active learning is not yet widely adopted in game evaluations, +although it has been used successfully in natural language processing, image +and speech recognition, and computer vision, where the availability of labeled +data is limited or expensive. In this paper, we propose the use of active +learning for learning level completability classification. Through an active +learning approach, we train deep-learning models to classify the completability +of generated levels for Super Mario Bros., Kid Icarus, and a Zelda-like game. +We compare active learning for querying levels to label with completability +against random queries. Our results show using an active learning approach to +label levels results in better classifier performance with the same amount of +labeled data. + +
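+ The querying loop itself is standard pool-based uncertainty sampling, as in
+the sketch below; the logistic-regression classifier, synthetic level features,
+and simulated solver labels are stand-ins for the deep completability
+classifier and solver agent used in the paper.
+
+import numpy as np
+from sklearn.linear_model import LogisticRegression
+
+rng = np.random.default_rng(0)
+X = rng.standard_normal((2000, 16))               # stand-in features of generated levels
+y = (X[:, 0] + 0.5 * X[:, 1] > 0).astype(int)     # "completable" labels a solver would give
+
+# Small seed set containing both classes; the rest forms the unlabeled pool.
+labeled = list(np.where(y == 0)[0][:10]) + list(np.where(y == 1)[0][:10])
+pool = [i for i in range(len(X)) if i not in labeled]
+
+for _ in range(10):
+    clf = LogisticRegression(max_iter=1000).fit(X[labeled], y[labeled])
+    proba = clf.predict_proba(X[pool])[:, 1]
+    # Query the levels the classifier is least sure about (closest to 0.5),
+    # i.e. the ones most worth sending to the expensive solver agent.
+    query = [pool[i] for i in np.argsort(np.abs(proba - 0.5))[:20]]
+    labeled += query
+    pool = [i for i in pool if i not in query]
+
+print("accuracy on the remaining pool:", clf.score(X[pool], y[pool]))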
+
+ comment: 4 pages, 3 figures +
+
+
+
+
+ + ☆ Learning from Power Signals: An Automated Approach to Electrical + Disturbance Identification Within a Power Transmission System + + +
+ As power quality becomes a higher priority in the electric utility industry, +the amount of disturbance event data continues to grow. Utilities do not have +the required personnel to analyze each event by hand. This work presents an +automated approach for analyzing power quality events recorded by digital fault +recorders and power quality monitors operating within a power transmission +system. The automated approach leverages rule-based analytics to examine the +time and frequency domain characteristics of the voltage and current signals. +Customizable thresholds are set to categorize each disturbance event. The +events analyzed within this work include various faults, motor starting, and +incipient instrument transformer failure. Analytics for fourteen different +event types have been developed. The analytics were tested on 160 signal files +and yielded an accuracy of ninety-nine percent. Continuous, nominal signal data +analysis is performed using an approach coined as the cyclic histogram. The +cyclic histogram process will be integrated into the digital fault recorders +themselves to facilitate the detection of subtle signal variations that are too +small to trigger a disturbance event and that can occur over hours or days. In +addition to reducing memory requirements by a factor of 320, it is anticipated +that cyclic histogram processing will aid in identifying incipient events and +identifiers. This project is expected to save engineers time by automating the +classification of disturbance events and increase the reliability of the +transmission system by providing near real time detection and identification of +disturbances as well as prevention of problems before they occur. + +
+
+ comment: 18 pages +
+
+
+
+
+ + ☆ Value-Compressed Sparse Column (VCSC): Sparse Matrix Storage for + Redundant Data + + +
+ Compressed Sparse Column (CSC) and Coordinate (COO) are popular compression +formats for sparse matrices. However, both CSC and COO are general purpose and +cannot take advantage of any of the properties of the data other than sparsity, +such as data redundancy. Highly redundant sparse data is common in many machine +learning applications, such as genomics, and is often too large for in-core +computation using conventional sparse storage formats. In this paper, we +present two extensions to CSC: (1) Value-Compressed Sparse Column (VCSC) and +(2) Index- and Value-Compressed Sparse Column (IVCSC). VCSC takes advantage of +high redundancy within a column to further compress data up to 3-fold over COO +and 2.25-fold over CSC, without significant negative impact to performance +characteristics. IVCSC extends VCSC by compressing index arrays through delta +encoding and byte-packing, achieving a 10-fold decrease in memory usage over +COO and 7.5-fold decrease over CSC. Our benchmarks on simulated and real data +show that VCSC and IVCSC can be read in compressed form with little added +computational cost. These two novel compression formats offer a broadly useful +solution to encoding and reading redundant sparse data. + +
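As a rough illustration of the value-compression idea (a hypothetical layout for illustration, not the paper's actual VCSC format or code), the sketch below stores each distinct value of a sparse column once, together with the row indices at which it occurs:

```python
import numpy as np

def compress_column(values, rows):
    """Group a sparse column's entries by value (hypothetical VCSC-like layout).

    Instead of storing one value per nonzero, store each distinct value once
    together with the row indices at which it occurs.
    """
    compressed = {}
    for v, r in zip(values, rows):
        compressed.setdefault(float(v), []).append(int(r))
    return compressed

# A highly redundant column: seven nonzeros but only two distinct values.
values = np.array([3.0, 3.0, 3.0, 7.0, 3.0, 7.0, 3.0])
rows   = np.array([0, 4, 9, 12, 20, 33, 41])

col = compress_column(values, rows)
plain_numbers  = len(values) + len(rows)                  # CSC-style: value + row index per nonzero
packed_numbers = sum(1 + len(r) for r in col.values())    # one value per group + its row indices
print(col)
print(f"stored numbers: CSC-like={plain_numbers}, value-compressed={packed_numbers}")
```

The savings grow with redundancy: the fewer distinct values a column has, the fewer numbers need to be stored per nonzero.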
+
+
+
+
+ + ☆ Mobile V-MoEs: Scaling Down Vision Transformers via Sparse + Mixture-of-Experts + + +
+ Sparse Mixture-of-Experts models (MoEs) have recently gained popularity due to their ability to decouple model size from inference efficiency by only activating a small subset of the model parameters for any given input token. As such, sparse MoEs have enabled unprecedented scalability, resulting in tremendous successes across domains such as natural language processing and computer vision. In this work, we instead explore the use of sparse MoEs to scale down Vision Transformers (ViTs) to make them more attractive for resource-constrained vision applications. To this end, we propose a simplified and mobile-friendly MoE design where entire images rather than individual patches are routed to the experts. We also propose a stable MoE training procedure that uses super-class information to guide the router. We empirically show that our sparse Mobile Vision MoEs (V-MoEs) can achieve a better trade-off between performance and efficiency than the corresponding dense ViTs. For example, for the ViT-Tiny model, our Mobile V-MoE outperforms its dense counterpart by 3.39% on ImageNet-1k. For an even smaller ViT variant with only 54M FLOPs inference cost, our MoE achieves an improvement of 4.66%.
+
+
+
+
+ + ☆ Zero-Shot Robustification of Zero-Shot Models With Foundation Models + + +
+ Zero-shot inference is a powerful paradigm that enables the use of large +pretrained models for downstream classification tasks without further training. +However, these models are vulnerable to inherited biases that can impact their +performance. The traditional solution is fine-tuning, but this undermines the +key advantage of pretrained models, which is their ability to be used +out-of-the-box. We propose RoboShot, a method that improves the robustness of +pretrained model embeddings in a fully zero-shot fashion. First, we use +zero-shot language models (LMs) to obtain useful insights from task +descriptions. These insights are embedded and used to remove harmful and boost +useful components in embeddings -- without any supervision. Theoretically, we +provide a simple and tractable model for biases in zero-shot embeddings and +give a result characterizing under what conditions our approach can boost +performance. Empirically, we evaluate RoboShot on nine image and NLP +classification tasks and show an average improvement of 15.98% over several +zero-shot baselines. Additionally, we demonstrate that RoboShot is compatible +with a variety of pretrained and language models. + +
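A minimal sketch of the "remove harmful, boost useful components" idea, assuming the language-model insights are already available as embedded direction vectors (hypothetical helper names, not the authors' implementation):

```python
import numpy as np

def reject(v, u):
    """Remove the component of v along direction u (vector rejection)."""
    u = u / np.linalg.norm(u)
    return v - (v @ u) * u

def robustify(x, harmful_dirs, helpful_dirs, boost=0.5):
    """Hypothetical zero-shot adjustment: project out harmful concept directions,
    amplify components along helpful ones, and re-normalize."""
    for h in harmful_dirs:
        x = reject(x, h)
    for g in helpful_dirs:
        g = g / np.linalg.norm(g)
        x = x + boost * (x @ g) * g
    return x / np.linalg.norm(x)

rng = np.random.default_rng(0)
x = rng.normal(size=512)            # an image embedding
spurious = rng.normal(size=512)     # embedded text of a spurious attribute (assumed)
useful = rng.normal(size=512)       # embedded text of the task-relevant concept (assumed)

x_clean = reject(x, spurious)
cos = lambda a, b: float(a @ b / (np.linalg.norm(a) * np.linalg.norm(b)))
print("cosine with spurious before/after rejection:",
      round(cos(x, spurious), 4), round(cos(x_clean, spurious), 4))
print("adjusted embedding norm:", round(float(np.linalg.norm(robustify(x, [spurious], [useful]))), 4))
```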
+
+
+
+
+ + ☆ Online Submodular Maximization via Online Convex Optimization + + +
+ We study monotone submodular maximization under general matroid constraints +in the online setting. We prove that online optimization of a large class of +submodular functions, namely, weighted threshold potential functions, reduces +to online convex optimization (OCO). This is precisely because functions in +this class admit a concave relaxation; as a result, OCO policies, coupled with +an appropriate rounding scheme, can be used to achieve sublinear regret in the +combinatorial setting. We show that our reduction extends to many different +versions of the online learning problem, including the dynamic regret, bandit, +and optimistic-learning settings. + +
+
+ comment: Under review +
+
+
+
+
+ + ☆ Encoding Multi-Domain Scientific Papers by Ensembling Multiple CLS + Tokens + + +
+ Many useful tasks on scientific documents, such as topic classification and +citation prediction, involve corpora that span multiple scientific domains. +Typically, such tasks are accomplished by representing the text with a vector +embedding obtained from a Transformer's single CLS token. In this paper, we +argue that using multiple CLS tokens could make a Transformer better specialize +to multiple scientific domains. We present Multi2SPE: it encourages each of +multiple CLS tokens to learn diverse ways of aggregating token embeddings, then +sums them up together to create a single vector representation. We also propose +our new multi-domain benchmark, Multi-SciDocs, to test scientific paper vector +encoders under multi-domain settings. We show that Multi2SPE reduces error by +up to 25 percent in multi-domain citation prediction, while requiring only a +negligible amount of computation in addition to one BERT forward pass. + +
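A toy sketch of the multi-CLS idea (the number of CLS tokens and encoder dimensions are assumed for illustration; this is not the paper's code): prepend several learnable CLS tokens, run the encoder, and sum their final hidden states into a single document vector.

```python
import torch
import torch.nn as nn

class MultiCLSEncoder(nn.Module):
    """Toy multi-CLS encoder: k learnable CLS tokens are prepended to the token
    sequence, a Transformer encoder processes everything, and the k CLS outputs
    are summed into one embedding."""
    def __init__(self, vocab=30522, dim=128, n_cls=3):
        super().__init__()
        self.embed = nn.Embedding(vocab, dim)
        self.cls = nn.Parameter(torch.randn(n_cls, dim) * 0.02)
        layer = nn.TransformerEncoderLayer(d_model=dim, nhead=4, batch_first=True)
        self.encoder = nn.TransformerEncoder(layer, num_layers=2)
        self.n_cls = n_cls

    def forward(self, token_ids):
        b = token_ids.shape[0]
        x = self.embed(token_ids)                        # (batch, seq, dim)
        cls = self.cls.unsqueeze(0).expand(b, -1, -1)    # (batch, k, dim)
        h = self.encoder(torch.cat([cls, x], dim=1))     # (batch, k + seq, dim)
        return h[:, :self.n_cls].sum(dim=1)              # sum the k CLS states

model = MultiCLSEncoder()
emb = model(torch.randint(0, 30522, (2, 16)))
print(emb.shape)  # torch.Size([2, 128])
```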
+
+
+
+
+ + ☆ Graph Neural Networks Use Graphs When They Shouldn't + + +
+ Predictions over graphs play a crucial role in various domains, including +social networks, molecular biology, medicine, and more. Graph Neural Networks +(GNNs) have emerged as the dominant approach for learning on graph data. +Instances of graph labeling problems consist of the graph-structure (i.e., the +adjacency matrix), along with node-specific feature vectors. In some cases, +this graph-structure is non-informative for the predictive task. For instance, +molecular properties such as molar mass depend solely on the constituent atoms +(node features), and not on the molecular structure. While GNNs have the +ability to ignore the graph-structure in such cases, it is not clear that they +will. In this work, we show that GNNs actually tend to overfit the +graph-structure in the sense that they use it even when a better solution can +be obtained by ignoring it. We examine this phenomenon with respect to +different graph distributions and find that regular graphs are more robust to +this overfitting. We then provide a theoretical explanation for this +phenomenon, via analyzing the implicit bias of gradient-descent-based learning +of GNNs in this setting. Finally, based on our empirical and theoretical +findings, we propose a graph-editing method to mitigate the tendency of GNNs to +overfit graph-structures that should be ignored. We show that this method +indeed improves the accuracy of GNNs across multiple benchmarks. + +
+
+
+
+
+ + ☆ Generating the Ground Truth: Synthetic Data for Label Noise Research + + +
+ Most real-world classification tasks suffer from label noise to some extent. +Such noise in the data adversely affects the generalization error of learned +models and complicates the evaluation of noise-handling methods, as their +performance cannot be accurately measured without clean labels. In label noise +research, typically either noisy or incomplex simulated data are accepted as a +baseline, into which additional noise with known properties is injected. In +this paper, we propose SYNLABEL, a framework that aims to improve upon the +aforementioned methodologies. It allows for creating a noiseless dataset +informed by real data, by either pre-specifying or learning a function and +defining it as the ground truth function from which labels are generated. +Furthermore, by resampling a number of values for selected features in the +function domain, evaluating the function and aggregating the resulting labels, +each data point can be assigned a soft label or label distribution. Such +distributions allow for direct injection and quantification of label noise. The +generated datasets serve as a clean baseline of adjustable complexity into +which different types of noise may be introduced. We illustrate how the +framework can be applied, how it enables quantification of label noise and how +it improves over existing methodologies. + +
+
+
+
+
+ + ☆ Actor critic learning algorithms for mean-field control with moment + neural networks + + +
+ We develop a new policy gradient and actor-critic algorithm for solving mean-field control problems within a continuous-time reinforcement learning setting. Our approach leverages a gradient-based representation of the value function, employing parametrized randomized policies. The learning for both the actor (policy) and critic (value function) is facilitated by a class of moment neural network functions on the Wasserstein space of probability measures, and the key feature is to directly sample trajectories of distributions. A central challenge addressed in this study pertains to the computational treatment of an operator specific to the mean-field framework. To illustrate the effectiveness of our methods, we provide a comprehensive set of numerical results. These encompass diverse examples, including multi-dimensional settings and nonlinear quadratic mean-field control problems with controlled volatility.
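A minimal sketch of a moment-based critic, assuming the value function is fed the first few empirical moments of the sampled particles (the paper's exact parametrization on Wasserstein space may differ):

```python
import torch
import torch.nn as nn

class MomentNet(nn.Module):
    """Sketch of a moment-based value function: summarize a batch of sampled
    particles by its first L raw moments, then map the moments through an MLP.
    (Assumed form for illustration; not the paper's architecture.)"""
    def __init__(self, n_moments=4, hidden=64):
        super().__init__()
        self.n_moments = n_moments
        self.net = nn.Sequential(
            nn.Linear(n_moments, hidden), nn.ReLU(), nn.Linear(hidden, 1)
        )

    def forward(self, particles):                   # particles: (batch, n_particles)
        moments = torch.stack(
            [particles.pow(k + 1).mean(dim=1) for k in range(self.n_moments)], dim=1
        )
        return self.net(moments)                    # one scalar value per empirical measure

particles = torch.randn(8, 500)                     # 8 empirical measures, 500 samples each
print(MomentNet()(particles).shape)                 # torch.Size([8, 1])
```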
+
+ comment: 16 pages, 11 figures +
+
+
+
+
+ + ☆ Federated Learning for Early Dropout Prediction on Healthy Ageing + Applications + + +
+ The provision of social care applications is crucial for elderly people to improve their quality of life and enables operators to provide early interventions. Accurate predictions of user dropouts in healthy ageing applications are essential since they are directly related to individual health statuses. Machine Learning (ML) algorithms have enabled highly accurate predictions, outperforming traditional statistical methods that struggle to cope with individual patterns. However, ML requires a substantial amount of data for training, which is challenging due to the presence of personally identifiable information (PII) and the fragmentation posed by regulations. In this paper, we present a federated machine learning (FML) approach that minimizes privacy concerns and enables distributed training, without transferring individual data. We employ collaborative training by considering individuals and organizations under FML, which models both cross-device and cross-silo learning scenarios. Our approach is evaluated on a real-world dataset with non-independent and identically distributed (non-iid) data among clients, class imbalance and label ambiguity. Our results show that data selection and class imbalance handling techniques significantly improve the predictive accuracy of models trained under FML, demonstrating predictive performance comparable or superior to that of traditional ML models.
+
+
+
+
+ + ☆ Navigating Out-of-Distribution Electricity Load Forecasting during + COVID-19: A Continual Learning Approach Leveraging Human Mobility + + +
+ In traditional deep learning algorithms, one of the key assumptions is that +the data distribution remains constant during both training and deployment. +However, this assumption becomes problematic when faced with +Out-of-Distribution periods, such as the COVID-19 lockdowns, where the data +distribution significantly deviates from what the model has seen during +training. This paper employs a two-fold strategy: utilizing continual learning +techniques to update models with new data and harnessing human mobility data +collected from privacy-preserving pedestrian counters located outside +buildings. In contrast to online learning, which suffers from 'catastrophic +forgetting' as newly acquired knowledge often erases prior information, +continual learning offers a holistic approach by preserving past insights while +integrating new data. This research applies FSNet, a powerful continual +learning algorithm, to real-world data from 13 building complexes in Melbourne, +Australia, a city which had the second longest total lockdown duration globally +during the pandemic. Results underscore the crucial role of continual learning +in accurate energy forecasting, particularly during Out-of-Distribution +periods. Secondary data such as mobility and temperature provided ancillary +support to the primary forecasting model. More importantly, while traditional +methods struggled to adapt during lockdowns, models featuring at least online +learning demonstrated resilience, with lockdown periods posing fewer challenges +once armed with adaptive learning techniques. This study contributes valuable +methodologies and insights to the ongoing effort to improve energy load +forecasting during future Out-of-Distribution periods. + +
+
+ comment: 10 pages, 2 figures, 5 tables, BuildSys '23 +
+
+
+
+
+ + ☆ Viewing the process of generating counterfactuals as a source of + knowledge -- Application to the Naive Bayes classifier + + +
+ There are now many comprehension algorithms for understanding the decisions +of a machine learning algorithm. Among these are those based on the generation +of counterfactual examples. This article proposes to view this generation +process as a source of creating a certain amount of knowledge that can be +stored to be used, later, in different ways. This process is illustrated in the +additive model and, more specifically, in the case of the naive Bayes +classifier, whose interesting properties for this purpose are shown. + +
+
+ comment: 12 pages +
+
+
+
+
+ + ☆ Learning Zero-Sum Linear Quadratic Games with Improved Sample Complexity + + +
+ Zero-sum Linear Quadratic (LQ) games are fundamental in optimal control and +can be used (i) as a dynamic game formulation for risk-sensitive or robust +control, or (ii) as a benchmark setting for multi-agent reinforcement learning +with two competing agents in continuous state-control spaces. In contrast to +the well-studied single-agent linear quadratic regulator problem, zero-sum LQ +games entail solving a challenging nonconvex-nonconcave min-max problem with an +objective function that lacks coercivity. Recently, Zhang et al. discovered an +implicit regularization property of natural policy gradient methods which is +crucial for safety-critical control systems since it preserves the robustness +of the controller during learning. Moreover, in the model-free setting where +the knowledge of model parameters is not available, Zhang et al. proposed the +first polynomial sample complexity algorithm to reach an +$\epsilon$-neighborhood of the Nash equilibrium while maintaining the desirable +implicit regularization property. In this work, we propose a simpler nested +Zeroth-Order (ZO) algorithm improving sample complexity by several orders of +magnitude. Our main result guarantees a +$\widetilde{\mathcal{O}}(\epsilon^{-3})$ sample complexity under the same +assumptions using a single-point ZO estimator. Furthermore, when the estimator +is replaced by a two-point estimator, our method enjoys a better +$\widetilde{\mathcal{O}}(\epsilon^{-2})$ sample complexity. Our key +improvements rely on a more sample-efficient nested algorithm design and finer +control of the ZO natural gradient estimation error. + +
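For concreteness, generic single-point and two-point zeroth-order gradient estimators of the kind referred to above look roughly as follows (a simplified sphere-smoothing sketch on a toy objective, not the paper's nested algorithm):

```python
import numpy as np

rng = np.random.default_rng(0)

def zo_grad_one_point(f, x, r=0.05):
    """Single-point ZO estimate: (d/r) * f(x + r*u) * u, with u uniform on the sphere."""
    d = x.size
    u = rng.normal(size=d); u /= np.linalg.norm(u)
    return (d / r) * f(x + r * u) * u

def zo_grad_two_point(f, x, r=0.05):
    """Two-point ZO estimate: (d/(2r)) * (f(x + r*u) - f(x - r*u)) * u; much lower variance."""
    d = x.size
    u = rng.normal(size=d); u /= np.linalg.norm(u)
    return (d / (2 * r)) * (f(x + r * u) - f(x - r * u)) * u

f = lambda x: 0.5 * x @ x                  # toy objective with true gradient x
x = np.array([1.0, -2.0, 0.5])
g1 = np.mean([zo_grad_one_point(f, x) for _ in range(20000)], axis=0)
g2 = np.mean([zo_grad_two_point(f, x) for _ in range(20000)], axis=0)
# The two-point average is close to the true gradient x; the one-point one is much noisier,
# mirroring the sample-complexity gap between the two estimators.
print(g1.round(2), g2.round(2), x)
```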
+
+
+
+
+ + ☆ Optimal Rate of Kernel Regression in Large Dimensions + + +
+ We perform a study on kernel regression for large-dimensional data (where the sample size $n$ depends polynomially on the dimension $d$ of the samples, i.e., $n\asymp d^{\gamma}$ for some $\gamma >0$). We first build a general tool to characterize the upper bound and the minimax lower bound of kernel regression for large-dimensional data through the Mendelson complexity $\varepsilon_{n}^{2}$ and the metric entropy $\bar{\varepsilon}_{n}^{2}$ respectively. When the target function falls into the RKHS associated with a (general) inner product model defined on $\mathbb{S}^{d}$, we utilize the new tool to show that the minimax rate of the excess risk of kernel regression is $n^{-1/2}$ when $n\asymp d^{\gamma}$ for $\gamma =2, 4, 6, 8, \cdots$. We then further determine the optimal rate of the excess risk of kernel regression for all $\gamma>0$ and find that the curve of optimal rate varying along $\gamma$ exhibits several new phenomena including the {\it multiple descent behavior} and the {\it periodic plateau behavior}. As an application, for the neural tangent kernel (NTK), we provide a similar explicit description of the curve of optimal rate. As a direct corollary, these claims hold for wide neural networks as well.
+
+
+
+
+ + ☆ Adaptive Distributed Kernel Ridge Regression: A Feasible Distributed + Learning Scheme for Data Silos + + +
+ Data silos, mainly caused by privacy and interoperability, significantly +constrain collaborations among different organizations with similar data for +the same purpose. Distributed learning based on divide-and-conquer provides a +promising way to settle the data silos, but it suffers from several challenges, +including autonomy, privacy guarantees, and the necessity of collaborations. +This paper focuses on developing an adaptive distributed kernel ridge +regression (AdaDKRR) by taking autonomy in parameter selection, privacy in +communicating non-sensitive information, and the necessity of collaborations in +performance improvement into account. We provide both solid theoretical +verification and comprehensive experiments for AdaDKRR to demonstrate its +feasibility and effectiveness. Theoretically, we prove that under some mild +conditions, AdaDKRR performs similarly to running the optimal learning +algorithms on the whole data, verifying the necessity of collaborations and +showing that no other distributed learning scheme can essentially beat AdaDKRR +under the same conditions. Numerically, we test AdaDKRR on both toy simulations +and two real-world applications to show that AdaDKRR is superior to other +existing distributed learning schemes. All these results show that AdaDKRR is a +feasible scheme to defend against data silos, which are highly desired in +numerous application regions such as intelligent decision-making, pricing +forecasting, and performance prediction for products. + +
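The underlying divide-and-conquer principle can be sketched as follows: each silo fits a local kernel ridge regressor and only predictions are aggregated (a simplified sketch with fixed hyperparameters; AdaDKRR's adaptive parameter selection and communication protocol are not shown):

```python
import numpy as np

def krr_fit(X, y, lam=1e-2, gamma=10.0):
    """Fit Gaussian-kernel ridge regression on one silo and return a predictor."""
    K = np.exp(-gamma * ((X[:, None, :] - X[None, :, :]) ** 2).sum(-1))
    alpha = np.linalg.solve(K + lam * len(X) * np.eye(len(X)), y)
    return lambda Z: np.exp(-gamma * ((Z[:, None, :] - X[None, :, :]) ** 2).sum(-1)) @ alpha

rng = np.random.default_rng(1)
f_true = lambda X: np.sin(4 * X[:, 0])

silos = [rng.uniform(size=(100, 1)) for _ in range(5)]           # 5 local data silos
preds = [krr_fit(X, f_true(X) + 0.1 * rng.normal(size=100)) for X in silos]

X_test = rng.uniform(size=(200, 1))
avg_pred = np.mean([p(X_test) for p in preds], axis=0)           # aggregate only predictions
print("test MSE of the averaged predictor:", float(np.mean((avg_pred - f_true(X_test)) ** 2)))
```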
+
+ comment: 46 pages, 13 figures
+
+
+
+
+ + ☆ Offline Recommender System Evaluation under Unobserved Confounding RecSys '23 + + +
+ Off-Policy Estimation (OPE) methods allow us to learn and evaluate +decision-making policies from logged data. This makes them an attractive choice +for the offline evaluation of recommender systems, and several recent works +have reported successful adoption of OPE methods to this end. An important +assumption that makes this work is the absence of unobserved confounders: +random variables that influence both actions and rewards at data collection +time. Because the data collection policy is typically under the practitioner's +control, the unconfoundedness assumption is often left implicit, and its +violations are rarely dealt with in the existing literature. + This work aims to highlight the problems that arise when performing +off-policy estimation in the presence of unobserved confounders, specifically +focusing on a recommendation use-case. We focus on policy-based estimators, +where the logging propensities are learned from logged data. We characterise +the statistical bias that arises due to confounding, and show how existing +diagnostics are unable to uncover such cases. Because the bias depends directly +on the true and unobserved logging propensities, it is non-identifiable. As the +unconfoundedness assumption is famously untestable, this becomes especially +problematic. This paper emphasises this common, yet often overlooked issue. +Through synthetic data, we empirically show how na\"ive propensity estimation +under confounding can lead to severely biased metric estimates that are allowed +to fly under the radar. We aim to cultivate an awareness among researchers and +practitioners of this important problem, and touch upon potential research +directions towards mitigating its effects. + +
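A toy illustration of the failure mode being described (synthetic numbers chosen for the example, not the paper's experiments): when the logging policy depends on an unobserved confounder, inverse-propensity scoring with propensities learned from the logs alone can be badly biased.

```python
import numpy as np

rng = np.random.default_rng(0)
n = 200_000
u = rng.binomial(1, 0.5, n)                      # unobserved confounder
p_a1 = np.where(u == 1, 0.9, 0.1)                # true propensity of action 1 depends on u
a = rng.binomial(1, p_a1)
r = rng.binomial(1, np.where(a == 1, 0.2 + 0.6 * u, 0.5))

# Target policy: always play action 1.  True value averages E[r | a=1, u] over u.
true_value = 0.5 * 0.2 + 0.5 * 0.8               # = 0.5

# Naive propensity estimate ignores u (marginal frequency of action 1 in the logs).
p_hat = np.full(n, a.mean())
ips_naive = np.mean((a == 1) / p_hat * r)
# Oracle IPS uses the true, confounder-dependent propensities.
ips_oracle = np.mean((a == 1) / p_a1 * r)
print(f"true={true_value:.3f}  oracle IPS={ips_oracle:.3f}  naive IPS={ips_naive:.3f}")
# The naive estimate lands near 0.74 instead of 0.5: the bias "flies under the radar".
```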
+
+ comment: Accepted at the CONSEQUENCES'23 workshop at RecSys '23 +
+
+
+
+
+ + ☆ Concomitant Group Testing + + +
+ In this paper, we introduce a variation of the group testing problem +capturing the idea that a positive test requires a combination of multiple +``types'' of item. Specifically, we assume that there are multiple disjoint +\emph{semi-defective sets}, and a test is positive if and only if it contains +at least one item from each of these sets. The goal is to reliably identify all +of the semi-defective sets using as few tests as possible, and we refer to this +problem as \textit{Concomitant Group Testing} (ConcGT). We derive a variety of +algorithms for this task, focusing primarily on the case that there are two +semi-defective sets. Our algorithms are distinguished by (i) whether they are +deterministic (zero-error) or randomized (small-error), and (ii) whether they +are non-adaptive, fully adaptive, or have limited adaptivity (e.g., 2 or 3 +stages). Both our deterministic adaptive algorithm and our randomized +algorithms (non-adaptive or limited adaptivity) are order-optimal in broad +scaling regimes of interest, and improve significantly over baseline results +that are based on solving a more general problem as an intermediate step (e.g., +hypergraph learning). + +
+
+ comment: 15 pages, 3 figures, 1 table +
+
+
+
+
+ + ☆ Counterfactual Explanations via Locally-guided Sequential Algorithmic + Recourse + + +
+ Counterfactuals operationalised through algorithmic recourse have become a +powerful tool to make artificial intelligence systems explainable. +Conceptually, given an individual classified as y -- the factual -- we seek +actions such that their prediction becomes the desired class y' -- the +counterfactual. This process offers algorithmic recourse that is (1) easy to +customise and interpret, and (2) directly aligned with the goals of each +individual. However, the properties of a "good" counterfactual are still +largely debated; it remains an open challenge to effectively locate a +counterfactual along with its corresponding recourse. Some strategies use +gradient-driven methods, but these offer no guarantees on the feasibility of +the recourse and are open to adversarial attacks on carefully created +manifolds. This can lead to unfairness and lack of robustness. Other methods +are data-driven, which mostly addresses the feasibility problem at the expense +of privacy, security and secrecy as they require access to the entire training +data set. Here, we introduce LocalFACE, a model-agnostic technique that +composes feasible and actionable counterfactual explanations using +locally-acquired information at each step of the algorithmic recourse. Our +explainer preserves the privacy of users by only leveraging data that it +specifically requires to construct actionable algorithmic recourse, and +protects the model by offering transparency solely in the regions deemed +necessary for the intervention. + +
+
+ comment: 7 pages, 5 figures, 3 appendix pages +
+
+
+
+
+ + ☆ Towards Mitigating Architecture Overfitting in Dataset Distillation + + +
+ Dataset distillation methods have demonstrated remarkable performance for +neural networks trained with very limited training data. However, a significant +challenge arises in the form of architecture overfitting: the distilled +training data synthesized by a specific network architecture (i.e., training +network) generates poor performance when trained by other network architectures +(i.e., test networks). This paper addresses this issue and proposes a series of +approaches in both architecture designs and training schemes which can be +adopted together to boost the generalization performance across different +network architectures on the distilled training data. We conduct extensive +experiments to demonstrate the effectiveness and generality of our methods. +Particularly, across various scenarios involving different sizes of distilled +data, our approaches achieve comparable or superior performance to existing +methods when training on the distilled data using networks with larger +capacities. + +
+
+
+
+
+ + ☆ Leveraging Prototype Patient Representations with Feature-Missing-Aware + Calibration to Mitigate EHR Data Sparsity + + +
+ Electronic Health Record (EHR) data frequently exhibits sparse characteristics, posing challenges for predictive modeling. Current direct imputation approaches, such as matrix imputation, hinge on referencing analogous rows or columns to complete raw missing data and do not differentiate between imputed and actual values. As a result, models may inadvertently incorporate irrelevant or deceptive information with respect to the prediction objective, thereby compromising the efficacy of downstream performance. While some methods strive to recalibrate or augment EHR embeddings after direct imputation, they often mistakenly prioritize imputed features. This misprioritization can introduce biases or inaccuracies into the model. To tackle these issues, our work resorts to indirect imputation, where we leverage prototype representations from similar patients to obtain a denser embedding. Recognizing the limitation that missing features are typically treated the same as present ones when measuring similar patients, our approach designs a feature confidence learner module. This module is sensitive to the missing feature status, enabling the model to better judge the reliability of each feature. Moreover, we propose a novel patient similarity metric that takes feature confidence into account, ensuring that evaluations are not based merely on potentially inaccurate imputed values. Consequently, our work captures dense prototype patient representations with a feature-missing-aware calibration process. Comprehensive experiments demonstrate that the designed model surpasses established EHR-focused models with a statistically significant improvement on the in-hospital mortality prediction task of the MIMIC-III and MIMIC-IV datasets. The code is publicly available at \url{https://anonymous.4open.science/r/SparseEHR} to ensure reproducibility.
+
+
+
+
+ + ☆ A Deep Learning Method for Sensitivity Enhancement of Deuterium + Metabolic Imaging (DMI) + + +
+ Purpose: Common to most MRSI techniques, the spatial resolution and the +minimal scan duration of Deuterium Metabolic Imaging (DMI) are limited by the +achievable SNR. This work presents a deep learning method for sensitivity +enhancement of DMI. + Methods: A convolutional neural network (CNN) was designed to estimate the +2H-labeled metabolite concentrations from low SNR and distorted DMI FIDs. The +CNN was trained with synthetic data that represent a range of SNR levels +typically encountered in vivo. The estimation precision was further improved by +fine-tuning the CNN with MRI-based edge-preserving regularization for each DMI +dataset. The proposed processing method, PReserved Edge ConvolutIonal neural +network for Sensitivity Enhanced DMI (PRECISE-DMI), was applied to simulation +studies and in vivo experiments to evaluate the anticipated improvements in SNR +and investigate the potential for inaccuracies. + Results: PRECISE-DMI visually improved the metabolic maps of low SNR +datasets, and quantitatively provided higher precision than the standard +Fourier reconstruction. Processing of DMI data acquired in rat brain tumor +models resulted in more precise determination of 2H-labeled lactate and +glutamate + glutamine levels, at increased spatial resolution (from >8 to 2 +$\mu$L) or shortened scan time (from 32 to 4 min) compared to standard +acquisitions. However, rigorous SD-bias analyses showed that overuse of the +edge-preserving regularization can compromise the accuracy of the results. + Conclusion: PRECISE-DMI allows a flexible trade-off between enhancing the +sensitivity of DMI and minimizing the inaccuracies. With typical settings, the +DMI sensitivity can be improved by 3-fold while retaining the capability to +detect local signal variations. + +
+
+
+
+
+ + ☆ Sample-Efficient Co-Design of Robotic Agents Using Multi-fidelity + Training on Universal Policy Network + + +
+ Co-design involves simultaneously optimizing the controller and the agent's physical design. Its inherent bi-level optimization formulation necessitates an outer-loop design optimization driven by an inner-loop control optimization. This can be challenging when the design space is large and each design evaluation involves a data-intensive reinforcement learning process for control optimization. To improve sample efficiency, we propose a multi-fidelity-based design exploration strategy based on Hyperband where we tie the controllers learnt across the design spaces through a universal policy learner for warm-starting the subsequent controller learning problems. Further, we recommend a particular way of traversing the Hyperband-generated design matrix that ensures that the stochasticity of the Hyperband is reduced the most with the increasing warm-starting effect of the universal policy learner as it is strengthened with each new design evaluation. Experiments performed on a wide range of agent design problems demonstrate the superiority of our method compared to the baselines. Additionally, analysis of the optimized designs shows interesting design alterations including design simplifications and non-intuitive alterations that have emerged in the biological world.
+
+ comment: 17 pages, 10 figures +
+
+
+
+
+ + ☆ Curve Your Attention: Mixed-Curvature Transformers for Graph + Representation Learning + + +
+ Real-world graphs naturally exhibit hierarchical or cyclical structures that are unfit for the typical Euclidean space. While there exist graph neural networks that leverage hyperbolic or spherical spaces to learn representations that embed such structures more accurately, these methods are confined to the message-passing paradigm, making the models vulnerable to side-effects such as oversmoothing and oversquashing. More recent work has proposed global attention-based graph Transformers that can easily model long-range interactions, but their extensions towards non-Euclidean geometry remain unexplored. To bridge this gap, we propose Fully Product-Stereographic Transformer, a generalization of Transformers towards operating entirely on the product of constant curvature spaces. When combined with tokenized graph Transformers, our model can learn the curvature appropriate for the input graph in an end-to-end fashion, without the need for additional tuning on different curvature initializations. We also provide a kernelized approach to non-Euclidean attention, which enables our model to run in time and memory cost linear in the number of nodes and edges while respecting the underlying geometry. Experiments on graph reconstruction and node classification demonstrate the benefits of generalizing Transformers to the non-Euclidean domain.
+
+ comment: 19 pages, 7 figures +
+
+
+
+
+ + ☆ UER: A Heuristic Bias Addressing Approach for Online Continual Learning ACM MM2023 + + +
+ Online continual learning aims to continuously train neural networks from a continuous data stream with a single pass through the data. As the most effective approach, rehearsal-based methods replay part of the previous data. Commonly used predictors in existing methods tend to generate biased dot-product logits that prefer the classes of the current data, which is known as a bias issue and a phenomenon of forgetting. Many approaches have been proposed to overcome the forgetting problem by correcting the bias; however, they still need to be improved in an online fashion. In this paper, we try to address the bias issue with a more straightforward and more efficient method. By decomposing the dot-product logits into an angle factor and a norm factor, we empirically find that the bias problem mainly occurs in the angle factor, which can be used to learn novel knowledge as cosine logits. On the contrary, the norm factor abandoned by existing methods helps remember historical knowledge. Based on this observation, we intuitively propose to leverage the norm factor to balance the new and old knowledge for addressing the bias. To this end, we develop a heuristic approach called unbias experience replay (UER). UER learns current samples only by the angle factor and further replays previous samples by both the norm and angle factors. Extensive experiments on three datasets show that UER achieves superior performance over various state-of-the-art methods. The code is available at https://github.com/FelixHuiweiLin/UER.
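The decomposition itself is straightforward; a small sketch (not the released UER code) of splitting dot-product logits into angle and norm factors:

```python
import torch

def decompose_logits(features, weight):
    """Split dot-product logits w.x into a norm factor ||w||*||x|| and an
    angle factor cos(theta), as in the decomposition described above."""
    dot = features @ weight.t()                                      # (batch, classes)
    norm = features.norm(dim=1, keepdim=True) * weight.norm(dim=1)   # ||x|| * ||w||
    cosine = dot / norm.clamp_min(1e-12)                             # angle factor
    return cosine, norm

features = torch.randn(4, 64)
weight = torch.randn(10, 64)              # the rows of a linear classifier's weight matrix
cos, norm = decompose_logits(features, weight)
reconstructed = cos * norm                # equals features @ weight.t() up to rounding
print(torch.allclose(reconstructed, features @ weight.t(), atol=1e-5))
```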
+
+ comment: 9 pages, 12 figures, ACM MM2023 +
+
+
+
+
+ + ☆ Enabling the Evaluation of Driver Physiology Via Vehicle Dynamics + + +
+ Driving is a daily routine for many individuals across the globe. This paper +presents the configuration and methodologies used to transform a vehicle into a +connected ecosystem capable of assessing driver physiology. We integrated an +array of commercial sensors from the automotive and digital health sectors +along with driver inputs from the vehicle itself. This amalgamation of sensors +allows for meticulous recording of the external conditions and driving +maneuvers. These data streams are processed to extract key parameters, +providing insights into driver behavior in relation to their external +environment and illuminating vital physiological responses. This innovative +driver evaluation system holds the potential to amplify road safety. Moreover, +when paired with data from conventional health settings, it may enhance early +detection of health-related complications. + +
+
+ comment: 7 pages, 11 figures, 2023 IEEE International Conference on Digital + Health (ICDH) +
+
+
+
+
+ + ☆ Riemannian Langevin Monte Carlo schemes for sampling PSD matrices with + fixed rank + + +
+ This paper introduces two explicit schemes to sample matrices from Gibbs +distributions on $\mathcal S^{n,p}_+$, the manifold of real positive +semi-definite (PSD) matrices of size $n\times n$ and rank $p$. Given an energy +function $\mathcal E:\mathcal S^{n,p}_+\to \mathbb{R}$ and certain Riemannian +metrics $g$ on $\mathcal S^{n,p}_+$, these schemes rely on an Euler-Maruyama +discretization of the Riemannian Langevin equation (RLE) with Brownian motion +on the manifold. We present numerical schemes for RLE under two fundamental +metrics on $\mathcal S^{n,p}_+$: (a) the metric obtained from the embedding of +$\mathcal S^{n,p}_+ \subset \mathbb{R}^{n\times n} $; and (b) the +Bures-Wasserstein metric corresponding to quotient geometry. We also provide +examples of energy functions with explicit Gibbs distributions that allow +numerical validation of these schemes. + +
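As a simplified building block (not the paper's metric-specific discretizations), one can run Euler-Maruyama Langevin steps on a factor $Y$ with $X = YY^\top$, which keeps the iterates PSD with rank at most $p$:

```python
import numpy as np

rng = np.random.default_rng(0)

def langevin_factor_step(Y, grad_E, step=1e-3):
    """One Euler-Maruyama Langevin step on the factor Y (X = Y Y^T stays PSD with
    rank <= p).  A simplified factorized scheme for illustration only; the paper's
    schemes discretize the Riemannian Langevin equation under specific metrics."""
    noise = rng.normal(size=Y.shape)
    return Y - step * grad_E(Y) + np.sqrt(2 * step) * noise

# Toy energy E(X) = 0.5 * ||X - A||_F^2 with X = Y Y^T; its gradient w.r.t. Y is 2 (X - A) Y.
n, p = 5, 2
A = np.eye(n)
grad_E = lambda Y: 2 * (Y @ Y.T - A) @ Y

Y = rng.normal(size=(n, p))
for _ in range(5000):
    Y = langevin_factor_step(Y, grad_E)
X = Y @ Y.T
print("eigenvalues of a sample:", np.round(np.linalg.eigvalsh(X), 3))  # all >= 0, rank <= 2
```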
+
+
+
+
+ + ☆ 3D Denoisers are Good 2D Teachers: Molecular Pretraining via Denoising + and Cross-Modal Distillation + + +
+ Pretraining molecular representations from large unlabeled data is essential for molecular property prediction due to the high cost of obtaining ground-truth labels. While there exist various 2D graph-based molecular pretraining approaches, these methods struggle to show statistically significant gains in predictive performance. Recent work has thus instead proposed 3D conformer-based pretraining under the task of denoising, which led to promising results. During downstream finetuning, however, models trained with 3D conformers require accurate atom coordinates of previously unseen molecules, which are computationally expensive to acquire at scale. In light of this limitation, we propose D&D, a self-supervised molecular representation learning framework that pretrains a 2D graph encoder by distilling representations from a 3D denoiser. With denoising followed by cross-modal knowledge distillation, our approach enjoys the use of knowledge obtained from denoising as well as painless application to downstream tasks with no access to accurate conformers. Experiments on real-world molecular property prediction datasets show that the graph encoder trained via D&D can infer 3D information based on the 2D graph and shows superior performance and label efficiency compared to other baselines.
+
+ comment: 16 pages, 5 figures +
+
+
+
+
+ + ♻ ☆ Soft-Bellman Equilibrium in Affine Markov Games: Forward Solutions and + Inverse Learning + + +
+ Markov games model interactions among multiple players in a stochastic, +dynamic environment. Each player in a Markov game maximizes its expected total +discounted reward, which depends upon the policies of the other players. We +formulate a class of Markov games, termed affine Markov games, where an affine +reward function couples the players' actions. We introduce a novel solution +concept, the soft-Bellman equilibrium, where each player is boundedly rational +and chooses a soft-Bellman policy rather than a purely rational policy as in +the well-known Nash equilibrium concept. We provide conditions for the +existence and uniqueness of the soft-Bellman equilibrium and propose a +nonlinear least-squares algorithm to compute such an equilibrium in the forward +problem. We then solve the inverse game problem of inferring the players' +reward parameters from observed state-action trajectories via a +projected-gradient algorithm. Experiments in a predator-prey OpenAI Gym +environment show that the reward parameters inferred by the proposed algorithm +outperform those inferred by a baseline algorithm: they reduce the +Kullback-Leibler divergence between the equilibrium policies and observed +policies by at least two orders of magnitude. + +
+
+
+
+
+ + ♻ ☆ LoopTune: Optimizing Tensor Computations with Reinforcement Learning + + +
+ Advanced compiler technology is crucial for enabling machine learning applications to run on novel hardware, but traditional compilers fail to deliver performance, popular auto-tuners have long search times, and expert-optimized libraries introduce unsustainable costs. To address this, we developed LoopTune, a deep reinforcement learning compiler that optimizes tensor computations in deep learning models for the CPU. LoopTune optimizes tensor traversal order while using the ultra-fast lightweight code generator LoopNest to perform hardware-specific optimizations. With a novel graph-based representation and action space, LoopTune speeds up LoopNest by 3.2x, generating an order of magnitude faster code than TVM, 2.8x faster than MetaSchedule, and 1.08x faster than AutoTVM, consistently performing at the level of the hand-tuned library Numpy. Moreover, LoopTune tunes code on the order of seconds.
+
+
+
+
+ + ♻ ☆ Frequentist Regret Bounds for Randomized Least-Squares Value Iteration + + +
+ We consider the exploration-exploitation dilemma in finite-horizon reinforcement learning (RL). When the state space is large or continuous, traditional tabular approaches are infeasible and some form of function approximation is mandatory. In this paper, we introduce an optimistically-initialized variant of the popular randomized least-squares value iteration (RLSVI), a model-free algorithm where exploration is induced by perturbing the least-squares approximation of the action-value function. Under the assumption that the Markov decision process has low-rank transition dynamics, we prove that the frequentist regret of RLSVI is upper-bounded by $\widetilde O(d^2 H^2 \sqrt{T})$, where $d$ is the feature dimension, $H$ is the horizon, and $T$ is the total number of steps. To the best of our knowledge, this is the first frequentist regret analysis for randomized exploration with function approximation.
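The core randomization step of RLSVI-style exploration can be sketched as a perturbed ridge regression for the value-function weights (a generic sketch on toy data; the optimistic initialization analyzed in the paper is not shown):

```python
import numpy as np

rng = np.random.default_rng(0)

def perturbed_lsq_weights(Phi, targets, lam=1.0, sigma=1.0):
    """RLSVI-style step: solve ridge regression for Q-function weights, then sample
    the weights from a Gaussian centred at the solution, with covariance taken from
    the regularized design matrix.  Generic sketch of randomized value iteration."""
    d = Phi.shape[1]
    A = Phi.T @ Phi + lam * np.eye(d)
    mean = np.linalg.solve(A, Phi.T @ targets)
    cov = sigma ** 2 * np.linalg.inv(A)
    return rng.multivariate_normal(mean, cov)

# Toy regression targets standing in for one stage of backward induction.
Phi = rng.normal(size=(500, 8))                       # state-action features
targets = Phi @ rng.normal(size=8) + 0.1 * rng.normal(size=500)
w_sample = perturbed_lsq_weights(Phi, targets)        # one exploratory draw of the Q-weights
print(w_sample.round(2))
```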
+
+ comment: Minor bug fixes +
+
+
+
+
+ + ♻ ☆ Streaming algorithms for evaluating noisy judges on unlabeled data -- + binary classification + + +
+ The evaluation of noisy binary classifiers on unlabeled data is treated as a +streaming task: given a data sketch of the decisions by an ensemble, estimate +the true prevalence of the labels as well as each classifier's accuracy on +them. Two fully algebraic evaluators are constructed to do this. Both are based +on the assumption that the classifiers make independent errors. The first is +based on majority voting. The second, the main contribution of the paper, is +guaranteed to be correct. But how do we know the classifiers are independent on +any given test? This principal/agent monitoring paradox is ameliorated by +exploiting the failures of the independent evaluator to return sensible +estimates. A search for nearly error independent trios is empirically carried +out on the \texttt{adult}, \texttt{mushroom}, and \texttt{two-norm} datasets by +using the algebraic failure modes to reject evaluation ensembles as too +correlated. The searches are refined by constructing a surface in evaluation +space that contains the true value point. The algebra of arbitrarily correlated +classifiers permits the selection of a polynomial subset free of any +correlation variables. Candidate evaluation ensembles are rejected if their +data sketches produce independent estimates too far from the constructed +surface. The results produced by the surviving ensembles can sometimes be as +good as 1\%. But handling even small amounts of correlation remains a +challenge. A Taylor expansion of the estimates produced when independence is +assumed but the classifiers are, in fact, slightly correlated helps clarify how +the independent evaluator has algebraic `blind spots'. + +
+
+ comment: 25 pages, 5 figures. Added extensive discussion about the Platanios + agreement equations and how the independent solution from them is not correct +
+
+
+
+
+ + ♻ ☆ Accurate Neural Network Pruning Requires Rethinking Sparse Optimization + + +
+ Obtaining versions of deep neural networks that are both highly-accurate and +highly-sparse is one of the main challenges in the area of model compression, +and several high-performance pruning techniques have been investigated by the +community. Yet, much less is known about the interaction between sparsity and +the standard stochastic optimization techniques used for training sparse +networks, and most existing work uses standard dense schedules and +hyperparameters for training sparse networks. In this work, we examine the +impact of high sparsity on model training using the standard computer vision +and natural language processing sparsity benchmarks. We begin by showing that +using standard dense training recipes for sparse training is suboptimal, and +results in under-training. We provide new approaches for mitigating this issue +for both sparse pre-training of vision models (e.g. ResNet50/ImageNet) and +sparse fine-tuning of language models (e.g. BERT/GLUE), achieving +state-of-the-art results in both settings in the high-sparsity regime, and +providing detailed analyses for the difficulty of sparse training in both +scenarios. Our work sets a new threshold in terms of the accuracies that can be +achieved under high sparsity, and should inspire further research into +improving sparse model training, to reach higher accuracies under high +sparsity, but also to do so efficiently. + +
+
+
+
+
+ + ♻ ☆ Graph Neural Network-enabled Terahertz-based Flow-guided Nanoscale + Localization + + +
+ Scientific advancements in nanotechnology and advanced materials are paving +the way toward nanoscale devices for in-body precision medicine; comprising +integrated sensing, computing, communication, data and energy storage +capabilities. In the human cardiovascular system, such devices are envisioned +to be passively flowing and continuously sensing for detecting events of +diagnostic interest. The diagnostic value of detecting such events can be +enhanced by assigning to them their physical locations (e.g., body region), +which is the main proposition of flow-guided localization. Current flow-guided +localization approaches suffer from low localization accuracy and they are +by-design unable to localize events within the entire cardiovascular system. +Toward addressing this issue, we propose the utilization of Graph Neural +Networks (GNNs) for this purpose, and demonstrate localization accuracy and +coverage enhancements of our proposal over the existing State of the Art (SotA) +approaches. Based on our evaluation, we provide several design guidelines for +GNN-enabled flow-guided localization. + +
+
+ comment: 6 pages, 5 figures, 1 table, 15 references. arXiv admin note: text + overlap with arXiv:2305.18493 +
+
+
+
+
+ + ♻ ☆ TREE-G: Decision Trees Contesting Graph Neural Networks + + +
+ When dealing with tabular data, models based on decision trees are a popular choice due to their high accuracy on these data types, their ease of application, and explainability properties. However, when it comes to graph-structured data, it is not clear how to apply them effectively, in a way that incorporates the topological information with the tabular data available on the vertices of the graph. To address this challenge, we introduce TREE-G. TREE-G modifies standard decision trees by introducing a novel split function that is specialized for graph data. Not only does this split function incorporate the node features and the topological information, but it also uses a novel pointer mechanism that allows split nodes to use information computed in previous splits. Therefore, the split function adapts to the predictive task and the graph at hand. We analyze the theoretical properties of TREE-G and demonstrate its benefits empirically on multiple graph and vertex prediction benchmarks. In these experiments, TREE-G consistently outperforms other tree-based models and often outperforms other graph-learning algorithms such as Graph Neural Networks (GNNs) and Graph Kernels, sometimes by large margins. Moreover, TREE-G models and their predictions can be explained and visualized.
+
+
+
+
+ + ♻ ☆ Leveraging the Potential of Novel Data in Power Line Communication of + Electricity Grids + + +
+ Electricity grids have become an essential part of daily life, even if they often go unnoticed. We usually only become particularly aware of this dependence when the electricity grid is no longer available. However, significant changes, such as the transition to renewable energy (photovoltaic, wind turbines, etc.) and an increasing number of energy consumers with complex load profiles (electric vehicles, home battery systems, etc.), pose new challenges for the electricity grid. To address these challenges, we propose two first-of-their-kind datasets based on measurements in a broadband powerline communications (PLC) infrastructure. Both datasets, FiN-1 and FiN-2, were collected during real practical use in a part of the German low-voltage grid that supplies around 4.4 million people and comprise more than 13 billion data points collected by more than 5100 sensors. In addition, we present different use cases in asset management, grid state visualization, forecasting, predictive maintenance, and novelty detection to highlight the benefits of these types of data. For these applications, we particularly highlight the use of novel machine learning architectures to extract rich information from real-world data that cannot be captured using traditional approaches. By publishing the first large-scale real-world dataset, we aim to shed light on the previously largely unrecognized potential of PLC data and emphasize machine-learning-based research in low-voltage distribution networks by presenting a variety of different use cases.
+
+
+
+
+ + ♻ ☆ Knowledge-Driven Multi-Agent Reinforcement Learning for Computation + Offloading in Cybertwin-Enabled Internet of Vehicles + + +
+ By offloading computation-intensive tasks of vehicles to roadside units +(RSUs), mobile edge computing (MEC) in the Internet of Vehicles (IoV) can +relieve the onboard computation burden. However, existing model-based task +offloading methods suffer from heavy computational complexity with the increase +of vehicles and data-driven methods lack interpretability. To address these +challenges, in this paper, we propose a knowledge-driven multi-agent +reinforcement learning (KMARL) approach to reduce the latency of task +offloading in cybertwin-enabled IoV. Specifically, in the considered scenario, +the cybertwin serves as a communication agent for each vehicle to exchange +information and make offloading decisions in the virtual space. To reduce the +latency of task offloading, a KMARL approach is proposed to select the optimal +offloading option for each vehicle, where graph neural networks are employed by +leveraging domain knowledge concerning graph-structure communication topology +and permutation invariance into neural networks. Numerical results show that +our proposed KMARL yields higher rewards and demonstrates improved scalability +compared with other methods, benefitting from the integration of domain +knowledge. + +
+
+
+
+
+ + ♻ ☆ LadleNet: Translating Thermal Infrared Images to Visible Light Images + Using A Scalable Two-stage U-Net + + +
+ The translation of thermal infrared (TIR) images to visible light (VI) images +presents a challenging task with potential applications spanning various +domains such as TIR-VI image registration and fusion. Leveraging supplementary +information derived from TIR image conversions can significantly enhance model +performance and generalization across these applications. However, prevailing +issues within this field include suboptimal image fidelity and limited model +scalability. In this paper, we introduce an algorithm, LadleNet, based on the +U-Net architecture. LadleNet employs a two-stage U-Net concatenation structure, +augmented with skip connections and refined feature aggregation techniques, +resulting in a substantial enhancement in model performance. Comprising +'Handle' and 'Bowl' modules, LadleNet's Handle module facilitates the +construction of an abstract semantic space, while the Bowl module decodes this +semantic space to yield mapped VI images. The Handle module exhibits +extensibility by allowing the substitution of its network architecture with +semantic segmentation networks, thereby establishing more abstract semantic +spaces to bolster model performance. Consequently, we propose LadleNet+, which +replaces LadleNet's Handle module with the pre-trained DeepLabv3+ network, +thereby endowing the model with enhanced semantic space construction +capabilities. The proposed method is evaluated and tested on the KAIST dataset, +accompanied by quantitative and qualitative analyses. Compared to existing +methodologies, our approach achieves state-of-the-art performance in terms of +image clarity and perceptual quality. The source code will be made available at +https://github.com/Ach-1914/LadleNet/tree/main/. + +
+
+
+
+
+ + ♻ ☆ Improving Expressivity of Graph Neural Networks using Localization + + +
+ In this paper, we propose localized versions of Weisfeiler-Leman (WL) +algorithms in an effort to both increase the expressivity, as well as decrease +the computational overhead. We focus on the specific problem of subgraph +counting and give localized versions of $k-$WL for any $k$. We analyze the +power of Local $k-$WL and prove that it is more expressive than $k-$WL and at +most as expressive as $(k+1)-$WL. We give a characterization of patterns whose +count as a subgraph and induced subgraph are invariant if two graphs are Local +$k-$WL equivalent. We also introduce two variants of $k-$WL: Layer $k-$WL and +recursive $k-$WL. These methods are more time and space efficient than applying +$k-$WL on the whole graph. We also propose a fragmentation technique that +guarantees the exact count of all induced subgraphs of size at most 4 using +just $1-$WL. The same idea can be extended further for larger patterns using +$k>1$. We also compare the expressive power of Local $k-$WL with other GNN +hierarchies and show that given a bound on the time-complexity, our methods are +more expressive than the ones mentioned in Papp and Wattenhofer[2022a]. + +
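For reference, the classical $1$-WL color-refinement routine that these localized variants build on can be written in a few lines (a standard textbook sketch, not the paper's code); the example also shows its well-known blind spot of not distinguishing a 6-cycle from two triangles:

```python
from collections import defaultdict

def wl_1(adj, rounds=3):
    """Classic 1-WL color refinement: repeatedly hash each node's color together
    with the multiset of its neighbours' colors."""
    colors = {v: 0 for v in adj}
    for _ in range(rounds):
        signatures = {v: (colors[v], tuple(sorted(colors[u] for u in adj[v]))) for v in adj}
        relabel = {sig: i for i, sig in enumerate(sorted(set(signatures.values())))}
        colors = {v: relabel[signatures[v]] for v in adj}
    return colors

def histogram(colors):
    h = defaultdict(int)
    for c in colors.values():
        h[c] += 1
    return dict(h)

cycle6 = {i: [(i - 1) % 6, (i + 1) % 6] for i in range(6)}
two_triangles = {0: [1, 2], 1: [0, 2], 2: [0, 1], 3: [4, 5], 4: [3, 5], 5: [3, 4]}
print(histogram(wl_1(cycle6)), histogram(wl_1(two_triangles)))
# Identical color histograms: plain 1-WL cannot tell a 6-cycle from two disjoint triangles.
```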
+
+
+
+
+ + ♻ ☆ Learning a Consensus Sub-Network with Polarization Regularization and + One Pass Training + + +
+ The subject of green AI has been gaining attention within the deep learning +community given the recent trend of ever larger and more complex neural network +models. Existing solutions for reducing the computational load of training at +inference time usually involve pruning the network parameters. Pruning schemes +often create extra overhead either by iterative training and fine-tuning for +static pruning or repeated computation of a dynamic pruning graph. We propose a +new parameter pruning strategy for learning a lighter-weight sub-network that +minimizes the energy cost while maintaining comparable performance to the fully +parameterised network on given downstream tasks. Our proposed pruning scheme is +green-oriented, as it only requires a one-off training to discover the optimal +static sub-networks by dynamic pruning methods. The pruning scheme consists of +a binary gating module and a novel loss function to uncover sub-networks with +user-defined sparsity. Our method enables pruning and training simultaneously, +which saves energy in both the training and inference phases and avoids extra +computational overhead from gating modules at inference time. Our results on +CIFAR-10 and CIFAR-100 suggest that our scheme can remove 50% of connections in +deep networks with less than 1% reduction in classification accuracy. Compared +to other related pruning methods, our method demonstrates a lower drop in +accuracy for equivalent reductions in computational cost. + +
+
+
+
+
+ + ♻ ☆ Driver Profiling and Bayesian Workload Estimation Using Naturalistic + Peripheral Detection Study Data + + +
+ Monitoring drivers' mental workload facilitates initiating and maintaining +safe interactions with in-vehicle information systems, and thus delivers +adaptive human machine interaction with reduced impact on the primary task of +driving. In this paper, we tackle the problem of workload estimation from +driving performance data. First, we present a novel on-road study for +collecting subjective workload data via a modified peripheral detection task in +naturalistic settings. Key environmental factors that induce a high mental +workload are identified via video analysis, e.g. junctions and behaviour of +vehicle in front. Second, a supervised learning framework using +state-of-the-art time series classifiers (e.g. convolutional neural network and +transform techniques) is introduced to profile drivers based on the average +workload they experience during a journey. A Bayesian filtering approach is +then proposed for sequentially estimating, in (near) real-time, the driver's +instantaneous workload. This computationally efficient and flexible method can +be easily personalised to a driver (e.g. incorporate their inferred average +workload profile), adapted to driving/environmental contexts (e.g. road type) +and extended with data streams from new sources. The efficacy of the presented +profiling and instantaneous workload estimation approaches are demonstrated +using the on-road study data, showing $F_{1}$ scores of up to 92% and 81%, +respectively. + +
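A minimal sketch of the sequential estimation machinery (a generic discrete Bayes filter with assumed workload levels, transition matrix, and observation likelihoods; the paper's model is richer and personalised):

```python
import numpy as np

def bayes_filter_step(belief, transition, likelihood):
    """One predict/update step of a discrete Bayes filter over workload levels."""
    predicted = transition.T @ belief            # predict: propagate belief through the dynamics
    posterior = likelihood * predicted           # update: weight by the observation likelihood
    return posterior / posterior.sum()

states = ["low", "medium", "high"]               # hypothetical workload levels
transition = np.array([[0.80, 0.15, 0.05],       # row i: transition probabilities out of state i
                       [0.10, 0.80, 0.10],
                       [0.05, 0.15, 0.80]])
# Likelihood of the current driving-performance observation under each level (assumed values).
likelihood = np.array([0.1, 0.3, 0.9])

belief = np.array([1 / 3, 1 / 3, 1 / 3])
for _ in range(3):                               # three consecutive "high-workload" observations
    belief = bayes_filter_step(belief, transition, likelihood)
print(dict(zip(states, belief.round(3))))        # posterior mass shifts towards "high"
```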
+
+ comment: Accepted for IEEE Transactions on Intelligent Vehicles +
+
+
+
+
+ + ♻ ☆ Information Processing Equalities and the Information-Risk Bridge + + +
+ We introduce two new classes of measures of information for statistical experiments which generalise and subsume $\phi$-divergences, integral probability metrics, $\mathfrak{N}$-distances (MMD), and $(f,\Gamma)$ divergences between two or more distributions. This enables us to derive a simple geometrical relationship between measures of information and the Bayes risk of a statistical decision problem, thus extending the variational $\phi$-divergence representation to multiple distributions in an entirely symmetric manner. The new families of divergences are closed under the action of Markov operators, which yields an information processing equality that is a refinement and generalisation of the classical data processing inequality. This equality gives insight into the significance of the choice of the hypothesis class in classical risk minimization.
+
+ comment: 48 pages; corrected some typos and added a few additional + explanations +
+
+
+
+
+ + ♻ ☆ Avoid Adversarial Adaption in Federated Learning by Multi-Metric + Investigations + + +
+ Federated Learning (FL) facilitates decentralized machine learning model training, preserving data privacy, lowering communication costs, and boosting model performance through diversified data sources. Yet, FL faces vulnerabilities such as poisoning attacks, undermining model integrity with both untargeted performance degradation and targeted backdoor attacks. Preventing backdoors proves especially challenging due to their stealthy nature. Prominent mitigation techniques against poisoning attacks rely on monitoring certain metrics and filtering malicious model updates. While these techniques have been shown effective in evaluations, we argue that previous works did not consider realistic real-world adversaries and data distributions. We define a new notion of strong adaptive adversaries, capable of adapting to multiple objectives simultaneously. Through extensive empirical tests, we show that existing defense methods can be easily circumvented in this adversary model. We also demonstrate that existing defenses have limited effectiveness when no assumptions are made about underlying data distributions. We introduce Metric-Cascades (MESAS), a novel defense method for more realistic scenarios and adversary models. MESAS employs multiple detection metrics simultaneously to identify poisoned model updates, creating a complex multi-objective optimization problem for adaptive attackers. In our extensive evaluation featuring nine backdoors and three datasets, MESAS consistently detects even strong adaptive attackers. Furthermore, MESAS outperforms existing defenses in distinguishing backdoors from data distribution-related distortions within and across clients. MESAS is the first defense robust against strong adaptive adversaries, effective in real-world data scenarios, with an average overhead of just 24.37 seconds.
+
+ comment: 25 pages, 14 figures, 23 tables, 11 equations +
+
+
+
+
+ + ♻ ☆ Kernelized Normalizing Flows + + +
+ Normalising Flows are generative models characterised by their invertible +architecture. However, the requirement of invertibility imposes constraints on +their expressiveness, necessitating a large number of parameters and innovative +architectural designs to achieve satisfactory outcomes. Whilst flow-based +models predominantly rely on neural-network-based transformations for +expressive designs, alternative transformation methods have received limited +attention. In this work, we present Ferumal flow, a novel kernelised +normalising flow paradigm that integrates kernels into the framework. Our +results demonstrate that a kernelised flow can yield competitive or superior +results compared to neural network-based flows whilst maintaining parameter +efficiency. Kernelised flows excel especially in the low-data regime, enabling +flexible non-parametric density estimation in applications with sparse data +availability. + +
+
+
+
+
+ + ♻ ☆ Verifiable Learning for Robust Tree Ensembles CCS 2023 + + +
+ Verifying the robustness of machine learning models against evasion attacks +at test time is an important research problem. Unfortunately, prior work +established that this problem is NP-hard for decision tree ensembles, hence +bound to be intractable for specific inputs. In this paper, we identify a +restricted class of decision tree ensembles, called large-spread ensembles, +which admit a security verification algorithm running in polynomial time. We +then propose a new approach called verifiable learning, which advocates the +training of such restricted model classes which are amenable for efficient +verification. We show the benefits of this idea by designing a new training +algorithm that automatically learns a large-spread decision tree ensemble from +labelled data, thus enabling its security verification in polynomial time. +Experimental results on public datasets confirm that large-spread ensembles +trained using our algorithm can be verified in a matter of seconds, using +standard commercial hardware. Moreover, large-spread ensembles are more robust +than traditional ensembles against evasion attacks, at the cost of an +acceptable loss of accuracy in the non-adversarial setting. + +
+
+ comment: 19 pages, 5 figures; full version of the revised paper accepted at + ACM CCS 2023 +
+
+
+
+
+ + ♻ ☆ Text-to-SQL Empowered by Large Language Models: A Benchmark Evaluation + + +
+ Large language models (LLMs) have emerged as a new paradigm for the Text-to-SQL task. However, the absence of a systematic benchmark inhibits the development of effective, efficient and economic LLM-based Text-to-SQL solutions. To address this challenge, in this paper, we first conduct a systematic and extensive comparison over existing prompt engineering methods, including question representation, example selection and example organization, and with these experimental results, we elaborate on their pros and cons. Based on these findings, we propose a new integrated solution, named DAIL-SQL, which refreshes the Spider leaderboard with 86.6% execution accuracy and sets a new bar. To explore the potential of open-source LLMs, we investigate them in various scenarios, and further enhance their performance with supervised fine-tuning. Our explorations highlight open-source LLMs' potential in Text-to-SQL, as well as the advantages and disadvantages of supervised fine-tuning. Additionally, towards an efficient and economic LLM-based Text-to-SQL solution, we emphasize token efficiency in prompt engineering and compare the prior studies under this metric. We hope that our work provides a deeper understanding of Text-to-SQL with LLMs, and inspires further investigations and broad applications.
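To make the three prompt-engineering knobs named above concrete (question representation, example selection, example organization), here is a deliberately simplified sketch; it is not DAIL-SQL's actual representation or selection strategy, and the schema, demonstrations and word-overlap heuristic are hypothetical.

```python
def build_prompt(question, schema, examples, k=2):
    """Assemble a Text-to-SQL prompt: schema + a few selected demonstrations + the question."""
    # Example selection: here simply the k examples whose questions share the most words
    # with the target question (the paper studies far more careful strategies).
    def overlap(ex):
        return len(set(ex["question"].lower().split()) & set(question.lower().split()))
    selected = sorted(examples, key=overlap, reverse=True)[:k]

    # Example organization: full question/SQL pairs, one block per demonstration.
    demos = "\n\n".join(f"-- Question: {ex['question']}\n{ex['sql']}" for ex in selected)

    # Question representation: schema as CREATE TABLE statements followed by the question.
    return f"{schema}\n\n{demos}\n\n-- Question: {question}\nSELECT"

schema = "CREATE TABLE singer(singer_id INT, name TEXT, age INT);"
examples = [
    {"question": "How many singers are there?", "sql": "SELECT count(*) FROM singer;"},
    {"question": "List all singer names.", "sql": "SELECT name FROM singer;"},
]
print(build_prompt("What is the average age of all singers?", schema, examples))
```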
+
+ comment: We have released code on https://github.com/BeachWang/DAIL-SQL +
+
+
+
+
+ + ♻ ☆ On the Robustness of Post-hoc GNN Explainers to Label Noise + + +
+ Proposed as a solution to the inherent black-box limitations of graph neural +networks (GNNs), post-hoc GNN explainers aim to provide precise and insightful +explanations of the behaviours exhibited by trained GNNs. Despite their recent +notable advancements in academic and industrial contexts, the robustness of +post-hoc GNN explainers remains unexplored when confronted with label noise. To +bridge this gap, we conduct a systematic empirical investigation to evaluate +the efficacy of diverse post-hoc GNN explainers under varying degrees of label +noise. Our results reveal several key insights: Firstly, post-hoc GNN +explainers are susceptible to label perturbations. Secondly, even minor levels +of label noise, inconsequential to GNN performance, harm the quality of +generated explanations substantially. Lastly, we engage in a discourse +regarding the progressive recovery of explanation effectiveness with escalating +noise levels. + +
+
+
+
+
+ + ♻ ☆ Neuroevolution is a Competitive Alternative to Reinforcement Learning + for Skill Discovery ICLR2023 + + +
+ Deep Reinforcement Learning (RL) has emerged as a powerful paradigm for +training neural policies to solve complex control tasks. However, these +policies tend to be overfit to the exact specifications of the task and +environment they were trained on, and thus do not perform well when conditions +deviate slightly or when composed hierarchically to solve even more complex +tasks. Recent work has shown that training a mixture of policies, as opposed to +a single one, that are driven to explore different regions of the state-action +space can address this shortcoming by generating a diverse set of behaviors, +referred to as skills, that can be collectively used to great effect in +adaptation tasks or for hierarchical planning. This is typically realized by +including a diversity term - often derived from information theory - in the +objective function optimized by RL. However these approaches often require +careful hyperparameter tuning to be effective. In this work, we demonstrate +that less widely-used neuroevolution methods, specifically Quality Diversity +(QD), are a competitive alternative to information-theory-augmented RL for +skill discovery. Through an extensive empirical evaluation comparing eight +state-of-the-art algorithms (four flagship algorithms from each line of work) +on the basis of (i) metrics directly evaluating the skills' diversity, (ii) the +skills' performance on adaptation tasks, and (iii) the skills' performance when +used as primitives for hierarchical planning; QD methods are found to provide +equal, and sometimes improved, performance whilst being less sensitive to +hyperparameters and more scalable. As no single method is found to provide +near-optimal performance across all environments, there is a rich scope for +further research which we support by proposing future directions and providing +optimized open-source implementations. + +
+
+ comment: Camera ready version for ICLR2023 (spotlight) +
+
+
+
+
+ + ♻ ☆ A Robust Negative Learning Approach to Partial Domain Adaptation Using + Source Prototypes + + +
+ This work proposes a robust Partial Domain Adaptation (PDA) framework that +mitigates the negative transfer problem by incorporating a robust +target-supervision strategy. It leverages ensemble learning and includes +diverse, complementary label feedback, alleviating the effect of incorrect +feedback and promoting pseudo-label refinement. Rather than relying exclusively +on first-order moments for distribution alignment, our approach offers explicit +objectives to optimize intra-class compactness and inter-class separation with +the inferred source prototypes and highly-confident target samples in a +domain-invariant fashion. Notably, we ensure source data privacy by eliminating +the need to access the source data during the adaptation phase through a priori +inference of source prototypes. We conducted a series of comprehensive +experiments, including an ablation analysis, covering a range of partial domain +adaptation tasks. Comprehensive evaluations on benchmark datasets corroborate +our framework's enhanced robustness and generalization, demonstrating its +superiority over existing state-of-the-art PDA approaches. + +
+
+
+
+
+ + ♻ ☆ Quantum-Inspired Machine Learning: a Survey + + +
+ Quantum-inspired Machine Learning (QiML) is a burgeoning field, receiving +global attention from researchers for its potential to leverage principles of +quantum mechanics within classical computational frameworks. However, current +review literature often presents a superficial exploration of QiML, focusing +instead on the broader Quantum Machine Learning (QML) field. In response to +this gap, this survey provides an integrated and comprehensive examination of +QiML, exploring QiML's diverse research domains including tensor network +simulations, dequantized algorithms, and others, showcasing recent +advancements, practical applications, and illuminating potential future +research avenues. Further, a concrete definition of QiML is established by +analyzing various prior interpretations of the term and their inherent +ambiguities. As QiML continues to evolve, we anticipate a wealth of future +developments drawing from quantum mechanics, quantum computing, and classical +machine learning, enriching the field further. This survey serves as a guide +for researchers and practitioners alike, providing a holistic understanding of +QiML's current landscape and future directions. + +
+
+ comment: 59 pages, 13 figures, 9 tables. - Edited for spelling, grammar, and + corrected minor typos in formulas - Adjusted wording in places for better + clarity - Corrected contact info - Added Table 1 to clarify variables used in + dequantized algs. - Added subsections in QVAS discussing QCBMs and TN-based + VQC models - Included additional references as requested by authors to ensure + a more exhaustive survey +
+
+
+
+
+ + ♻ ☆ Heterogeneous Federated Learning: State-of-the-art and Research + Challenges + + +
+ Federated learning (FL) has drawn increasing attention owing to its potential +use in large-scale industrial applications. Existing federated learning works +mainly focus on model homogeneous settings. However, practical federated +learning typically faces the heterogeneity of data distributions, model +architectures, network environments, and hardware devices among participant +clients. Heterogeneous Federated Learning (HFL) is much more challenging, and +corresponding solutions are diverse and complex. Therefore, a systematic survey +on this topic about the research challenges and state-of-the-art is essential. +In this survey, we firstly summarize the various research challenges in HFL +from five aspects: statistical heterogeneity, model heterogeneity, +communication heterogeneity, device heterogeneity, and additional challenges. +In addition, recent advances in HFL are reviewed and a new taxonomy of existing +HFL methods is proposed with an in-depth analysis of their pros and cons. We +classify existing methods from three different levels according to the HFL +procedure: data-level, model-level, and server-level. Finally, several critical +and promising future research directions in HFL are discussed, which may +facilitate further developments in this field. A periodically updated +collection on HFL is available at https://github.com/marswhu/HFL_Survey. + +
+
+ comment: 42 pages, 11 figures, and 4 tables +
+
+
+
+
+ + ♻ ☆ Multi-task UNet architecture for end-to-end autonomous driving + + +
+ We propose an end-to-end driving model that integrates a multi-task UNet +(MTUNet) architecture and control algorithms in a pipeline of data flow from a +front camera through this model to driving decisions. It provides quantitative +measures to evaluate the holistic, dynamic, and real-time performance of +end-to-end driving systems and thus the safety and interpretability of MTUNet. +The architecture consists of one segmentation, one regression, and two +classification tasks for lane segmentation, path prediction, and vehicle +controls. We present three variants of the architecture having different +complexities, compare them on different tasks in four static measures for both +single and multiple tasks, and then identify the best one by two additional +dynamic measures in real-time simulation. Our results show that the performance +of the proposed supervised learning model is comparable to that of a +reinforcement learning model on curvy roads for the same task, which is not +end-to-end but multi-module. + +
+
+ comment: 6 pages, 5 figures, a condensation of the previous version +
+
+
+
+
+ + ♻ ☆ Generating and Detecting True Ambiguity: A Forgotten Danger in DNN + Supervision Testing + + +
+ Deep Neural Networks (DNNs) are becoming a crucial component of modern +software systems, but they are prone to fail under conditions that are +different from the ones observed during training (out-of-distribution inputs) +or on inputs that are truly ambiguous, i.e., inputs that admit multiple classes +with nonzero probability in their labels. Recent work proposed DNN supervisors +to detect high-uncertainty inputs before their possible misclassification leads +to any harm. To test and compare the capabilities of DNN supervisors, +researchers proposed test generation techniques, to focus the testing effort on +high-uncertainty inputs that should be recognized as anomalous by supervisors. +However, existing test generators aim to produce out-of-distribution inputs. No +existing model- and supervisor independent technique targets the generation of +truly ambiguous test inputs, i.e., inputs that admit multiple classes according +to expert human judgment. + In this paper, we propose a novel way to generate ambiguous inputs to test +DNN supervisors and used it to empirically compare several existing supervisor +techniques. In particular, we propose AmbiGuess to generate ambiguous samples +for image classification problems. AmbiGuess is based on gradient-guided +sampling in the latent space of a regularized adversarial autoencoder. +Moreover, we conducted what is -- to the best of our knowledge -- the most +extensive comparative study of DNN supervisors, considering their capabilities +to detect 4 distinct types of high-uncertainty inputs, including truly +ambiguous ones. We find that the tested supervisors' capabilities are +complementary: Those best suited to detect true ambiguity perform worse on +invalid, out-of-distribution and adversarial inputs and vice-versa. + +
+
+ comment: Accepted for publication at Springers "Empirical Software + Engineering" (EMSE) +
+
+
+
+
+ + ♻ ☆ An Empirical Evaluation of Temporal Graph Benchmark + + +
+ In this paper, we conduct an empirical evaluation of Temporal Graph Benchmark +(TGB) by extending our Dynamic Graph Library (DyGLib) to TGB. Compared with +TGB, we include eleven popular dynamic graph learning methods for more +exhaustive comparisons. Through the experiments, we find that (1) different +models depict varying performance across various datasets, which is in line +with previous observations; (2) the performance of some baselines can be +significantly improved over the reported results in TGB when using DyGLib. This +work aims to ease the researchers' efforts in evaluating various dynamic graph +learning methods on TGB and attempts to offer results that can be directly +referenced in the follow-up research. All the used resources in this project +are publicly available at https://github.com/yule-BUAA/DyGLib_TGB. This work is +in progress, and feedback from the community is welcomed for improvements. + +
+
+ comment: preprint, in progress, more results are added +
+
+
+
+
+ + ♻ ☆ EENED: End-to-End Neural Epilepsy Detection based on Convolutional + Transformer + + +
+ Recently, Transformer- and convolutional neural network (CNN)-based models have shown promising results in EEG signal processing. Transformer models can capture the global dependencies in EEG signals through a self-attention mechanism, while CNN models can capture local features such as sawtooth waves. In this work, we propose an end-to-end neural epilepsy detection model, EENED, that combines CNN and Transformer. Specifically, by introducing a convolution module into the Transformer encoder, EENED can learn the time-dependent relationships of the patient's EEG signal features and notice local EEG abnormalities closely related to epilepsy, such as the appearance of spikes and the sprinkling of sharp and slow waves. Our proposed framework combines the abilities of the Transformer and CNN to capture features of EEG signals at different scales and holds promise for improving the accuracy and reliability of epilepsy detection. Our source code will be released soon on GitHub.
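As a generic illustration of the combination the abstract describes (self-attention for global dependencies plus convolution for local waveform patterns), a minimal PyTorch block might look as follows; EENED's actual layer layout, dimensions and hyperparameters are not specified here and are assumptions.

```python
import torch
import torch.nn as nn

class ConvTransformerBlock(nn.Module):
    """Self-attention for global context + depthwise convolution for local EEG patterns."""
    def __init__(self, dim=64, heads=4, kernel_size=7):
        super().__init__()
        self.attn_norm = nn.LayerNorm(dim)
        self.attn = nn.MultiheadAttention(dim, heads, batch_first=True)
        self.conv_norm = nn.LayerNorm(dim)
        self.conv = nn.Conv1d(dim, dim, kernel_size, padding=kernel_size // 2, groups=dim)

    def forward(self, x):                      # x: (batch, time, dim)
        h = self.attn_norm(x)
        x = x + self.attn(h, h, h, need_weights=False)[0]
        h = self.conv_norm(x).transpose(1, 2)  # Conv1d expects (batch, dim, time)
        x = x + self.conv(h).transpose(1, 2)
        return x

block = ConvTransformerBlock()
eeg = torch.randn(2, 512, 64)                  # 2 windows, 512 time steps, 64 features
print(block(eeg).shape)                        # torch.Size([2, 512, 64])
```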
+
+ comment: Accepted by IEEE CAI 2023 +
+
+
+
+
+ + ♻ ☆ Classification of integers based on residue classes via modern deep + learning algorithms + + +
+ Judging whether an integer can be divided by prime numbers such as 2 or 3 may +appear trivial to human beings, but can be less straightforward for computers. +Here, we tested multiple deep learning architectures and feature engineering +approaches on classifying integers based on their residues when divided by +small prime numbers. We found that the ability of classification critically +depends on the feature space. We also evaluated Automated Machine Learning +(AutoML) platforms from Amazon, Google and Microsoft, and found that they +failed on this task without appropriately engineered features. Furthermore, we +introduced a method that utilizes linear regression on Fourier series basis +vectors, and demonstrated its effectiveness. Finally, we evaluated Large +Language Models (LLMs) such as GPT-4, GPT-J, LLaMA and Falcon, and demonstrated +their failures. In conclusion, feature engineering remains an important task to +improve performance and increase interpretability of machine-learning models, +even in the era of AutoML and LLMs. + +
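A small sketch of the Fourier-basis idea mentioned above: one-hot residue targets are exactly linear in sine/cosine features of n mod p, so plain least squares suffices once the feature space is right. The modulus, sample sizes and use of ordinary least squares are illustrative assumptions, not the paper's exact setup.

```python
import numpy as np

p = 7                                           # classify n mod 7
rng = np.random.default_rng(0)
n = rng.integers(0, 1_000_000, size=5000)

def fourier_features(n, p):
    """Bias plus sin/cos of 2*pi*j*n/p for j = 1..p-1; these depend only on n mod p."""
    j = np.arange(1, p)
    angles = 2 * np.pi * np.outer(n % p, j) / p
    return np.hstack([np.ones((len(n), 1)), np.sin(angles), np.cos(angles)])

X = fourier_features(n, p)
Y = np.eye(p)[n % p]                            # one-hot residue targets
W, *_ = np.linalg.lstsq(X, Y, rcond=None)       # ordinary least squares

n_test = rng.integers(0, 1_000_000, size=2000)
pred = fourier_features(n_test, p) @ W
print("accuracy:", (pred.argmax(axis=1) == n_test % p).mean())   # close to 1.0
```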
+
+ comment: Accepted at Patterns +
+
+
+
+
+ + ♻ ☆ Network Revenue Management with Demand Learning and Fair + Resource-Consumption Balancing + + +
+ In addition to maximizing the total revenue, decision-makers in many industries would like to guarantee balanced consumption across different resources. For instance, in the retailing industry, ensuring a balanced consumption of resources from different suppliers enhances fairness and helps maintain a healthy channel relationship; in the cloud computing industry, resource-consumption balance helps increase customer satisfaction and reduce operational costs. Motivated by these practical needs, this paper studies the price-based network revenue management (NRM) problem with both demand learning and fair resource-consumption balancing. We introduce the regularized revenue, i.e., the total revenue with a balancing regularization, as our objective to incorporate fair resource-consumption balancing into the revenue maximization goal. We propose a primal-dual-type online policy with the Upper-Confidence-Bound (UCB) demand learning method to maximize the regularized revenue. We adopt several innovative techniques to make our algorithm a unified and computationally efficient framework for the continuous price set and a wide class of balancing regularizers. Our algorithm achieves a worst-case regret of $\widetilde O(N^{5/2}\sqrt{T})$, where $N$ denotes the number of products and $T$ denotes the number of time periods. Numerical experiments in a few NRM examples demonstrate the effectiveness of our algorithm in simultaneously achieving revenue maximization and fair resource-consumption balancing.
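The paper's policy is a primal-dual method over a continuous price set with a balancing regularizer; the snippet below strips all of that away and only sketches the UCB demand-learning ingredient on a discretized price grid with a made-up purchase model.

```python
import numpy as np

prices = np.array([1.0, 2.0, 3.0, 4.0])                 # toy discretized price grid
true_purchase_prob = np.array([0.9, 0.6, 0.35, 0.15])   # unknown to the seller
T = 5000
rng = np.random.default_rng(1)

counts = np.zeros(len(prices))
mean_revenue = np.zeros(len(prices))
for t in range(1, T + 1):
    # UCB score on expected revenue; play each price once before trusting the bonus.
    bonus = np.sqrt(2 * np.log(t) / np.maximum(counts, 1))
    ucb = mean_revenue + prices * bonus
    arm = t - 1 if t <= len(prices) else int(np.argmax(ucb))
    revenue = prices[arm] * rng.binomial(1, true_purchase_prob[arm])
    counts[arm] += 1
    mean_revenue[arm] += (revenue - mean_revenue[arm]) / counts[arm]

print("price chosen most often:", prices[int(np.argmax(counts))])
```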
+
+ comment: Forthcoming in Production and Operations Management. The original + title is Fairness-aware Network Revenue Management With Demand Learning +
+
+
+
+
+ + ♻ ☆ When Do Program-of-Thoughts Work for Reasoning? + + +
+ The reasoning capabilities of Large Language Models (LLMs) play a pivotal role in the realm of embodied artificial intelligence. Although there are effective methods, such as program-of-thought prompting, which use programming languages to tackle complex reasoning tasks with LLMs, the specific impact of code data on the improvement of reasoning capabilities remains under-explored. To address this gap, we propose the complexity-impacted reasoning score (CIRS), which combines structural and logical attributes, to measure the correlation between code and reasoning abilities. Specifically, we use the abstract syntax tree to encode the structural information and calculate logical complexity by considering the difficulty and the cyclomatic complexity. Through an empirical analysis, we find that not all code data of varying complexity can be learned or understood by LLMs; an optimal level of complexity is critical to the improvement of reasoning abilities by program-aided prompting. We then design an auto-synthesizing and stratifying algorithm, and apply it to instruction generation for mathematical reasoning and code data filtering for code generation tasks. Extensive results demonstrate the effectiveness of our proposed approach. Code will be integrated into the EasyInstruct framework at https://github.com/zjunlp/EasyInstruct.
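The paper defines CIRS with its own weighting of structural and logical attributes; the sketch below only shows how the two raw ingredients the abstract mentions (AST structure and a cyclomatic-style branch count) can be extracted for a Python snippet with the standard `ast` module, and the particular node types counted are an assumption.

```python
import ast

def ast_depth(node):
    """Maximum nesting depth of the abstract syntax tree."""
    children = list(ast.iter_child_nodes(node))
    return 1 + max((ast_depth(c) for c in children), default=0)

def cyclomatic_proxy(tree):
    """1 + number of branching constructs, a common approximation of cyclomatic complexity."""
    branch_types = (ast.If, ast.For, ast.While, ast.Try, ast.With, ast.BoolOp, ast.IfExp)
    return 1 + sum(isinstance(n, branch_types) for n in ast.walk(tree))

code = """
def bucket(xs):
    out = []
    for x in xs:
        if x > 0 and x % 2 == 0:
            out.append(x)
    return out
"""
tree = ast.parse(code)
print("AST node count:", sum(1 for _ in ast.walk(tree)))
print("AST depth:", ast_depth(tree))
print("cyclomatic proxy:", cyclomatic_proxy(tree))
```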
+
+ comment: Work in progress +
+
+
+
+
+ + ♻ ☆ Off-policy Evaluation in Doubly Inhomogeneous Environments + + +
+ This work aims to study off-policy evaluation (OPE) under scenarios where two +key reinforcement learning (RL) assumptions -- temporal stationarity and +individual homogeneity are both violated. To handle the ``double +inhomogeneities", we propose a class of latent factor models for the reward and +observation transition functions, under which we develop a general OPE +framework that consists of both model-based and model-free approaches. To our +knowledge, this is the first paper that develops statistically sound OPE +methods in offline RL with double inhomogeneities. It contributes to a deeper +understanding of OPE in environments, where standard RL assumptions are not +met, and provides several practical approaches in these settings. We establish +the theoretical properties of the proposed value estimators and empirically +show that our approach outperforms competing methods that ignore either +temporal nonstationarity or individual heterogeneity. Finally, we illustrate +our method on a data set from the Medical Information Mart for Intensive Care. + +
+
+
+
+
+ + ♻ ☆ Self-supervised learning-based general laboratory progress pretrained + model for cardiovascular event detection + + +
+ The inherent nature of patient data poses several challenges. Prevalent cases amass substantial longitudinal data owing to their patient volume and consistent follow-ups; however, longitudinal laboratory data are renowned for their irregularity, temporality, absenteeism, and sparsity. In contrast, recruitment for rare or specific cases is often constrained due to their limited patient size and episodic observations. This study employed self-supervised learning (SSL) to pretrain a generalized laboratory progress (GLP) model that captures the overall progression of six common laboratory markers in prevalent cardiovascular cases, with the intention of transferring this knowledge to aid in the detection of a specific cardiovascular event. GLP implemented a two-stage training approach, leveraging the information embedded within interpolated data to amplify the performance of SSL. After GLP pretraining, it is transferred for TVR detection. The proposed two-stage training improved the performance of pure SSL, and the transferability of GLP exhibited distinctiveness. After GLP processing, the classification exhibited a notable enhancement, with averaged accuracy rising from 0.63 to 0.90. All evaluated metrics demonstrated substantial superiority (p < 0.01) compared to those obtained prior to GLP processing. Our study effectively engages in translational engineering by transferring patient progression of cardiovascular laboratory parameters from one patient group to another, transcending the limitations of data availability. The transferability of disease progression optimizes the strategies of examinations and treatments, and improves patient prognosis while using commonly available laboratory parameters. The potential for expanding this approach to encompass other diseases holds great promise.
+
+ comment: published in IEEE Journal of Translational Engineering in Health & + Medicine +
+
+
+
+
+ + ♻ ☆ A Survey on Privacy in Graph Neural Networks: Attacks, Preservation, and + Applications + + +
+ Graph Neural Networks (GNNs) have gained significant attention owing to their +ability to handle graph-structured data and the improvement in practical +applications. However, many of these models prioritize high utility +performance, such as accuracy, with a lack of privacy consideration, which is a +major concern in modern society where privacy attacks are rampant. To address +this issue, researchers have started to develop privacy-preserving GNNs. +Despite this progress, there is a lack of a comprehensive overview of the +attacks and the techniques for preserving privacy in the graph domain. In this +survey, we aim to address this gap by summarizing the attacks on graph data +according to the targeted information, categorizing the privacy preservation +techniques in GNNs, and reviewing the datasets and applications that could be +used for analyzing/solving privacy issues in GNNs. We also outline potential +directions for future research in order to build better privacy-preserving +GNNs. + +
+
+
+
+
+ + ♻ ☆ GPU-Accelerated Verification of Machine Learning Models for Power + Systems + + +
+ Computational tools for rigorously verifying the performance of large-scale +machine learning (ML) models have progressed significantly in recent years. The +most successful solvers employ highly specialized, GPU-accelerated branch and +bound routines. Such tools are crucial for the successful deployment of machine +learning applications in safety-critical systems, such as power systems. +Despite their successes, however, barriers prevent out-of-the-box application +of these routines to power system problems. This paper addresses this issue in +two key ways. First, for the first time to our knowledge, we enable the +simultaneous verification of multiple verification problems (e.g., checking for +the violation of all line flow constraints simultaneously and not by solving +individual verification problems). For that, we introduce an exact +transformation that converts the "worst-case" violation across a set of +potential violations to a series of ReLU-based layers that augment the original +neural network. This allows verifiers to interpret them directly. Second, power +system ML models often must be verified to satisfy power flow constraints. We +propose a dualization procedure which encodes linear equality and inequality +constraints directly into the verification problem; and in a manner which is +mathematically consistent with the specialized verification tools. To +demonstrate these innovations, we verify problems associated with data-driven +security constrained DC-OPF solvers. We build and test our first set of +innovations using the $\alpha,\beta$-CROWN solver, and we benchmark against +Gurobi 10.0. Our contributions achieve a speedup that can exceed 100x and allow +higher degrees of verification flexibility. + +
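The exact transformation in the paper is more general; the snippet below only illustrates the underlying identity max(a, b) = a + ReLU(b - a), which is what allows the worst-case violation over a set of candidate violations to be expressed as a short stack of ReLU layers appended to the network.

```python
import torch

def relu_max(v):
    """Reduce a batch of violation vectors to their maxima using only linear ops and ReLU,
    halving the number of candidates at every layer (pad with the first entry if odd)."""
    while v.shape[1] > 1:
        if v.shape[1] % 2 == 1:
            v = torch.cat([v, v[:, :1]], dim=1)
        a, b = v[:, 0::2], v[:, 1::2]
        v = a + torch.relu(b - a)               # elementwise max(a, b)
    return v[:, 0]

violations = torch.tensor([[0.3, -1.2, 0.7, 0.1, -0.4]])
print(relu_max(violations), violations.max(dim=1).values)   # both give 0.7
```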
+
+
+
+
+ + ♻ ☆ Representation Learning via Manifold Flattening and Reconstruction + + +
+ This work proposes an algorithm for explicitly constructing a pair of neural networks that linearize and reconstruct an embedded submanifold, from finite samples of this manifold. The resulting neural networks, called Flattening Networks (FlatNet), are theoretically interpretable, computationally feasible at scale, and generalize well to test data, a balance not typically found in manifold-based learning methods. We present empirical results and comparisons to other models on synthetic high-dimensional manifold data and 2D image data. Our code is publicly available.
+
+ comment: 44 pages, 19 figures +
+
+
+
+
+ + ♻ ☆ Two-step hyperparameter optimization method: Accelerating hyperparameter + search by using a fraction of a training dataset + + +
+ Hyperparameter optimization (HPO) is an important step in machine learning +(ML) model development, but common practices are archaic -- primarily relying +on manual or grid searches. This is partly because adopting advanced HPO +algorithms introduces added complexity to the workflow, leading to longer +computation times. This poses a notable challenge to ML applications, as +suboptimal hyperparameter selections curtail the potential of ML model +performance, ultimately obstructing the full exploitation of ML techniques. In +this article, we present a two-step HPO method as a strategic solution to +curbing computational demands and wait times, gleaned from practical +experiences in applied ML parameterization work. The initial phase involves a +preliminary evaluation of hyperparameters on a small subset of the training +dataset, followed by a re-evaluation of the top-performing candidate models +post-retraining with the entire training dataset. This two-step HPO method is +universally applicable across HPO search algorithms, and we argue it has +attractive efficiency gains. + As a case study, we present our recent application of the two-step HPO method +to the development of neural network emulators for aerosol activation. Although +our primary use case is a data-rich limit with many millions of samples, we +also find that using up to 0.0025% of the data (a few thousand samples) in the +initial step is sufficient to find optimal hyperparameter configurations from +much more extensive sampling, achieving up to 135-times speedup. The benefits +of this method materialize through an assessment of hyperparameters and model +performance, revealing the minimal model complexity required to achieve the +best performance. The assortment of top-performing models harvested from the +HPO process allows us to choose a high-performing model with a low inference +cost for efficient use in global climate models (GCMs). + +
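A minimal sketch of the two-step idea, assuming a random search, a scikit-learn classifier as the stand-in model, and made-up subset size and top-k: many configurations are screened cheaply on a small fraction of the data, and only the best few are re-evaluated on the full training set.

```python
import numpy as np
from sklearn.datasets import make_classification
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score

rng = np.random.default_rng(0)
X, y = make_classification(n_samples=20000, n_features=20, random_state=0)

def sample_config():
    return {"n_estimators": int(rng.integers(50, 300)),
            "max_depth": int(rng.integers(2, 16)),
            "min_samples_leaf": int(rng.integers(1, 10))}

def score(config, X, y):
    return cross_val_score(RandomForestClassifier(random_state=0, **config), X, y, cv=3).mean()

# Step 1: cheap screening of many configurations on a small fraction of the data.
subset = rng.choice(len(X), size=2000, replace=False)
candidates = [sample_config() for _ in range(20)]
screened = sorted(candidates, key=lambda c: score(c, X[subset], y[subset]), reverse=True)

# Step 2: re-evaluate only the top few candidates on the full training set.
best = max(screened[:3], key=lambda c: score(c, X, y))
print("selected configuration:", best)
```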
+
+
+
+
+
+
+
+ + Multimedia 6 + +
+
+
+ + ☆ Parallel and Limited Data Voice Conversion Using Stochastic Variational + Deep Kernel Learning + + +
+ Typically, voice conversion is regarded as an engineering problem with +limited training data. The reliance on massive amounts of data hinders the +practical applicability of deep learning approaches, which have been +extensively researched in recent years. On the other hand, statistical methods +are effective with limited data but have difficulties in modelling complex +mapping functions. This paper proposes a voice conversion method that works +with limited data and is based on stochastic variational deep kernel learning +(SVDKL). At the same time, SVDKL enables the use of deep neural networks' +expressive capability as well as the high flexibility of the Gaussian process +as a Bayesian and non-parametric method. When the conventional kernel is +combined with the deep neural network, it is possible to estimate non-smooth +and more complex functions. Furthermore, the model's sparse variational +Gaussian process solves the scalability problem and, unlike the exact Gaussian +process, allows for the learning of a global mapping function for the entire +acoustic space. One of the most important aspects of the proposed scheme is +that the model parameters are trained using marginal likelihood optimization, +which considers both data fitting and model complexity. Considering the +complexity of the model reduces the amount of training data by increasing the +resistance to overfitting. To evaluate the proposed scheme, we examined the +model's performance with approximately 80 seconds of training data. The results +indicated that our method obtained a higher mean opinion score, smaller +spectral distortion, and better preference tests than the compared methods. + +
+
+
+
+
+ + ☆ Towards Efficient SDRTV-to-HDRTV by Learning from Image Formation + + +
+ Modern displays are capable of rendering video content with high dynamic range (HDR) and wide color gamut (WCG). However, the majority of available resources are still in standard dynamic range (SDR). As a result, there is significant value in transforming existing SDR content into the HDRTV standard. In this paper, we define and analyze the SDRTV-to-HDRTV task by modeling the formation of SDRTV/HDRTV content. Our analysis and observations indicate that a naive end-to-end supervised training pipeline suffers from severe gamut transition errors. To address this issue, we propose a novel three-step solution pipeline called HDRTVNet++, which includes adaptive global color mapping, local enhancement, and highlight refinement. The adaptive global color mapping step uses global statistics as guidance to perform image-adaptive color mapping. A local enhancement network is then deployed to enhance local details. Finally, we combine the two sub-networks above as a generator and achieve highlight consistency through GAN-based joint training. Our method is primarily designed for ultra-high-definition TV content and is therefore effective and lightweight for processing 4K resolution images. We also construct a dataset using HDR videos in the HDR10 standard, named HDRTV1K, that contains 1235 training images and 117 testing images, all in 4K resolution. In addition, we select five metrics to evaluate the results of SDRTV-to-HDRTV algorithms. Our final results demonstrate state-of-the-art performance both quantitatively and visually. The code, model and dataset are available at https://github.com/xiaom233/HDRTVNet-plus.
+
+ comment: Extended version of HDRTVNet +
+
+
+
+
+ + ☆ Style Generation: Image Synthesis based on Coarsely Matched Texts + + +
+ Previous text-to-image synthesis algorithms typically use explicit textual +instructions to generate/manipulate images accurately, but they have difficulty +adapting to guidance in the form of coarsely matched texts. In this work, we +attempt to stylize an input image using such coarsely matched text as guidance. +To tackle this new problem, we introduce a novel task called text-based style +generation and propose a two-stage generative adversarial network: the first +stage generates the overall image style with a sentence feature, and the second +stage refines the generated style with a synthetic feature, which is produced +by a multi-modality style synthesis module. We re-filter one existing dataset +and collect a new dataset for the task. Extensive experiments and ablation +studies are conducted to validate our framework. The practical potential of our +work is demonstrated by various applications such as text-image alignment and +story visualization. Our datasets are published at +https://www.kaggle.com/datasets/mengyaocui/style-generation. + +
+
+
+
+
+ + ☆ Poster: Making Edge-assisted LiDAR Perceptions Robust to Lossy Point + Cloud Compression + + +
+ Real-time light detection and ranging (LiDAR) perceptions, e.g., 3D object detection and simultaneous localization and mapping, are computationally intensive for mobile devices with limited resources and are often offloaded to the edge. Offloading LiDAR perceptions requires compressing the raw sensor data, and lossy compression is used to efficiently reduce the data volume. Lossy compression degrades the quality of LiDAR point clouds, and the perception performance consequently decreases. In this work, we present an interpolation algorithm that improves the quality of a LiDAR point cloud to mitigate the perception performance loss due to lossy compression. The algorithm targets the range image (RI) representation of a point cloud and interpolates points in the RI based on depth gradients. Compared to existing image interpolation algorithms, our algorithm shows a better qualitative result when the point cloud is reconstructed from the interpolated RI. Along with the preliminary results, we also describe the next steps of the current work.
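The paper's interpolation rule is its own; the toy sketch below only conveys the gradient-aware idea: average neighbouring depths where the vertical depth gradient is small (likely the same surface), and fall back to the nearer return across large depth jumps so no points are invented on object boundaries. The threshold and row-doubling pattern are assumptions.

```python
import numpy as np

def upsample_rows(range_image, grad_threshold=0.5):
    """Insert one interpolated row between every pair of rows of a range image."""
    h, w = range_image.shape
    out = np.zeros((2 * h - 1, w), dtype=range_image.dtype)
    out[0::2] = range_image
    upper, lower = range_image[:-1], range_image[1:]
    grad = np.abs(upper - lower)                          # vertical depth gradient
    smooth = grad < grad_threshold
    out[1::2] = np.where(smooth, 0.5 * (upper + lower),   # same surface: interpolate
                         np.minimum(upper, lower))        # depth edge: keep the nearer return
    return out

ri = np.array([[5.0, 5.1, 20.0],
               [5.2, 5.3, 20.2]])
print(upsample_rows(ri))
```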
+
+ comment: extended abstract of 2 pages, 2 figures, 1 table +
+
+
+
+
+ + ☆ Poster: Enabling Flexible Edge-assisted XR + + +
+ Extended reality (XR) is touted as the next frontier of the digital future. +XR includes all immersive technologies of augmented reality (AR), virtual +reality (VR), and mixed reality (MR). XR applications obtain the real-world +context of the user from an underlying system, and provide rich, immersive, and +interactive virtual experiences based on the user's context in real-time. XR +systems process streams of data from device sensors, and provide +functionalities including perceptions and graphics required by the +applications. These processing steps are computationally intensive, and the +challenge is that they must be performed within the strict latency requirements +of XR. This poses limitations on the possible XR experiences that can be +supported on mobile devices with limited computing resources. + In this XR context, edge computing is an effective approach to address this +problem for mobile users. The edge is located closer to the end users and +enables processing and storing data near them. In addition, the development of +high bandwidth and low latency network technologies such as 5G facilitates the +application of edge computing for latency-critical use cases [4, 11]. This work +presents an XR system for enabling flexible edge-assisted XR. + +
+
+ comment: extended abstract of 2 pages, 1 figure, 2 tables +
+
+
+
+
+ + ♻ ☆ VoxBlink: A Large Scale Speaker Verification Dataset on Camera ICASSP2024 + + +
+ In this paper, we introduce a large-scale and high-quality audio-visual speaker verification dataset, named VoxBlink. We propose an innovative and robust automatic audio-visual data mining pipeline to curate this dataset, which contains 1.45M utterances from 38K speakers. Due to the inherent nature of automated data collection, introducing noisy data is inevitable. Therefore, we also utilize a multi-modal purification to generate a cleaner version of the VoxBlink, named VoxBlink-clean, comprising 18K identities and 1.02M utterances. In contrast to the VoxCeleb, the VoxBlink is sourced from short videos of ordinary users, and the covered scenarios can better align with real-life situations. To the best of our knowledge, the VoxBlink dataset is one of the largest publicly available speaker verification datasets. Leveraging the VoxCeleb and VoxBlink-clean datasets together, we employ diverse speaker verification models with multiple architectural backbones to conduct comprehensive experimentation on the VoxCeleb test sets. Experimental results indicate a substantial enhancement in performance, ranging from 12% to 30% relative, across various backbone architectures upon incorporating the VoxBlink-clean into the training process. The details of the dataset can be found at http://voxblink.github.io.
+
+ comment: submit to ICASSP2024 +
+
+
+
+
+
+
+ + + +
+
+ +
+
+ + diff --git a/index.js b/index.js new file mode 100644 index 00000000..69f5da7b --- /dev/null +++ b/index.js @@ -0,0 +1,39 @@ +/* Exapand/Collapse with TAB key */ +var expanded = false; +document.onkeydown = function (e) { + if (e.keyCode === 9) { + expanded = !expanded; + document.querySelectorAll("details").forEach(detail => detail.open = expanded); + return false; + } +}; + +/* Switch Theme */ +const toggleSwitch = document.querySelector('.theme-switch input[type="checkbox"]'); + +function switchTheme(e) { + if (e.target.checked) { + document.documentElement.setAttribute('data-theme', 'light'); + document.getElementById("theme-icon").className = "ri-sun-line"; + localStorage.setItem('theme', 'light'); //add this + } else { + document.documentElement.setAttribute('data-theme', 'dark'); + document.getElementById("theme-icon").className = "ri-moon-line"; + localStorage.setItem('theme', 'dark'); //add this + } +} + +toggleSwitch.addEventListener('change', switchTheme, false); +const currentTheme = localStorage.getItem('theme') ? localStorage.getItem('theme') : null; +if (currentTheme) { + document.documentElement.setAttribute('data-theme', currentTheme); + if (currentTheme === 'light') { + toggleSwitch.checked = true; + } +} + +const timestamp = document.getElementById("build-timestamp"); +const timestamp_local = new Date(timestamp.getAttribute("datetime")).toLocaleString(); + +const badge = document.getElementById("build-timestamp-badge"); +// badge.src = `https://img.shields.io/github/workflow/status/mlnlp-world/myarxiv/Update?=${timestamp_local}&style=for-the-badge`